"""Line-oriented tokenizer for Spin source text.

``scan`` turns a stream of source lines into
``(symbol, value, start, end)`` tuples.  The helper functions (``Ident``,
``Number``, ``String``, ``Comment`` and ``Operator``) each recognise one
token class inside a single line and return ``(symbol, text, next_pos)``.

The code runs unchanged on Python 2.6+ and Python 3.
"""
import sys

# symbols
NULL = "(null)"
OPERATOR = "(operator)"
NUMBER = "(literal)"
STRING = "(literal)"
IDENT = "(name)"
COMMENT = "(comment)"        # Comments should be ignored
DOC = "(documentation)"      # Documentation should be ignored
EOF = "(end)"
INDENT = "(indent)"          # Indicates a new indentation level
DEDENT = "(dedent)"          # Indicates a dedent

# Internal marker returned by Comment(): the multi-line comment is still
# open at the end of the line and more input is needed.
COMMENT_CONT = 10


def Mark(m, position):
    """Print the diagnostic *m* together with its source position.

    *position* is a ``(line, column)`` pair.
    """
    line_no, column = position
    print("%s at %d:%d" % (m, line_no, column))


def Ident(line, pos, start):
    """Scan an identifier or word operator beginning at ``line[pos]``.

    Returns ``(OPERATOR, text, next_pos)`` for ``AND``, ``OR`` and ``NOT``
    (optionally followed by ``=`` for the assignment form) and
    ``(IDENT, text, next_pos)`` for everything else.  *start* is the token
    position used for diagnostics; it is currently unused here.
    """
    # Operators in spin are weird and we need special handling for these
    operator = ('AND', 'OR', 'NOT')
    end = len(line)
    first = pos
    while pos < end and (line[pos].isalnum() or line[pos] == u'_'):
        pos += 1
    token = line[first:pos]

    if token in operator:
        # The bounds check matters when the operator is the last thing on
        # a line that has no trailing newline.
        if pos < end and line[pos] == u'=':     # assignment form, e.g. AND=
            return (OPERATOR, token + u'=', pos + 1)
        return (OPERATOR, token, pos)
    return (IDENT, token, pos)


def Number(line, pos, start, base):
    """Scan a numeric literal in the given *base* starting at ``line[pos]``.

    ``_`` is accepted as a digit separator and dropped from the token.
    For base 10 a fraction and an exponent (with optional ``-`` sign) are
    accepted.  Malformed input is reported through ``Mark`` using *start*
    and scanning stops in front of the offending character.

    Returns ``(NUMBER, text, next_pos)``.
    """
    digits = u'0123456789ABCDEF'[:base]
    token = u''
    end = len(line)
    while pos < end:
        ch = line[pos]
        # Look-ahead character; empty at end of line so the checks below
        # never index past the end of the string.
        nxt = line[pos + 1] if pos + 1 < end else u''
        if ch.upper() in digits:        # Correct digit for selected base?
            token += ch
        elif ch == u'.':
            if nxt == u'.':             # range operator '..' ends the number
                break
            if base != 10:              # float only allowed for decimal numbers
                Mark("invalid number", start)
                break
            if ch in token:             # second decimal point
                Mark("Invalid number", start)
                break
            token += ch
        elif base == 10 and ch.upper() == u'E':
            # exponent must be followed by a digit or a minus sign
            if not (nxt == u'-' or nxt.isdigit()):
                Mark("Invalid number", start)
                break
            if u'e' in token.lower():   # second exponent
                Mark("Invalid number", start)
                break
            token += ch
            if nxt == u'-':
                # Consume the sign together with the exponent marker;
                # otherwise the loop would stop at '-' and split the number.
                token += nxt
                pos += 1
        elif ch != u'_':
            break                       # anything else ends the number
        # '_' is used as separator: allowed and skipped
        pos += 1
    # TODO: range checking (32-bit integer / float conversion) is not done
    # here; decide whether it belongs in the scanner or in the parser.
    return (NUMBER, token, pos)


def String(line, pos, start):
    """Scan a string body; ``line[pos]`` is the first character after ``"``.

    Returns ``(STRING, text, next_pos)`` where *next_pos* is just past the
    closing quote.  If the line ends before a closing quote is found the
    problem is reported through ``Mark`` and the rest of the line becomes
    the string text.
    """
    end = len(line)
    close = line.find(u'"', pos)
    if close < 0:
        Mark("String not terminated", start)
        return (STRING, line[pos:], end + 1)
    return (STRING, line[pos:close], close + 1)


def Comment(line, pos, start, doc, multi):
    """Scan comment text starting at ``line[pos]``.

    *multi* selects a brace comment (closed by ``}``, or by ``}}`` when
    *doc* is true); otherwise the comment runs to the end of the line.

    Returns ``(symbol, text, next_pos)``.  The symbol is ``COMMENT`` or
    ``DOC`` for a finished comment and ``COMMENT_CONT`` when a brace
    comment is still open at the end of the line.
    """
    token = u''
    end = len(line)
    if multi:
        sym = COMMENT_CONT      # if comment does not end in this line, continue
    elif doc:
        sym = DOC               # documentation comment
    else:
        sym = COMMENT           # single line only
    while pos < end:
        ch = line[pos]
        pos += 1
        if multi and ch == u'}':
            if not doc:         # multi-line comment ends with '}'
                sym = COMMENT
                break
            # multi-line documentation comment ends with '}}'; the bounds
            # check covers a '}' that is the very last character.
            if pos < end and line[pos] == u'}':
                sym = DOC
                pos += 1
                break
        token += ch
    return (sym, token, pos)


def Operator(line, pos, start):
    """Scan a symbolic operator whose first character is ``line[pos]``.

    The first character is taken unconditionally; following characters
    are added while they are valid operator suffix characters.

    Returns ``(OPERATOR, text, next_pos)``.
    """
    suffix = u'=*/-+<>#^|~@.'
    end = len(line)
    stop = pos + 1
    while stop < end and line[stop] in suffix:
        stop += 1
    return (OPERATOR, line[pos:stop], stop)


def scan(readline, tabsize=8):
    """Generate tokens from the lines produced by *readline*.

    *readline* is called without arguments and must return the next source
    line.  End of input is signalled either by raising ``StopIteration``
    or by returning an empty string (as ``file.readline`` does).
    *tabsize* is the distance between tab stops used to compute
    indentation.

    Yields ``(symbol, value, (line, column), (line, end))`` tuples.  For
    ``INDENT`` and ``DEDENT`` the value is the indentation width.  A brace
    comment that spans several lines is yielded once, when it is closed,
    with the position where it was opened.
    """
    # characters that are operators or start an operator
    operators = u'&*+-=#~<>:/?!|@.'
    # ASCII digits only: str.isdigit() also accepts characters that
    # Number() cannot consume, which would stop the scanner advancing.
    decimal = u'0123456789'

    lineNo = 0
    pos = 0
    indents = [0]
    multi = doc = False
    token = u''
    sl = cstart = 0             # where the pending multi-line comment began

    # loop over lines in stream
    while True:
        try:
            line = readline()
        except StopIteration:
            break
        if not line:            # file.readline() style end of input
            break
        lineNo += 1
        start = pos = indent = 0
        end = len(line)

        while pos < end:
            if multi:
                # Inside a brace comment opened on an earlier line: keep
                # collecting text until Comment() reports it as closed.
                sym, value, pos = Comment(line, pos, (sl, cstart), doc, multi)
                token += value
                if sym != COMMENT_CONT:
                    multi = doc = False
                    yield sym, token, (sl, cstart), (lineNo, pos - 1)
                continue

            token = line[pos]
            pos += 1

            # Determine indent at beginning of line (start == 0)
            # TODO:
            # - Treat multi-line comments as white-space
            # - Indentation is only relevant for specific statements
            if not start:
                if token == u' ':           # count space
                    indent += 1
                    continue
                if token == u'\t':          # advance to next tab position
                    indent = (indent // tabsize + 1) * tabsize
                    continue
                if token == u'\f':          # reset indent
                    indent = 0
                    continue
                if token.isspace():         # skip empty lines
                    continue
                if indent > indents[-1]:    # new indent?
                    indents.append(indent)
                    yield INDENT, indent, (lineNo, 0), (lineNo, pos)
                if indent not in indents:   # error if non existing indent
                    Mark("unindent does not match any indentation level",
                         (lineNo, pos))
                while indent < indents[-1]: # go back to correct indent level
                    indents.pop()
                    yield DEDENT, indent, (lineNo, 0), (lineNo, pos)

            if token.isspace():
                continue                    # skip whitespace

            start = pos
            sl = lineNo
            # Look-ahead character; empty at end of line.
            nxt = line[pos] if pos < end else u''

            if token == u"'":               # single line comment
                is_doc = nxt == u"'"        # '' starts a documentation comment
                if is_doc:
                    pos += 1
                sym, token, pos = Comment(line, pos, (sl, start),
                                          is_doc, False)
            elif token == u'{':             # multi-line comment
                doc = nxt == u'{'           # {{ starts a documentation comment
                if doc:
                    pos += 1
                sym, token, pos = Comment(line, pos, (sl, start), doc, True)
                if sym == COMMENT_CONT:
                    # Still open: remember where it started and fetch the
                    # rest from the following lines.
                    multi = True
                    cstart = start
                    continue
                doc = False                 # closed on the same line
            elif token == u'"':             # string
                sym, token, pos = String(line, pos, (sl, start))
            elif token == u'%':             # binary number
                if nxt == u'%':             # quaternary number
                    pos += 1
                    sym, token, pos = Number(line, pos, (sl, start), 4)
                else:
                    sym, token, pos = Number(line, pos, (sl, start), 2)
            elif token == u'$':             # hexadecimal number
                sym, token, pos = Number(line, pos, (sl, start), 16)
            elif token in decimal:          # integer or float
                sym, token, pos = Number(line, pos - 1, (sl, start), 10)
            elif token.isalpha() or token == u'_':     # identifier
                sym, token, pos = Ident(line, pos - 1, (sl, start))
            elif token in operators:        # special operators
                sym, token, pos = Operator(line, pos - 1, (sl, start))
            else:
                sym = OPERATOR  # everything else is considered an operator

            yield sym, token, (sl, start), (lineNo, pos - 1)
        # end line scan
    # end file scan

    if multi:
        # Input ended inside a brace comment: report it and hand over what
        # was collected instead of dropping it silently.
        Mark("Comment not terminated", (sl, cstart))
        yield (DOC if doc else COMMENT), token, (sl, cstart), (lineNo, pos - 1)


def test(program=None):
    """Scan *program* (or the file ``test.spin``) and print every token."""
    if program:
        print(">>> %s" % program)
        try:
            from cStringIO import StringIO      # Python 2
        except ImportError:
            from io import StringIO             # Python 3
        f = StringIO(program)
    else:
        import codecs
        try:
            f = codecs.open('test.spin', 'r', 'utf_16')
        except IOError as err:
            sys.stderr.write("cannot open test.spin: %s\n" % err)
            return
    try:
        for sym, token, (line_no, column), _ in scan(f.readline):
            print("%d:%d %s = %s" % (line_no, column, sym, token))
    finally:
        f.close()


# test() is a manual demo driver, not a unit test; keep test runners that
# collect callables by name from picking it up.
test.__test__ = False


def main():
    """Run the built-in scanner demonstrations."""
    # Strings
    test('"this is a string"\n')
    test('"this is not terminated\n')
    test('"this is a string""and this another"\n')
    test('"this is a string""and this 1 2"\n')
    # Numbers
    test("%1010101 %%1230 $AB70 1234567 3.41516 2.75e10")
    test("%1210101 %%1234 $AH70 1B34567 3.415.16 2.75e10e5")
    # Comments
    test("''test this\na=b\n'b=5")
    test("a=5{{b=a \nshoul{blah}\n dbe done}} b=a")
    # Operators
    test("AND= AND OR= OR NOT NOT= &= += = == |=")
    # load spin file
    test()


if __name__ == "__main__":
    main()