import sys

# Symbol (token type) names yielded by the scanner
NULL = "(null)"
OPERATOR = "(operator)"
NUMBER = "(literal)"            # Numbers and strings share the "(literal)" symbol
STRING = "(literal)"
IDENT = "(name)"
COMMENT = "(comment)"           # Comments should be ignored
DOC = "(documentation)"         # Documentation should be ignored
EOF = "(end)"
INDENT = "(indent)"             # Indicates a new indentation level
DEDENT = "(dedent)"             # Indicates a dedent
# Alternative integer encoding of the symbols above (disabled):
##NULL = 0
##OPERATOR = 1
##NUMBER = 2
##STRING = 3
##IDENT = 4
##COMMENT = 5           # Comments should be ignored
##DOC = 6
##EOF = 7
##INDENT = 8
##DEDENT = 9
# Returned by Comment() while a multi-line comment has not ended on the current line
COMMENT_CONT = 10


def Mark(m, position):
  """Report a scanner diagnostic on standard output.

  m        -- the message text
  position -- a (line, column) pair locating the problem

  The pair is unpacked inside the body rather than in the signature: tuple
  parameters are a syntax error on Python 3, and callers pass the pair
  positionally either way.
  """
  (l, p) = position
  print("%s at %d:%d"%(m, l, p))

def Ident(line, pos, start) :
  """Scan an identifier or a word operator starting at line[pos].

  line  -- the text of the current source line
  pos   -- index of the first character of the word
  start -- (line, column) of the token; accepted for symmetry with the other
           scan helpers but not used here

  Returns (OPERATOR, token, pos) when the word is exactly 'AND', 'OR' or
  'NOT' (the comparison is case-sensitive), optionally followed by '=' which
  is then included in the token; otherwise returns (IDENT, token, pos).
  In both cases pos is the index of the first character after the token.
  """
  # Operators in spin are weird and we need special handling for these
  operator = ('AND', 'OR', 'NOT')

  token = u''
  end = len(line)
  while pos < end and (line[pos].isalnum() or line[pos] == u'_') :
    token += line[pos]
    pos += 1

  # check if it is an operator
  if token in operator:
    # The word may be the very last thing on the line (no trailing newline),
    # so the look-ahead for the assignment form must be bounds-checked.
    if pos < end and line[pos] == u'=':
      token += u'='
      pos += 1
    return (OPERATOR, token, pos)

  return (IDENT, token, pos)

def Number(line, pos, start, base) :
  """Scan a numeric literal in the given base starting at line[pos].

  line  -- the text of the current source line
  pos   -- index of the first digit (any '%', '%%' or '$' prefix has already
           been consumed by the caller)
  start -- (line, column) of the token, used when reporting problems
  base  -- number base; selects the first `base` characters of
           '0123456789ABCDEF' as valid digits (scan uses 2, 4, 10 and 16)

  '_' is accepted as a digit separator and is not copied into the token.
  For base 10 one decimal point and one exponent ('E' or 'e', optionally
  followed by '-', then a digit) are accepted. A '..' sequence ends the
  number without being consumed. Malformed input is reported through Mark()
  and ends the scan at the offending character.

  Returns (NUMBER, token, pos): token is the literal's text (no numeric
  conversion or range check is done here) and pos is the index of the first
  character that was not consumed.
  """
  digits = u'0123456789ABCDEF'[:base]
  token = u''
  end = len(line)
  while pos < end :
    ch = line[pos]
    nxt = line[pos+1:pos+2]     # look-ahead; u'' when ch is the last character
    if ch.upper() in digits:    # Correct digit for selected base?
      token += ch
    elif ch == u'.' :           # check for range '..'
      if nxt == u'.' : break
      if base != 10 :           # float only allowed for decimal numbers
        Mark("invalid number", start)
        break
      if ch in token :          # only one decimal point allowed
        Mark("Invalid number", start)
        break
      token += ch
    elif base == 10 and ch.upper() == u'E' :   # check for exponent
      negative = (nxt == u'-')
      # the exponent must start with a digit, optionally preceded by '-'
      if not (nxt.isdigit() or (negative and line[pos+2:pos+3].isdigit())) :
        Mark("Invalid number", start)
        break
      if u'e' in token.lower() :               # only one exponent allowed
        Mark("Invalid number", start)
        break
      token += ch
      if negative :             # keep the sign with the literal, e.g. 2.75e-10
        token += nxt
        pos += 1
    elif ch != u'_' : break     # _ is used as separator, allow and skip it
    pos += 1

  return (NUMBER, token, pos)

def String(line, pos, start) :
  """Scan the body of a string literal; line[pos] is the first character
  after the opening double quote.

  line  -- the text of the current source line
  pos   -- index just past the opening quote
  start -- (line, column) of the token, used when reporting problems

  Characters are collected up to the closing '"' or the end of the line,
  whichever comes first. Reaching the end of the line is reported through
  Mark() as an unterminated string.

  Returns (STRING, text, pos) where text excludes the quotes and pos is one
  past the position where collecting stopped.
  """
  collected = []
  limit = len(line)
  while pos < limit :
    ch = line[pos]
    if ch == u'"' :
      break
    collected.append(ch)
    pos += 1

  if pos == limit :
    Mark("String not terminated", start)
  return (STRING, u''.join(collected), pos+1)

def Comment(line, pos, start, doc, multi) :
  """Scan comment text from line[pos] to the end of the comment or the line.

  line  -- the text of the current source line
  pos   -- index of the first character of comment text on this line
  start -- (line, column) of the token; not used here
  doc   -- True for a documentation comment ('' or {{ }})
  multi -- True for a brace-delimited comment that may span several lines

  A multi-line comment ends at '}', a multi-line documentation comment at
  '}}'. Single-line comments run to the end of the line.

  Returns (sym, token, pos):
    sym   -- COMMENT or DOC when the comment is complete, COMMENT_CONT when a
             multi-line comment did not end on this line
    token -- the comment text found on this line, without the closing brace(s)
    pos   -- index of the first character after what was consumed
  """
  token = u''
  end = len(line)
  if multi : sym = COMMENT_CONT # if comment does not end in this line, continue
  elif doc : sym = DOC          # Documentation comment
  else: sym = COMMENT           # single line only

  while pos < end :
    ch = line[pos]
    pos += 1
    if multi and ch == u'}' :  # multi-line comment ends with '}'
      if not doc :
        sym = COMMENT
        break
      # Multi-line document comment ends with '}}'. The slice yields u'' when
      # the '}' was the last character of the line, avoiding an IndexError.
      if line[pos:pos+1] == u'}' :
        sym = DOC
        pos += 1               # skip the second '}'
        break
    token += ch

  return (sym, token, pos)

def Operator(line, pos, start) :
  """Scan a symbolic operator starting at line[pos].

  line  -- the text of the current source line
  pos   -- index of the operator's first character
  start -- (line, column) of the token; not used here

  The first character is taken unconditionally; every directly following
  character that is one of '=*/-+<>#^|~@.' is appended to it.

  Returns (OPERATOR, text, pos) with pos the index just past the operator.
  """
  followers = u'=*/-+<>#^|~@.'
  text = line[pos]
  cursor = pos + 1
  limit = len(line)
  while cursor < limit :
    ch = line[cursor]
    if ch not in followers :
      break
    text += ch
    cursor += 1
  return (OPERATOR, text, cursor)

def scan(readline):
  """Generate the tokens of a Spin source, reading it one line at a time.

  readline -- callable without arguments returning the next source line.
              End of input is signalled by raising StopIteration (as a file
              iterator's next method does) or by returning an empty string
              (as file.readline does).

  Yields 4-tuples (sym, value, (startLine, startColumn), (endLine, endPos)):
    - INDENT / DEDENT with the indentation width as value when the
      indentation of a line's first token differs from the current level;
    - COMMENT / DOC for comments, STRING / NUMBER for literals, IDENT for
      names and OPERATOR for everything else.
  Line numbers start at 1. Problems are reported through Mark().
  """
  # characters that are operators or start an operator
  operators = u'&*+-=#~<>:/?!|@.'
  tabsize = 8
  # Initialize variables
  lineNo = 0
  indents = [0]
  multi = doc = False
  sl = sc = 0                     # start line / column of the token in progress

  # loop over lines in stream
  while True:
    try:
      line = readline()
    except StopIteration:
      break
    if not line:                  # readline()-style end of input
      break

    lineNo += 1
    start = pos = indent = 0
    length = len(line)

    while pos < length :
      # Handle multi line comments with a little state machine
      # COMMENT_CONT => multi-line comment, need more data
      # COMMENT      => comment is fetched
      # DOC          => documentation comment is fetched
      if multi:
        (sym, value, pos) = Comment(line, pos, (sl, sc), doc, multi)
        token += value
        if sym == COMMENT_CONT:
          continue
        multi = doc = False
      else:
        token = line[pos]
        pos += 1

        # Determine indent at beginning of line (start = 0)
        # TODO:
        #    - Treat multi-line comments as white-space
        #    - Indentation is only relevant for specific statements
        #      This is allowed
        #        REPEAT
        #         OUTA[0..1]~~
        #              OUTA[0..1]~
        #      And belongs to the same scope :(
        if not start :
          if token == u' ' :                        # Count space
            indent += 1
            continue
          elif token == u'\t' :
            # next tab position; floor division keeps indent an integer
            indent = (indent // tabsize + 1) * tabsize
            continue
          elif token == u'\f' :                     # reset indent
            indent = 0
            continue
          elif token.isspace() : continue           # skip empty lines

          if indent > indents[-1] :                 # new indent?
            indents.append(indent)                  # save it
            yield(INDENT, indent, (lineNo, 0), (lineNo, pos))   # and tell parser
          if indent not in indents :                # error if non existing indent
            Mark("unindent does not match any indentation level", (lineNo, pos))
          while indent < indents[-1] :              # go back to correct indent level
            indents.pop()
            yield(DEDENT, indent, (lineNo, 0), (lineNo,  pos))   # and tell parser

        if token.isspace() : continue   #skip whitespace

        start = pos
        sl = lineNo                     # start position is kept in sl/sc so it
        sc = start                      # survives a multi-line comment
        nxt = line[pos:pos+1]           # look-ahead; u'' at the end of the line

        if token == u"'" :              # single line comment
          if nxt == u"'" :              # single line documentation comment
            doc = True
            pos += 1
          (sym, token, pos) = Comment(line, pos, (sl, sc), doc, multi)
          doc = False                   # do not leak into the next comment
        elif token == u'{' :            # multi-line comment
          multi = True
          if nxt == u'{' :              # multi-line documentation comment
            doc = True
            pos += 1
          (sym, token, pos) = Comment(line, pos, (sl, sc), doc, multi)
          if sym == COMMENT_CONT : continue # get complete comment
          multi = doc = False           # comment ended on its opening line
        elif token == u'"' :            # String
          (sym, token, pos) = String(line, pos, (sl, sc))
        elif token == u'%' :            # Binary number
          if nxt == u'%' :              # Quaternary number
            pos += 1
            (sym, token, pos) = Number(line, pos, (sl, sc), 4)
          else :
            (sym, token, pos) = Number(line, pos, (sl, sc), 2)
        elif token == u'$' :            # Hexadecimal number
          (sym, token, pos) = Number(line, pos, (sl, sc), 16)
        elif token.isdigit() :          # integer or float
          (sym, token, pos) = Number(line, pos-1, (sl, sc), 10)
        elif token.isalpha() or token == u'_' :  # Identifier
          (sym, token, pos) = Ident(line, pos-1, (sl, sc))
        elif token in operators :       # special operators
          (sym, token, pos) = Operator(line, pos-1, (sl, sc))
        else : sym = OPERATOR           # everything else is considered an operator

      yield sym, token, (sl, sc), (lineNo, pos-1)
    # end line scan
  # end file scan

  if multi :                            # input ended inside { ... } or {{ ... }}
    Mark("Comment not terminated", (sl, sc))

def test(program=None):
  """Scan a piece of Spin source and print every token as
  'line:column symbol = value'.

  program -- source text to scan; it is echoed after '>>> ' first. When
             omitted (or empty) the UTF-16 encoded file 'test.spin' in the
             current directory is scanned instead.
  """
  if program :
    print(">>> %s" % program)
    try:
      from cStringIO import StringIO    # Python 2
    except ImportError:
      from io import StringIO           # Python 3
    f = StringIO(program)
  else :
    import codecs
    f = codecs.open('test.spin','r','utf_16')

  try:
    lines = iter(f)
    # next() raises StopIteration at the end of input, which scan() expects
    tokens = scan(lambda: next(lines))
    for sym, token, (l,s), _ in tokens:
      print("%d:%d %s = %s"%(l,s,sym, token))
  finally:
    f.close()                           # release the file / buffer in all cases


# Self-test: runs only when the file is executed as a script. The module name
# in that case is "__main__" (not "main"), so the original guard never fired.
if __name__=="__main__":
  # Strings
  test('"this is a string"\n')
  test('"this is not terminated\n')
  test('"this is a string""and this another"\n')
  test('"this is a string""and this 1 2"\n')
  # Numbers
  test("%1010101 %%1230 $AB70 1234567 3.41516 2.75e10")
  test("%1210101 %%1234 $AH70 1B34567 3.415.16 2.75e10e5")

  # Comments
  test("''test this\na=b\n'b=5")
  test("a=5{{b=a \nshoul{blah}\n dbe done}} b=a")
  # Operators
  test("AND= AND OR= OR NOT NOT= &= += = == |=")
  # load spin file (expects 'test.spin' in the current directory)
  test()