nand2tetris/compiler/tokenizer.py

import re
from keywords import Keyword
from html import escape
from enum import Enum
# Broad token categories: each one stands for a whole family of Keyword values
class Token(Enum):
  """Broad token categories that JackTokenizer.assert_type accepts.

  Each member stands for a whole family of token types rather than a single
  one: KEYWORD for any reserved word, SYMBOL for any symbol character.
  """
  KEYWORD = 1
  SYMBOL = 2

class JackTokenizer:
  """Splits a Jack source file into tokens and classifies them.

  The constructor reads and tokenizes the whole file up front. Afterwards the
  token list is walked with has_more_tokens(), current_token() and advance();
  tokenType() and the typed accessors describe the token under the cursor.
  """

  # Every single-character Jack symbol and the Keyword member it maps to.
  SYMBOL_MAP = {
    '{': Keyword.BRACE_OPEN,
    '}': Keyword.BRACE_CLOSE,
    '(': Keyword.PARAN_OPEN,
    ')': Keyword.PARAN_CLOSE,
    '[': Keyword.SQUARE_OPEN,
    ']': Keyword.SQUARE_CLOSE,
    '.': Keyword.DOT,
    ';': Keyword.SEMICOLON,
    '+': Keyword.PLUS,
    '-': Keyword.MINUS,
    '*': Keyword.MUL,
    '/': Keyword.DIV,
    '&': Keyword.AND,
    '|': Keyword.OR,
    '<': Keyword.LT,
    '>': Keyword.GT,
    '=': Keyword.EQ,
    '~': Keyword.NOT,
    ',': Keyword.COMMA,
  }

  # Every reserved word and the Keyword member it maps to.
  KEYWORD_MAP = {
    "class": Keyword.CLASS,
    "method": Keyword.METHOD,
    "function": Keyword.FUNCTION,
    "constructor": Keyword.CONSTRUCTOR,
    "int": Keyword.INT,
    "boolean": Keyword.BOOLEAN,
    "char": Keyword.CHAR,
    "void": Keyword.VOID,
    "var": Keyword.VAR,
    "static": Keyword.STATIC,
    "field": Keyword.FIELD,
    "let": Keyword.LET,
    "do": Keyword.DO,
    "if": Keyword.IF,
    "else": Keyword.ELSE,
    "while": Keyword.WHILE,
    "return": Keyword.RETURN,
    "true": Keyword.TRUE,
    "false": Keyword.FALSE,
    "null": Keyword.NULL,
    "this": Keyword.THIS,
  }

  # One lexeme per match; alternatives are tried left to right:
  #   skip  - whitespace, or a `//` comment running to the end of the line
  #   block - the `/*` that opens a block comment
  #   token - string constant, integer constant, identifier/keyword, or any
  #           other single character (the symbols, plus stray characters,
  #           which are passed through rather than rejected)
  # Words are matched whole and only classified later by tokenType(), so an
  # identifier such as `doSomething` is not split at its keyword prefix.
  _LEXEME_RE = re.compile(
    r'(?P<skip>\s+|//[^\n]*)'
    r'|(?P<block>/\*)'
    r'|(?P<token>"[^"\n]*"|\d+|[A-Za-z_]\w*|.)'
  )
  _INTEGER_RE = re.compile(r'\d+')

  def __init__(self, filename, print_xml=False):
    """Tokenize `filename`; optionally dump the tokens as XML.

    filename:  path of the Jack source file to read.
    print_xml: when true, also write the token XML to xml_file(filename).
    """
    self.ptr = 0
    self.insideMultiLineComment = False
    self.tokens = []
    # The handle is still exposed as self.file, but it is closed as soon as
    # the file has been read instead of being left open for the object's life.
    with open(filename, 'r') as source:
      self.file = source
      for line in source:
        self.tokens += self.parse_line(line)

    if print_xml:
      self.print_xml(self.xml_file(filename))

  def tokenType(self):
    """Returns the type of the current token, as a Keyword member.

    Reserved words and symbols map to their specific member through
    KEYWORD_MAP / SYMBOL_MAP; a run of digits is INTEGERCONSTANT; a
    double-quoted token is STRINGCONSTANT; anything else is IDENTIFIER.
    """
    t = self.current_token()
    if t in JackTokenizer.KEYWORD_MAP:
      return JackTokenizer.KEYWORD_MAP[t]
    if t in JackTokenizer.SYMBOL_MAP:
      return JackTokenizer.SYMBOL_MAP[t]
    if JackTokenizer._INTEGER_RE.fullmatch(t):
      return Keyword.INTEGERCONSTANT
    if len(t) >= 2 and t.startswith('"') and t.endswith('"'):
      return Keyword.STRINGCONSTANT
    # TODO: Put an assert to ensure valid identifier
    return Keyword.IDENTIFIER

  def printable_token(self):
    """Returns the current token as text that is safe to embed in XML.

    String constants lose their surrounding quotes. Markup characters are
    escaped in every case so the generated XML stays well-formed.
    """
    if self.tokenType() == Keyword.STRINGCONSTANT:
      # quote=False: only &, < and > need escaping inside element text.
      return escape(self.current_token()[1:-1], False)
    return escape(self.current_token(), True)

  def assert_type(self, t):
    """Raises AssertionError unless the current token has type `t`.

    `t` is either Token.SYMBOL / Token.KEYWORD (any symbol / any reserved
    word) or one specific Keyword member.
    """
    actual = self.tokenType()
    if t == Token.SYMBOL:
      matches = actual in JackTokenizer.SYMBOL_MAP.values()
    elif t == Token.KEYWORD:
      matches = actual in JackTokenizer.KEYWORD_MAP.values()
    else:
      matches = actual == t
    if not matches:
      # Raised explicitly (not via `assert`) so the check survives `python -O`.
      raise AssertionError(
        "expected {} but current token {!r} is {}".format(
          t, self.current_token(), actual))

  def symbol(self):
    """Returns the character which is the current token."""
    self.assert_type(Token.SYMBOL)
    return self.current_token()

  def identifier(self):
    """Returns the identifier which is the current token."""
    self.assert_type(Keyword.IDENTIFIER)
    return self.current_token()

  def intVal(self):
    """Returns the integer value of the current token."""
    self.assert_type(Keyword.INTEGERCONSTANT)
    return int(self.current_token())

  def parse_line(self, line):
    """Returns the list of tokens found on one source line.

    Comments are dropped. A block comment left open at the end of the line is
    remembered in self.insideMultiLineComment, so the following calls keep
    skipping text until the closing `*/` is seen. String constants are kept
    whole, quotes included, so `//` or `/*` inside a string is not a comment.
    """
    tokens = []
    pos = 0
    end = len(line)
    while pos < end:
      if self.insideMultiLineComment:
        close = line.find("*/", pos)
        if close == -1:
          # The comment doesn't end on this line
          break
        self.insideMultiLineComment = False
        # Resume scanning right after the terminator: what follows is code.
        pos = close + 2
        continue

      # Always matches: whitespace (newline included) is covered by `skip`,
      # every other character by the catch-all at the end of `token`.
      lexeme = JackTokenizer._LEXEME_RE.match(line, pos)
      pos = lexeme.end()
      if lexeme.lastgroup == 'block':
        self.insideMultiLineComment = True
      elif lexeme.lastgroup == 'token':
        tokens.append(lexeme.group())
    return tokens

  def has_more_tokens(self):
    """Returns True while the cursor still points at a token."""
    return self.ptr < len(self.tokens)

  def current_token(self):
    """Returns the text of the token under the cursor."""
    return self.tokens[self.ptr]

  def advance(self):
    """Moves the cursor to the next token."""
    self.ptr += 1

  def xml_file(self, jack_file):
    """Returns the path of the token XML file for `jack_file`."""
    return jack_file + "T.xml"

  def xml_row(self):
    """Returns a single row of XML for the Compilation Engine."""
    t = self.tokenType()
    if t in JackTokenizer.SYMBOL_MAP.values():
      t = 'symbol'
    elif t in JackTokenizer.KEYWORD_MAP.values():
      t = 'keyword'
    else:
      # NOTE(review): this lower-cases the Keyword member name; the
      # nand2tetris reference output is believed to spell these tags
      # integerConstant / stringConstant - confirm which spelling is wanted.
      t = t.name.lower()
    return "<{type}> {value} </{type}>\n".format(type=t, value=self.printable_token())

  def print_xml(self, xml_filename):
    """Writes the tokens from the cursor onwards to `xml_filename` as XML.

    The cursor is put back where it started afterwards, so dumping the XML
    does not consume the token stream for later readers.
    """
    start = self.ptr
    try:
      with open(xml_filename, 'w') as f:
        f.write("<tokens>\n")
        while self.has_more_tokens():
          f.write(self.xml_row())
          self.advance()
        f.write("</tokens>\n")
    finally:
      self.ptr = start
Break up the file a bit, haven't committed compilation stuff yet 2020-06-18 11:56:32 +00:00			`import re`
			`from keywords import Keyword`
			`from html import escape`
			`from enum import Enum`
			`# Superclass in some sense`
			`class Token(Enum):`
			`KEYWORD = 1`
			`SYMBOL = 2`

			`class JackTokenizer:`
			`SYMBOL_MAP = {`
			`'{': Keyword.BRACE_OPEN ,`
			`'}': Keyword.BRACE_CLOSE ,`
			`'(': Keyword.PARAN_OPEN ,`
			`')': Keyword.PARAN_CLOSE ,`
			`'[': Keyword.SQUARE_OPEN ,`
			`']': Keyword.SQUARE_CLOSE ,`
			`'.': Keyword.DOT ,`
			`';': Keyword.SEMICOLON ,`
			`'+': Keyword.PLUS ,`
			`'-': Keyword.MINUS ,`
			`'*': Keyword.MUL ,`
			`'/': Keyword.DIV ,`
			`'&': Keyword.AND ,`
			`'\|': Keyword.OR ,`
			`'<': Keyword.LT ,`
			`'>': Keyword.GT ,`
			`'=': Keyword.EQ ,`
			`'~': Keyword.NOT ,`
			`',': Keyword.COMMA,`
			`}`

			`KEYWORD_MAP = {`
			`"class": Keyword.CLASS,`
			`"method": Keyword.METHOD,`
			`"function": Keyword.FUNCTION,`
			`"constructor": Keyword.CONSTRUCTOR,`
			`"int": Keyword.INT,`
			`"boolean": Keyword.BOOLEAN,`
			`"char": Keyword.CHAR,`
			`"void": Keyword.VOID,`
			`"var": Keyword.VAR,`
			`"static": Keyword.STATIC,`
			`"field": Keyword.FIELD,`
			`"let": Keyword.LET,`
			`"do": Keyword.DO,`
			`"if": Keyword.IF,`
			`"else": Keyword.ELSE,`
			`"while": Keyword.WHILE,`
			`"return": Keyword.RETURN,`
			`"true": Keyword.TRUE,`
			`"false": Keyword.FALSE,`
			`"null": Keyword.NULL,`
			`"this" : Keyword.THIS`
			`}`
			`""" Returns the type of the current token """`
			`def tokenType(self):`
			`t = self.current_token()`
			`if t in ['class','constructor','function','method','field','static','var','int','char','boolean','void','true','false','null','this','let','do','if','else','while','return']:`
			`return JackTokenizer.KEYWORD_MAP[t]`
			`elif re.compile("(\(\|\)\|\[\|\]\|,\|\+\|-\|;\|<\|>\|=\|~\|&\|{\|}\|\*\|\/\|\\|\|\.)").match(t):`
			`return JackTokenizer.SYMBOL_MAP[t]`
			`elif re.compile("\d+").match(t):`
			`return Keyword.INTEGERCONSTANT`
			`elif re.compile("\".*\"").match(t):`
			`return Keyword.STRINGCONSTANT`
			`else:`
			`# TODO: Put an assert to ensure valid identifier`
			`return Keyword.IDENTIFIER`
			`pass`

			`def printable_token(self):`
			`if self.tokenType() == Keyword.STRINGCONSTANT:`
			`return self.current_token()[1:-1]`
			`else:`
			`return escape(self.current_token(), True)`

			`def assert_type(self, t):`
			`if(t == Token.SYMBOL):`
			`assert(self.tokenType() in SYMBOL_MAP.values())`
			`elif(t == Token.KEYWORD):`
			`assert(self.tokenType() in KEYWORD_MAP.values())`
			`else:`
			`assert(self.tokenType() == t)`

			`""" Returns the character which is the current token """`
			`def symbol(self):`
			`self.assert_type(Token.SYMBOL)`
			`return self.current_token()`

			`""" Returns the identifier which is the current token """`
			`def identifier(self):`
			`self.assert_type(Token.IDENTIFIER)`
			`return self.current_token()`

			`""" Returns the integer value of the current token """`
			`def intVal(self):`
			`self.assert_type(Keyword.INTEGERCONSTANT)`
			`return int(self.token)`

			`""" Returns a list of tokens for that line """`
			`def parse_line(self, line):`
			`line = line.strip()`
			`# If this line as a single line comment anywhere`
			`# strip the line to start of //`
			`if line.find("//") != -1:`
			`line = line[:line.find("//")].strip()`

			`if self.insideMultiLineComment:`
			`if line.find("*/") == -1:`
			`# The comment doesn't end in this line`
			`return []`
			`else:`
			`self.insideMultiLineComment = False`
			`# comments ends here, huzzah!`
			`line = line[:line.find("*/")].strip()`

			`# Same for the multi-line comment, but this time`
			`# Also set insideMultiLineComment = true`
			`elif line.find("/*") != -1:`
			`# The comment ends on the same line`
			`if line.find("*/") != -1:`
			`# TODO: this also breaks on /* inside strings :(`
			`# TODO: This also breaks on multiple multi-line comments on the same line`
			`line = line[:line.find("/")] + line[line.find("/") + 2:].strip()`
			`else:`
			`line = line[:line.find("/*")].strip()`
			`self.insideMultiLineComment = True`

			`# We don't need no empty lines`
			`if len(line) == 0:`
			`return []`
			`else:`
			`# Regex contains 3 parts:`
			`# 1. Keywords`
			`# 2. Symbols`
			`# 3. Identifiers`
			`# 4. Strings`
			`regex = re.compile("(class\|constructor\|function\|method\|field\|static\|var\|int\|char\|boolean\|void\|true\|false\|null\|this\|let\|do\|if\|else\|while\|return\|\(\|\)\|\[\|\]\|,\|\+\|-\|;\|<\|>\|=\|~\|&\|{\|}\|\\|\/\|\\|\|\.\|[a-zA-Z_]+\w\|\".*\")")`
			`return [e.strip() for e in regex.split(line) if e != None and e.strip()!='']`

			`def has_more_tokens(self):`
			`return self.ptr < len(self.tokens)`

			`def current_token(self):`
			`return self.tokens[self.ptr]`

			`def advance(self):`
			`self.ptr += 1`

			`def __init__(self, filename, print_xml=False):`
			`self.ptr = 0`
			`self.insideMultiLineComment = False`
			`self.file = open(filename, 'r')`
			`self.tokens = []`
			`for line in self.file:`
			`self.tokens += self.parse_line(line)`

			`if(print_xml):`
			`self.print_xml(self.xml_file(filename))`

			`def xml_file(self, jack_file):`
			`return jack_file + "T.xml"`

			`""" Returns a single row of XML for the Compilation Engine """`
			`def xml_row(self):`
			`t = self.tokenType()`
			`if t in JackTokenizer.SYMBOL_MAP.values():`
			`t = 'symbol'`
			`elif t in JackTokenizer.KEYWORD_MAP.values():`
			`t = 'keyword'`
			`else:`
			`t = t.name.lower()`
			`return "<{type}> {value} </{type}>\n".format(type=t, value=self.printable_token())`

			`def print_xml(self, xml_filename):`
			`with open(xml_filename, 'w') as f:`
			`f.write("<tokens>\n")`
			`while self.has_more_tokens():`
			`f.write(self.xml_row())`
			`self.advance()`
			`f.write("</tokens>\n")`