# nand2tetris/compiler/__init__.py

from enum import Enum
import re
import sys
from html import escape

class Token(Enum):
  """Lexical categories that a Jack token can be classified into."""

  # Reserved word of the language.
  KEYWORD = 1
  # Single-character punctuation or operator.
  SYMBOL = 2
  # Programmer-chosen name.
  IDENTIFIER = 3
  # Decimal integer literal.
  INTEGERCONSTANT = 4
  # Double-quoted string literal.
  STRINGCONSTANT = 5
  # Presumably reserved for tokens that fit no other category.
  UNKNOWN = 6

class Keyword(Enum):
  """The reserved words of the Jack language, numbered in declaration order."""

  # Class and subroutine declarations.
  CLASS = 1
  METHOD = 2
  FUNCTION = 3
  CONSTRUCTOR = 4

  # Type names.
  INT = 5
  BOOLEAN = 6
  CHAR = 7
  VOID = 8

  # Variable declarations.
  VAR = 9
  STATIC = 10
  FIELD = 11

  # Statements.
  LET = 12
  DO = 13
  IF = 14
  ELSE = 15
  WHILE = 16
  RETURN = 17

  # Keyword constants.
  TRUE = 18
  FALSE = 19
  NULL = 20
  THIS = 21

class JackAnalyzer:
  """Placeholder for the analyzer; it currently has no state or behavior."""

  def __init__(self):
    """Create an analyzer. Nothing is initialized yet."""

class JackTokenizer:
  """Breaks a Jack source file into tokens and classifies them one at a time.

  The whole file is tokenized when the object is created. `ptr` indexes the
  current token inside `tokens`; `advance()` moves it forward and the accessor
  methods (`tokenType()`, `symbol()`, ...) inspect the token it points at.
  """

  # Reserved words of the Jack language.
  _KEYWORDS = frozenset([
    'class', 'constructor', 'function', 'method', 'field', 'static', 'var',
    'int', 'char', 'boolean', 'void', 'true', 'false', 'null', 'this',
    'let', 'do', 'if', 'else', 'while', 'return',
  ])

  # Single-character symbols of the Jack language.
  _SYMBOLS = frozenset('()[],+-;<>=~&{}*/|.')

  # A token that consists of digits only.
  _INTEGER_RE = re.compile(r'\d+')

  # One match per token, tried in this order: string constant, integer,
  # identifier/keyword (matched as a whole word, so `doSomething` is not split
  # into `do` + `Something`), then any other single non-space character
  # (symbols, plus stray characters that are kept rather than dropped).
  _TOKEN_RE = re.compile(r'"[^"]*"|\d+|[A-Za-z_]\w*|\S')

  def __init__(self, filename, print_xml=False):
    """Tokenize the Jack source file `filename`.

    Args:
      filename: path of the Jack source file to read.
      print_xml: when true, also write the token list as XML to the path
        given by `xml_file(filename)`.
    """
    self.ptr = 0
    self.insideMultiLineComment = False
    self.tokens = []
    with open(filename, 'r') as source:
      # Attribute kept for backward compatibility; the handle is closed as
      # soon as tokenizing is done.
      self.file = source
      for line in source:
        self.tokens += self.parse_line(line)

    if print_xml:
      self.print_xml(self.xml_file(filename))

  def tokenType(self):
    """Return the `Token` category of the current token.

    Anything that is not a keyword, symbol, integer or quoted string is
    reported as an identifier.
    """
    token = self.current_token()
    if token in self._KEYWORDS:
      return Token.KEYWORD
    if token in self._SYMBOLS:
      return Token.SYMBOL
    if self._INTEGER_RE.fullmatch(token):
      return Token.INTEGERCONSTANT
    if len(token) >= 2 and token.startswith('"') and token.endswith('"'):
      return Token.STRINGCONSTANT
    return Token.IDENTIFIER

  def printable_token(self):
    """Return the current token as text that is safe to embed in XML.

    String constants lose their surrounding double quotes. `<`, `>` and `&`
    are always escaped so the generated file stays well-formed.
    """
    token = self.current_token()
    if self.tokenType() == Token.STRINGCONSTANT:
      # Quote characters need no escaping inside XML element text.
      return escape(token[1:-1], False)
    return escape(token, True)

  def symbol(self):
    """Return the character which is the current token.

    Raises:
      RuntimeError: if the current token is not a symbol.
    """
    if self.tokenType() != Token.SYMBOL:
      raise RuntimeError("Should only be called when tokenType is SYMBOL")
    return self.current_token()

  def identifier(self):
    """Return the identifier which is the current token.

    Raises:
      RuntimeError: if the current token is not an identifier.
    """
    if self.tokenType() != Token.IDENTIFIER:
      raise RuntimeError("Should only be called when tokenType is IDENTIFIER")
    return self.current_token()

  def intVal(self):
    """Return the integer value of the current token.

    Raises:
      RuntimeError: if the current token is not an integer constant.
    """
    if self.tokenType() != Token.INTEGERCONSTANT:
      raise RuntimeError("Should only be called when tokenType is INTEGERCONSTANT")
    return int(self.current_token())

  def _strip_comments(self, line):
    """Return `line` with `//` and `/* ... */` comments removed.

    Comment markers inside string constants are left alone. A block comment
    that stays open at the end of the line is remembered in
    `insideMultiLineComment` and continued on the next call.
    """
    kept = []
    pos = 0
    end = len(line)
    while pos < end:
      if self.insideMultiLineComment:
        close = line.find("*/", pos)
        if close == -1:
          # The comment does not end on this line.
          break
        self.insideMultiLineComment = False
        pos = close + 2
        # A comment separates tokens the same way whitespace does.
        kept.append(" ")
        continue

      if line[pos] == '"':
        close = line.find('"', pos + 1)
        if close != -1:
          # Copy the whole string constant verbatim.
          kept.append(line[pos:close + 1])
          pos = close + 1
          continue
        # Unterminated quote: fall through and treat it as a plain character.

      if line.startswith("//", pos):
        # The rest of the line is a comment.
        break
      if line.startswith("/*", pos):
        self.insideMultiLineComment = True
        pos += 2
        continue

      kept.append(line[pos])
      pos += 1
    return "".join(kept)

  def parse_line(self, line):
    """Return the list of tokens found on `line`.

    Comments are skipped; an empty list is returned for blank or
    comment-only lines. String constants keep their double quotes.
    """
    return self._TOKEN_RE.findall(self._strip_comments(line))

  def has_more_tokens(self):
    """Return True while `ptr` still points at a token."""
    return self.ptr < len(self.tokens)

  def current_token(self):
    """Return the token `ptr` points at (IndexError once past the end)."""
    return self.tokens[self.ptr]

  def advance(self):
    """Move on to the next token."""
    self.ptr += 1

  def xml_file(self, jack_file):
    """Return the path of the token XML file for `jack_file`."""
    return jack_file + "T.xml"

  def print_xml(self, xml_filename):
    """Write the tokens from the current position onwards to `xml_filename`.

    One `<type> value </type>` line is written per token inside a `<tokens>`
    element. The position is restored afterwards so the tokenizer can still
    be consumed after the dump.
    """
    start = self.ptr
    try:
      with open(xml_filename, 'w') as f:
        f.write("<tokens>\n")
        while self.has_more_tokens():
          f.write("<{type}> {value} </{type}>\n".format(type=self.tokenType().name.lower(), value=self.printable_token()))
          self.advance()
        f.write("</tokens>\n")
    finally:
      self.ptr = start

class CompilationEngine:
  """Placeholder for the compilation engine; it has no state or behavior yet."""

  def __init__(self):
    """Create an engine. Nothing is initialized yet."""

if __name__ == '__main__':
  # Script entry point: tokenize the Jack file named on the command line and
  # write its token XML file.
  if len(sys.argv) < 2:
    # Exit with a usage message instead of an IndexError traceback.
    sys.exit("usage: {} <file.jack>".format(sys.argv[0]))
  jt = JackTokenizer(sys.argv[1], True)