Break up the file a bit, haven't committed compilation stuff yet

master
Nemo 2 years ago
parent cece143368
commit c71dd21128
  1. 1
      .gitignore
  2. 6
      NOTES.md
  3. 5
      compiler/README.md
  4. 158
      compiler/__init__.py
  5. 48
      compiler/keywords.py
  6. 182
      compiler/tokenizer.py

1
.gitignore vendored

@ -1 +1,2 @@
tools/
**/__pycache__/

@ -105,3 +105,9 @@ I can definitely solve it, but I want to do it properly. I've also realized why
2. PHP's language documentation is aimed at users, while Python throws so much useless stuff at you. I have yet to find a language documentation that rivals PHP, to be fair - but Python gets so much wrong. Searching for "condition" on Python docs gets you: a page on something called Condition Objects, Conditional Expressions, and "More on conditions" - none of which actually details what the conditional statements are and how they work. Look at the [control structures](https://www.php.net/manual/en/language.control-structures.php) page on the PHP website instead. Python docs also like talking about language implementation details too much. For example, BNF notation is peppered throughout the docs. PHP on the other hand uses only one language in its docs - PHP.
3. Lack of examples in documentation. You're left to figure out so many things. PHP gets this correct, for every function in the standard library. If examples are missing, the comments will usually have them.
4. Static Typing
## Compilation Engine
I'm hard-coding stuff a lot, with a lot of asserts
It would be nice, once I have some structure in place, to actually generate the rules from the GRAMMAR.

@ -251,7 +251,7 @@ let do if else while return
#### `symbol`
`{}()[].m;+-*/&|<>=~`
`{}()[].;+-*/&|<>=~`
#### `integerConstant`
@ -296,6 +296,7 @@ int | char | boolean | className
(void | type)
subRoutineName
( parameterList )
subroutineBody
```
#### `parameterList`
@ -367,5 +368,3 @@ TODO
|keyword,symbol,integerConstant,stringConstant,identifier|class,classVarDec,subroutineDec,parameterList,subroutineBody,varDec|
||statements,whileStatement,ifStatement,returnStatement,letStatement,doStatement|
||expression,term,expressionList|

@ -1,158 +1,8 @@
from enum import Enum
import re
from tokenizer import JackTokenizer
import sys
from html import escape
class Token(Enum):
    """Lexical categories a Jack token can belong to."""
    KEYWORD = 1
    SYMBOL = 2
    IDENTIFIER = 3
    INTEGERCONSTANT = 4
    STRINGCONSTANT = 5
    # Fallback for tokens matching no known category; never returned by
    # JackTokenizer.tokenType (its else-branch returns IDENTIFIER).
    UNKNOWN = 6
class Keyword(Enum):
    """The 21 reserved keywords of the Jack language, one member each."""
    CLASS = 1
    METHOD = 2
    FUNCTION = 3
    CONSTRUCTOR = 4
    INT = 5
    BOOLEAN = 6
    CHAR = 7
    VOID = 8
    VAR = 9
    STATIC = 10
    FIELD = 11
    LET = 12
    DO = 13
    IF = 14
    ELSE = 15
    WHILE = 16
    RETURN = 17
    TRUE = 18
    FALSE = 19
    NULL = 20
    THIS = 21
class JackAnalyzer:
    """Top-level driver for the Jack analyzer.

    Placeholder: not yet implemented.
    """
    def __init__(self):
        pass
class JackTokenizer:
    """Tokenizes a .jack source file into a flat list of Jack tokens.

    The whole file is tokenized eagerly in __init__; callers then walk the
    token stream with current_token() / advance() / has_more_tokens().
    """

    # All 21 Jack keywords; frozenset gives O(1) membership tests.
    _KEYWORDS = frozenset([
        'class', 'constructor', 'function', 'method', 'field', 'static',
        'var', 'int', 'char', 'boolean', 'void', 'true', 'false', 'null',
        'this', 'let', 'do', 'if', 'else', 'while', 'return',
    ])
    # Compiled once at class level instead of on every call; raw strings
    # avoid invalid-escape warnings for \d etc.
    _SYMBOL_RE = re.compile(r"[{}()\[\].,;+\-*/&|<>=~]")
    _INT_RE = re.compile(r"\d+")
    _STRING_RE = re.compile(r"\".*\"")
    # Capturing group so re.split keeps the matched tokens. Identifiers and
    # keywords share one alternative and are classified later by tokenType;
    # this also fixes the old bug where an identifier with a keyword prefix
    # (e.g. "classy") was split into "class" + "y".
    _TOKEN_RE = re.compile(r"([{}()\[\].,;+\-*/&|<>=~]|[a-zA-Z_]\w*|\".*\")")

    def __init__(self, filename, print_xml=False):
        """Tokenize *filename*; optionally write the <filename>T.xml dump.

        :param filename: path to the .jack source file
        :param print_xml: when True, immediately emit the token XML file
        """
        self.ptr = 0
        self.insideMultiLineComment = False
        self.tokens = []
        # Context manager fixes the old file-handle leak (file was opened
        # and never closed).
        with open(filename, 'r') as f:
            for line in f:
                self.tokens += self.parse_line(line)
        if print_xml:
            self.print_xml(self.xml_file(filename))

    def tokenType(self):
        """Return the Token category of the current token."""
        t = self.current_token()
        if t in JackTokenizer._KEYWORDS:
            return Token.KEYWORD
        elif JackTokenizer._SYMBOL_RE.match(t):
            return Token.SYMBOL
        elif JackTokenizer._INT_RE.match(t):
            return Token.INTEGERCONSTANT
        elif JackTokenizer._STRING_RE.match(t):
            return Token.STRINGCONSTANT
        else:
            return Token.IDENTIFIER

    def printable_token(self):
        """Return the current token as XML-safe text.

        String constants lose their surrounding double quotes; everything
        else is HTML-escaped (quotes included) for the XML output.
        """
        if self.tokenType() == Token.STRINGCONSTANT:
            return self.current_token()[1:-1]
        else:
            return escape(self.current_token(), True)

    def symbol(self):
        """Return the symbol character which is the current token."""
        if self.tokenType() != Token.SYMBOL:
            raise RuntimeError("Should only be called when tokenType is SYMBOL")
        # Bug fix: the original validated the type but never returned the token.
        return self.current_token()

    def identifier(self):
        """Return the identifier which is the current token."""
        if self.tokenType() != Token.IDENTIFIER:
            raise RuntimeError("Should only be called when tokenType is IDENTIFIER")
        # Bug fix: the original validated the type but never returned the token.
        return self.current_token()

    def intVal(self):
        """Return the integer value of the current token."""
        if self.tokenType() != Token.INTEGERCONSTANT:
            raise RuntimeError("Should only be called when tokenType is INTEGERCONSTANT")
        # Bug fix: self.token never existed; use the current token instead.
        return int(self.current_token())

    def parse_line(self, line):
        """Return the list of tokens on *line*, handling // and /* */ comments.

        Multi-line comment state is carried across calls in
        self.insideMultiLineComment.
        """
        line = line.strip()
        # If this line has a single-line comment anywhere,
        # strip the line from the start of //.
        if line.find("//") != -1:
            line = line[:line.find("//")].strip()
        if self.insideMultiLineComment:
            if line.find("*/") == -1:
                # The comment doesn't end on this line.
                return []
            else:
                self.insideMultiLineComment = False
                # Bug fix: the original kept the text BEFORE */ (which is
                # still comment) and dropped the code after it; keep the
                # code that follows the comment close instead.
                line = line[line.find("*/") + 2:].strip()
        # Same for a multi-line comment opening, but this time
        # also set insideMultiLineComment = True.
        elif line.find("/*") != -1:
            if line.find("*/") != -1:
                # The comment ends on the same line.
                # TODO: this also breaks on /* inside strings :(
                # TODO: this also breaks on multiple multi-line comments on the same line
                line = (line[:line.find("/*")] + line[line.find("*/") + 2:]).strip()
            else:
                line = line[:line.find("/*")].strip()
                self.insideMultiLineComment = True
        # We don't need no empty lines.
        if len(line) == 0:
            return []
        # The regex captures, in order:
        # 1. Symbols
        # 2. Identifiers and keywords (classified later by tokenType)
        # 3. String constants
        return [e.strip() for e in JackTokenizer._TOKEN_RE.split(line)
                if e is not None and e.strip() != '']

    def has_more_tokens(self):
        """True while the pointer has not run past the token list."""
        return self.ptr < len(self.tokens)

    def current_token(self):
        """Return the token at the current pointer position."""
        return self.tokens[self.ptr]

    def advance(self):
        """Move the pointer to the next token."""
        self.ptr += 1

    def xml_file(self, jack_file):
        """Return the token-XML filename for *jack_file* (appends "T.xml")."""
        return jack_file + "T.xml"

    def print_xml(self, xml_filename):
        """Write every remaining token as one XML row to *xml_filename*.

        Consumes the token stream: the pointer is at the end afterwards.
        """
        with open(xml_filename, 'w') as f:
            f.write("<tokens>\n")
            while self.has_more_tokens():
                f.write("<{type}> {value} </{type}>\n".format(
                    type=self.tokenType().name.lower(),
                    value=self.printable_token()))
                self.advance()
            f.write("</tokens>\n")
class CompilationEngine:
    """Recursive-descent parser over the token stream.

    Placeholder: not yet implemented (see the commented-out calls in main).
    """
    def __init__(self):
        pass
if __name__ == '__main__':
    # Tokenize the .jack file given on the command line and emit its
    # <file>T.xml token listing.
    # Bug fix: the tokenizer was constructed twice with identical arguments,
    # tokenizing the file and writing the XML dump twice; once is enough.
    jt = JackTokenizer(sys.argv[1], True)
    # c = CompilationEngine(sys.argv[1])
    # c.CompileClass()

@ -0,0 +1,48 @@
from enum import Enum,Flag,auto
class Keyword(Flag):
    """One member per terminal the tokenizer can classify.

    NOTE(review): despite the name, this Flag covers symbols and the generic
    token categories (IDENTIFIER, INTEGERCONSTANT, STRINGCONSTANT) as well as
    the 21 language keywords. Flag (rather than Enum) was presumably chosen
    so members can be OR-ed into acceptance sets - confirm against the
    compilation engine's usage.
    """
    # The 21 Jack language keywords.
    CLASS = auto()
    METHOD = auto()
    FUNCTION = auto()
    CONSTRUCTOR = auto()
    INT = auto()
    BOOLEAN = auto()
    CHAR = auto()
    VOID = auto()
    VAR = auto()
    STATIC = auto()
    FIELD = auto()
    LET = auto()
    DO = auto()
    IF = auto()
    ELSE = auto()
    WHILE = auto()
    RETURN = auto()
    TRUE = auto()
    FALSE = auto()
    NULL = auto()
    THIS = auto()
    # Symbols start here.
    BRACE_OPEN = auto()
    BRACE_CLOSE = auto()
    # NOTE(review): "PARAN" is a misspelling of "PAREN"; renaming would break
    # JackTokenizer.SYMBOL_MAP in tokenizer.py, so it is left as-is.
    PARAN_OPEN = auto()
    PARAN_CLOSE = auto()
    SQUARE_OPEN = auto()
    SQUARE_CLOSE = auto()
    DOT = auto()
    SEMICOLON = auto()
    PLUS = auto()
    MINUS = auto()
    MUL = auto()
    DIV = auto()
    AND = auto()
    OR = auto()
    LT = auto()
    GT = auto()
    EQ = auto()
    NOT = auto()
    COMMA = auto()
    # Other token categories (not keywords or symbols).
    IDENTIFIER = auto()
    INTEGERCONSTANT = auto()
    STRINGCONSTANT = auto()

@ -0,0 +1,182 @@
import re
from keywords import Keyword
from html import escape
from enum import Enum
# Superclass in some sense
# Superclass in some sense
class Token(Enum):
    """Coarse token categories, used to test a token against a whole class
    of Keyword members (any keyword / any symbol) rather than a single one.
    """
    KEYWORD = 1
    SYMBOL = 2
class JackTokenizer:
    """Tokenizes a .jack source file into a flat list of Jack tokens.

    The whole file is tokenized eagerly in __init__; callers then walk the
    token stream with current_token() / advance() / has_more_tokens().
    Token classification returns Keyword members (see keywords.py).
    """

    # Maps each one-character Jack symbol to its Keyword member.
    SYMBOL_MAP = {
        '{': Keyword.BRACE_OPEN,
        '}': Keyword.BRACE_CLOSE,
        '(': Keyword.PARAN_OPEN,
        ')': Keyword.PARAN_CLOSE,
        '[': Keyword.SQUARE_OPEN,
        ']': Keyword.SQUARE_CLOSE,
        '.': Keyword.DOT,
        ';': Keyword.SEMICOLON,
        '+': Keyword.PLUS,
        '-': Keyword.MINUS,
        '*': Keyword.MUL,
        '/': Keyword.DIV,
        '&': Keyword.AND,
        '|': Keyword.OR,
        '<': Keyword.LT,
        '>': Keyword.GT,
        '=': Keyword.EQ,
        '~': Keyword.NOT,
        ',': Keyword.COMMA,
    }
    # Maps each reserved word to its Keyword member.
    KEYWORD_MAP = {
        "class": Keyword.CLASS,
        "method": Keyword.METHOD,
        "function": Keyword.FUNCTION,
        "constructor": Keyword.CONSTRUCTOR,
        "int": Keyword.INT,
        "boolean": Keyword.BOOLEAN,
        "char": Keyword.CHAR,
        "void": Keyword.VOID,
        "var": Keyword.VAR,
        "static": Keyword.STATIC,
        "field": Keyword.FIELD,
        "let": Keyword.LET,
        "do": Keyword.DO,
        "if": Keyword.IF,
        "else": Keyword.ELSE,
        "while": Keyword.WHILE,
        "return": Keyword.RETURN,
        "true": Keyword.TRUE,
        "false": Keyword.FALSE,
        "null": Keyword.NULL,
        "this": Keyword.THIS,
    }

    # Compiled once at class level instead of on every call; raw strings
    # avoid invalid-escape warnings for \d etc.
    _INT_RE = re.compile(r"\d+")
    _STRING_RE = re.compile(r"\".*\"")
    # Capturing group so re.split keeps the matched tokens. Identifiers and
    # keywords share one alternative and are classified later by tokenType;
    # this also fixes the old bug where an identifier with a keyword prefix
    # (e.g. "classy") was split into "class" + "y".
    _TOKEN_RE = re.compile(r"([{}()\[\].,;+\-*/&|<>=~]|[a-zA-Z_]\w*|\".*\")")

    def __init__(self, filename, print_xml=False):
        """Tokenize *filename*; optionally write the <filename>T.xml dump.

        :param filename: path to the .jack source file
        :param print_xml: when True, immediately emit the token XML file
        """
        self.ptr = 0
        self.insideMultiLineComment = False
        self.tokens = []
        # Context manager fixes the old file-handle leak (file was opened
        # and never closed).
        with open(filename, 'r') as f:
            for line in f:
                self.tokens += self.parse_line(line)
        if print_xml:
            self.print_xml(self.xml_file(filename))

    def tokenType(self):
        """Return the Keyword member classifying the current token."""
        t = self.current_token()
        # Dict membership replaces the old duplicated keyword list and the
        # per-call symbol regex; the dicts are the single source of truth.
        if t in JackTokenizer.KEYWORD_MAP:
            return JackTokenizer.KEYWORD_MAP[t]
        elif t in JackTokenizer.SYMBOL_MAP:
            return JackTokenizer.SYMBOL_MAP[t]
        elif JackTokenizer._INT_RE.match(t):
            return Keyword.INTEGERCONSTANT
        elif JackTokenizer._STRING_RE.match(t):
            return Keyword.STRINGCONSTANT
        else:
            # TODO: Put an assert to ensure valid identifier
            return Keyword.IDENTIFIER

    def printable_token(self):
        """Return the current token as XML-safe text.

        String constants lose their surrounding double quotes; everything
        else is HTML-escaped (quotes included) for the XML output.
        """
        if self.tokenType() == Keyword.STRINGCONSTANT:
            return self.current_token()[1:-1]
        else:
            return escape(self.current_token(), True)

    def assert_type(self, t):
        """Assert the current token matches *t*.

        *t* may be Token.SYMBOL / Token.KEYWORD (any symbol / any keyword)
        or a specific Keyword member.
        """
        if t == Token.SYMBOL:
            # Bug fix: bare SYMBOL_MAP/KEYWORD_MAP raised NameError inside a
            # method body; they must be qualified with the class.
            assert self.tokenType() in JackTokenizer.SYMBOL_MAP.values()
        elif t == Token.KEYWORD:
            assert self.tokenType() in JackTokenizer.KEYWORD_MAP.values()
        else:
            assert self.tokenType() == t

    def symbol(self):
        """Return the symbol character which is the current token."""
        self.assert_type(Token.SYMBOL)
        return self.current_token()

    def identifier(self):
        """Return the identifier which is the current token."""
        self.assert_type(Token.IDENTIFIER)
        return self.current_token()

    def intVal(self):
        """Return the integer value of the current token."""
        self.assert_type(Keyword.INTEGERCONSTANT)
        # Bug fix: self.token never existed; use the current token instead.
        return int(self.current_token())

    def parse_line(self, line):
        """Return the list of tokens on *line*, handling // and /* */ comments.

        Multi-line comment state is carried across calls in
        self.insideMultiLineComment.
        """
        line = line.strip()
        # If this line has a single-line comment anywhere,
        # strip the line from the start of //.
        if line.find("//") != -1:
            line = line[:line.find("//")].strip()
        if self.insideMultiLineComment:
            if line.find("*/") == -1:
                # The comment doesn't end on this line.
                return []
            else:
                self.insideMultiLineComment = False
                # Bug fix: the original kept the text BEFORE */ (which is
                # still comment) and dropped the code after it; keep the
                # code that follows the comment close instead.
                line = line[line.find("*/") + 2:].strip()
        # Same for a multi-line comment opening, but this time
        # also set insideMultiLineComment = True.
        elif line.find("/*") != -1:
            if line.find("*/") != -1:
                # The comment ends on the same line.
                # TODO: this also breaks on /* inside strings :(
                # TODO: this also breaks on multiple multi-line comments on the same line
                line = (line[:line.find("/*")] + line[line.find("*/") + 2:]).strip()
            else:
                line = line[:line.find("/*")].strip()
                self.insideMultiLineComment = True
        # We don't need no empty lines.
        if len(line) == 0:
            return []
        # The regex captures, in order:
        # 1. Symbols
        # 2. Identifiers and keywords (classified later by tokenType)
        # 3. String constants
        return [e.strip() for e in JackTokenizer._TOKEN_RE.split(line)
                if e is not None and e.strip() != '']

    def has_more_tokens(self):
        """True while the pointer has not run past the token list."""
        return self.ptr < len(self.tokens)

    def current_token(self):
        """Return the token at the current pointer position."""
        return self.tokens[self.ptr]

    def advance(self):
        """Move the pointer to the next token."""
        self.ptr += 1

    def xml_file(self, jack_file):
        """Return the token-XML filename for *jack_file* (appends "T.xml")."""
        return jack_file + "T.xml"

    def xml_row(self):
        """Return a single row of XML for the Compilation Engine.

        Symbols and keywords are collapsed back to the generic 'symbol' /
        'keyword' tags expected by the XML format.
        """
        t = self.tokenType()
        if t in JackTokenizer.SYMBOL_MAP.values():
            t = 'symbol'
        elif t in JackTokenizer.KEYWORD_MAP.values():
            t = 'keyword'
        else:
            t = t.name.lower()
        return "<{type}> {value} </{type}>\n".format(type=t, value=self.printable_token())

    def print_xml(self, xml_filename):
        """Write every remaining token as one XML row to *xml_filename*.

        Consumes the token stream: the pointer is at the end afterwards.
        """
        with open(xml_filename, 'w') as f:
            f.write("<tokens>\n")
            while self.has_more_tokens():
                f.write(self.xml_row())
                self.advance()
            f.write("</tokens>\n")
Loading…
Cancel
Save