diff --git a/.gitignore b/.gitignore index c50f923..c2d2941 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ tools/ +**/__pycache__/ diff --git a/NOTES.md b/NOTES.md index 4d296fc..61d5561 100644 --- a/NOTES.md +++ b/NOTES.md @@ -105,3 +105,9 @@ I can definitely solve it, but I want to do it properly. I've also realized why 2. PHP's language documentation is aimed at users, while Python throws so much useless stuff at you. I'm yet to find a language documentation that rivals PHP, to be fair - but Python gets so much wrong. Searching for "condition" on Python docs gets you: a page on something called Condition Objects, Conditional Expressions, and "More on conditions" - none of which actually detail what are the conditional statements and how they work. Look at the [control structures](https://www.php.net/manual/en/language.control-structures.php) page on PHP website instead. Python docs also like talking about language implementation details too much. For eg, BNF notation is peppered throughout the docs. PHP on the other hand uses only one language in its docs - PHP. 3. Lack of examples in documentation. You're left to figure out so many things. PHP gets this correct, for every function in the standard library. If examples are missing, the comments will usually have them. 4. 
Static Typing + + +## Compilation Engine + +I'm hard-coding stuff a lot, with a lot of asserts +would be nice once I have structure to actually generate the rules from the GRAMMAR diff --git a/compiler/README.md b/compiler/README.md index 66e19a3..574c992 100644 --- a/compiler/README.md +++ b/compiler/README.md @@ -251,7 +251,7 @@ let do if else while return #### `symbol` -`{}()[].m;+-*/&|<>=~` +`{}()[].;+-*/&|<>=~` #### `integerConstant` @@ -296,6 +296,7 @@ int | char | boolean | className (void | type) subRoutineName ( parameterList ) +subroutineBody ``` #### `parameterList` @@ -367,5 +368,3 @@ TODO |keyword,symbol,integerConstant,stringConstant,identifier|class,classVarDec,subroutineDec,parameterList,subroutineBody,varDec| ||statements,whileStatement,ifStatement,returnStatement,letStatement,doStatement| ||expression,term,expressionList| - - diff --git a/compiler/__init__.py b/compiler/__init__.py index cdf5d25..89f550f 100644 --- a/compiler/__init__.py +++ b/compiler/__init__.py @@ -1,158 +1,8 @@ -from enum import Enum -import re +from tokenizer import JackTokenizer import sys -from html import escape - -class Token(Enum): - KEYWORD = 1 - SYMBOL = 2 - IDENTIFIER = 3 - INTEGERCONSTANT = 4 - STRINGCONSTANT = 5 - UNKNOWN = 6 - -class Keyword(Enum): - CLASS = 1 - METHOD = 2 - FUNCTION = 3 - CONSTRUCTOR = 4 - INT = 5 - BOOLEAN = 6 - CHAR = 7 - VOID = 8 - VAR = 9 - STATIC = 10 - FIELD = 11 - LET = 12 - DO = 13 - IF = 14 - ELSE = 15 - WHILE = 16 - RETURN = 17 - TRUE = 18 - FALSE = 19 - NULL = 20 - THIS = 21 - -class JackAnalyzer: - def __init__(self): - pass - -class JackTokenizer: - - """ Returns the type of the current token """ - def tokenType(self): - t = self.current_token() - if t in ['class','constructor','function','method','field','static','var','int','char','boolean','void','true','false','null','this','let','do','if','else','while','return']: - return Token.KEYWORD - elif re.compile("(\(|\)|\[|\]|,|\+|-|;|<|>|=|~|&|{|}|\*|\/|\||\.)").match(t): - return 
Token.SYMBOL - elif re.compile("\d+").match(t): - return Token.INTEGERCONSTANT - elif re.compile("\".*\"").match(t): - return Token.STRINGCONSTANT - else: - return Token.IDENTIFIER - pass - - def printable_token(self): - if self.tokenType() == Token.STRINGCONSTANT: - return self.current_token()[1:-1] - else: - return escape(self.current_token(), True) - - """ Returns the character which is the current token """ - def symbol(self): - if self.tokenType() != Token.SYMBOL: - raise RuntimeError("Should only be called when tokenType is SYMBOL") - - """ Returns the identifier which is the current token """ - def identifier(self): - if self.tokenType() != Token.IDENTIFIER: - raise RuntimeError("Should only be called when tokenType is IDENTIFIER") - - """ Returns the integer value of the current token """ - def intVal(self): - if self.tokenType() != Token.INTEGERCONSTANT: - raise RuntimeError("Should only be called when tokenType is INTEGERCONSTANT") - return int(self.token) - - """ Returns a list of tokens for that line """ - def parse_line(self, line): - line = line.strip() - # If this line as a single line comment anywhere - # strip the line to start of // - if line.find("//") != -1: - line = line[:line.find("//")].strip() - - if self.insideMultiLineComment: - if line.find("*/") == -1: - # The comment doesn't end in this line - return [] - else: - self.insideMultiLineComment = False - # comments ends here, huzzah! 
- line = line[:line.find("*/")].strip() - - # Same for the multi-line comment, but this time - # Also set insideMultiLineComment = true - elif line.find("/*") != -1: - # The comment ends on the same line - if line.find("*/") != -1: - # TODO: this also breaks on /* inside strings :( - # TODO: This also breaks on multiple multi-line comments on the same line - line = line[:line.find("/*")] + line[line.find("*/") + 2:].strip() - else: - line = line[:line.find("/*")].strip() - self.insideMultiLineComment = True - - # We don't need no empty lines - if len(line) == 0: - return [] - else: - # Regex contains 3 parts: - # 1. Keywords - # 2. Symbols - # 3. Identifiers - # 4. Strings - regex = re.compile("(class|constructor|function|method|field|static|var|int|char|boolean|void|true|false|null|this|let|do|if|else|while|return|\(|\)|\[|\]|,|\+|-|;|<|>|=|~|&|{|}|\*|\/|\||\.|[a-zA-Z_]+\w*|\".*\")") - return [e.strip() for e in regex.split(line) if e != None and e.strip()!=''] - - def has_more_tokens(self): - return self.ptr < len(self.tokens) - - def current_token(self): - return self.tokens[self.ptr] - - def advance(self): - self.ptr += 1 - - def __init__(self, filename, print_xml=False): - self.ptr = 0 - self.insideMultiLineComment = False - self.file = open(filename, 'r') - self.tokens = [] - for line in self.file: - self.tokens += self.parse_line(line) - - if(print_xml): - self.print_xml(self.xml_file(filename)) - - def xml_file(self, jack_file): - return jack_file + "T.xml" - - def print_xml(self, xml_filename): - with open(xml_filename, 'w') as f: - f.write("\n") - while self.has_more_tokens(): - f.write("<{type}> {value} \n".format(type=self.tokenType().name.lower(), value=self.printable_token())) - self.advance() - f.write("\n") - -class CompilationEngine: - def __init__(self): - pass if __name__ == '__main__': - jt = JackTokenizer(sys.argv[1], True) + j = JackTokenizer(sys.argv[1], True) + # c = CompilationEngine(sys.argv[1]) + # c.CompileClass() diff --git 
a/compiler/keywords.py b/compiler/keywords.py new file mode 100644 index 0000000..5260b14 --- /dev/null +++ b/compiler/keywords.py @@ -0,0 +1,48 @@ +from enum import Enum,Flag,auto + +class Keyword(Flag): + CLASS = auto() + METHOD = auto() + FUNCTION = auto() + CONSTRUCTOR = auto() + INT = auto() + BOOLEAN = auto() + CHAR = auto() + VOID = auto() + VAR = auto() + STATIC = auto() + FIELD = auto() + LET = auto() + DO = auto() + IF = auto() + ELSE = auto() + WHILE = auto() + RETURN = auto() + TRUE = auto() + FALSE = auto() + NULL = auto() + THIS = auto() + # Symbols Start here + BRACE_OPEN = auto() + BRACE_CLOSE = auto() + PARAN_OPEN = auto() + PARAN_CLOSE = auto() + SQUARE_OPEN = auto() + SQUARE_CLOSE = auto() + DOT = auto() + SEMICOLON = auto() + PLUS = auto() + MINUS = auto() + MUL = auto() + DIV = auto() + AND = auto() + OR = auto() + LT = auto() + GT = auto() + EQ = auto() + NOT = auto() + COMMA = auto() + # Other Tokens + IDENTIFIER = auto() + INTEGERCONSTANT = auto() + STRINGCONSTANT = auto() diff --git a/compiler/tokenizer.py b/compiler/tokenizer.py new file mode 100644 index 0000000..c5b8641 --- /dev/null +++ b/compiler/tokenizer.py @@ -0,0 +1,182 @@ +import re +from keywords import Keyword +from html import escape +from enum import Enum +# Superclass in some sense +class Token(Enum): + KEYWORD = 1 + SYMBOL = 2 + +class JackTokenizer: + SYMBOL_MAP = { + '{': Keyword.BRACE_OPEN , + '}': Keyword.BRACE_CLOSE , + '(': Keyword.PARAN_OPEN , + ')': Keyword.PARAN_CLOSE , + '[': Keyword.SQUARE_OPEN , + ']': Keyword.SQUARE_CLOSE , + '.': Keyword.DOT , + ';': Keyword.SEMICOLON , + '+': Keyword.PLUS , + '-': Keyword.MINUS , + '*': Keyword.MUL , + '/': Keyword.DIV , + '&': Keyword.AND , + '|': Keyword.OR , + '<': Keyword.LT , + '>': Keyword.GT , + '=': Keyword.EQ , + '~': Keyword.NOT , + ',': Keyword.COMMA, + } + + KEYWORD_MAP = { + "class": Keyword.CLASS, + "method": Keyword.METHOD, + "function": Keyword.FUNCTION, + "constructor": Keyword.CONSTRUCTOR, + "int": 
Keyword.INT,
+        "boolean": Keyword.BOOLEAN,
+        "char": Keyword.CHAR,
+        "void": Keyword.VOID,
+        "var": Keyword.VAR,
+        "static": Keyword.STATIC,
+        "field": Keyword.FIELD,
+        "let": Keyword.LET,
+        "do": Keyword.DO,
+        "if": Keyword.IF,
+        "else": Keyword.ELSE,
+        "while": Keyword.WHILE,
+        "return": Keyword.RETURN,
+        "true": Keyword.TRUE,
+        "false": Keyword.FALSE,
+        "null": Keyword.NULL,
+        "this" : Keyword.THIS
+    }
+    """ Returns the type of the current token """
+    def tokenType(self):
+        t = self.current_token()
+        if t in ['class','constructor','function','method','field','static','var','int','char','boolean','void','true','false','null','this','let','do','if','else','while','return']:
+            return JackTokenizer.KEYWORD_MAP[t]
+        elif re.compile("(\(|\)|\[|\]|,|\+|-|;|<|>|=|~|&|{|}|\*|\/|\||\.)").match(t):
+            return JackTokenizer.SYMBOL_MAP[t]
+        elif re.compile("\d+").match(t):
+            return Keyword.INTEGERCONSTANT
+        elif re.compile("\".*\"").match(t):
+            return Keyword.STRINGCONSTANT
+        else:
+            # TODO: Put an assert to ensure valid identifier
+            return Keyword.IDENTIFIER
+        pass
+
+    def printable_token(self):
+        if self.tokenType() == Keyword.STRINGCONSTANT:
+            return self.current_token()[1:-1]
+        else:
+            return escape(self.current_token(), True)
+
+    def assert_type(self, t):
+        if(t == Token.SYMBOL):
+            # Class attributes are not in scope inside a method body;
+            # qualify with the class name (as xml_row already does).
+            assert(self.tokenType() in JackTokenizer.SYMBOL_MAP.values())
+        elif(t == Token.KEYWORD):
+            assert(self.tokenType() in JackTokenizer.KEYWORD_MAP.values())
+        else:
+            assert(self.tokenType() == t)
+
+    """ Returns the character which is the current token """
+    def symbol(self):
+        self.assert_type(Token.SYMBOL)
+        return self.current_token()
+
+    """ Returns the identifier which is the current token """
+    def identifier(self):
+        # Token has no IDENTIFIER member; the identifier token type
+        # lives on the Keyword flag enum.
+        self.assert_type(Keyword.IDENTIFIER)
+        return self.current_token()
+
+    """ Returns the integer value of the current token """
+    def intVal(self):
+        self.assert_type(Keyword.INTEGERCONSTANT)
+        # No self.token attribute exists; read the current token.
+        return int(self.current_token())
+
+    """ Returns a list of tokens for that line """
+    def parse_line(self, line):
+        line = 
line.strip()
+        # If this line has a single line comment anywhere
+        # strip the line to start of //
+        if line.find("//") != -1:
+            line = line[:line.find("//")].strip()
+
+        if self.insideMultiLineComment:
+            if line.find("*/") == -1:
+                # The comment doesn't end in this line
+                return []
+            else:
+                self.insideMultiLineComment = False
+                # comment ends here, huzzah!
+                line = line[:line.find("*/")].strip()
+
+        # Same for the multi-line comment, but this time
+        # Also set insideMultiLineComment = true
+        elif line.find("/*") != -1:
+            # The comment ends on the same line
+            if line.find("*/") != -1:
+                # TODO: this also breaks on /* inside strings :(
+                # TODO: This also breaks on multiple multi-line comments on the same line
+                line = line[:line.find("/*")] + line[line.find("*/") + 2:].strip()
+            else:
+                line = line[:line.find("/*")].strip()
+                self.insideMultiLineComment = True
+
+        # We don't need no empty lines
+        if len(line) == 0:
+            return []
+        else:
+            # Regex contains 4 parts:
+            # 1. Keywords
+            # 2. Symbols
+            # 3. Identifiers
+            # 4. 
Strings
+            regex = re.compile("(class|constructor|function|method|field|static|var|int|char|boolean|void|true|false|null|this|let|do|if|else|while|return|\(|\)|\[|\]|,|\+|-|;|<|>|=|~|&|{|}|\*|\/|\||\.|[a-zA-Z_]+\w*|\".*\")")
+            return [e.strip() for e in regex.split(line) if e != None and e.strip()!='']
+
+    def has_more_tokens(self):
+        return self.ptr < len(self.tokens)
+
+    def current_token(self):
+        return self.tokens[self.ptr]
+
+    def advance(self):
+        self.ptr += 1
+
+    def __init__(self, filename, print_xml=False):
+        self.ptr = 0
+        self.insideMultiLineComment = False
+        self.file = open(filename, 'r')
+        self.tokens = []
+        for line in self.file:
+            self.tokens += self.parse_line(line)
+        # Close the handle once the whole file is tokenized; it is
+        # never read again and leaving it open leaks the descriptor.
+        self.file.close()
+
+        if(print_xml):
+            self.print_xml(self.xml_file(filename))
+
+    def xml_file(self, jack_file):
+        return jack_file + "T.xml"
+
+    """ Returns a single row of XML for the Compilation Engine """
+    def xml_row(self):
+        t = self.tokenType()
+        if t in JackTokenizer.SYMBOL_MAP.values():
+            t = 'symbol'
+        elif t in JackTokenizer.KEYWORD_MAP.values():
+            t = 'keyword'
+        else:
+            t = t.name.lower()
+        return "<{type}> {value} \n".format(type=t, value=self.printable_token())
+
+    def print_xml(self, xml_filename):
+        with open(xml_filename, 'w') as f:
+            f.write("\n")
+            while self.has_more_tokens():
+                f.write(self.xml_row())
+                self.advance()
+            f.write("\n")