"""Tokenizer scaffolding for a Jack-language compiler."""
from enum import Enum
import re


class Token(Enum):
    """Lexical categories a token can be classified into."""
    KEYWORD = 1
    SYMBOL = 2
    IDENTIFIER = 3
    INT_CONST = 4
    STRING_CONST = 5


class Keyword(Enum):
    """Reserved words; each member's lower-cased name is the keyword's spelling."""
    CLASS = 1
    METHOD = 2
    FUNCTION = 3
    CONSTRUCTOR = 4
    INT = 5
    BOOLEAN = 6
    CHAR = 7
    VOID = 8
    VAR = 9
    STATIC = 10
    FIELD = 11
    LET = 12
    DO = 13
    IF = 14
    ELSE = 15
    WHILE = 16
    RETURN = 17
    TRUE = 18
    FALSE = 19
    NULL = 20
    THIS = 21


class JackAnalyzer:
    """Placeholder; no behaviour implemented yet."""

    def __init__(self):
        pass


class JackTokenizer:
    """Split the lines of a source file into token strings, skipping comments.

    Attributes:
        insideMultiLineComment: True while a ``/* ... */`` comment opened on
            an earlier line is still unterminated.
        token: the current token string, or None. NOTE(review): nothing in
            this class assigns it yet; the accessors below read it.
        tokens: every token collected by the last ``advance()`` call.
        file: the open source file (closed once ``advance()`` has consumed it).
    """

    # Keyword spellings are derived from the enum so the two cannot drift.
    _KEYWORDS = frozenset(member.name.lower() for member in Keyword)

    # Single-character symbols; '.' was missing from the original pattern.
    _SYMBOLS = frozenset("{}()[].,;+-*/&|<>=~")

    # One token per match. Whitespace separates tokens and is never matched.
    _TOKEN_RE = re.compile(
        r'"[^"]*"?'                          # string constant, quotes kept
        r'|[{}()\[\].,;+\-*/&|<>=~]'         # single-character symbol
        r'|[^\s{}()\[\].,;+\-*/&|<>=~"]+'    # keyword / identifier / integer
    )

    _INT_RE = re.compile(r"[0-9]+")

    def __init__(self, filename):
        """Open ``filename`` for reading and reset the tokenizer state."""
        self.insideMultiLineComment = False
        self.token = None
        self.tokens = []
        self.file = open(filename, 'r')

    def tokenType(self):
        """Return the ``Token`` category of ``self.token``.

        Returns None when there is no current token.
        """
        current = self.token
        if current is None:
            return None
        if current in self._KEYWORDS:
            return Token.KEYWORD
        if current in self._SYMBOLS:
            return Token.SYMBOL
        if current.startswith('"'):
            return Token.STRING_CONST
        if self._INT_RE.fullmatch(current):
            return Token.INT_CONST
        return Token.IDENTIFIER

    def symbol(self):
        """Return the character which is the current token.

        Raises:
            RuntimeError: if the current token is not a SYMBOL.
        """
        if self.tokenType() != Token.SYMBOL:
            raise RuntimeError("Should only be called when tokenType is SYMBOL")
        return self.token

    def identifier(self):
        """Return the identifier which is the current token.

        Raises:
            RuntimeError: if the current token is not an IDENTIFIER.
        """
        if self.tokenType() != Token.IDENTIFIER:
            raise RuntimeError("Should only be called when tokenType is IDENTIFIER")
        return self.token

    def intVal(self):
        """Return the integer value of the current token.

        Raises:
            RuntimeError: if the current token is not an INT_CONST.
        """
        if self.tokenType() != Token.INT_CONST:
            raise RuntimeError("Should only be called when tokenType is INT_CONST")
        return int(self.token)

    def _strip_comments(self, line):
        """Return ``line`` with ``//`` and ``/* */`` comments removed.

        Scans left to right so comment markers inside string constants are
        left alone, any number of ``/* */`` comments on one line is handled,
        and ``insideMultiLineComment`` is updated for the following line.
        """
        kept = []
        pos = 0
        length = len(line)
        while pos < length:
            if self.insideMultiLineComment:
                close = line.find("*/", pos)
                if close == -1:
                    # The comment does not end on this line.
                    break
                self.insideMultiLineComment = False
                pos = close + 2
                # A comment separates tokens just like whitespace does.
                kept.append(" ")
            elif line.startswith("//", pos):
                break
            elif line.startswith("/*", pos):
                self.insideMultiLineComment = True
                pos += 2
            elif line[pos] == '"':
                close = line.find('"', pos + 1)
                # An unterminated string swallows the rest of the line.
                stop = length if close == -1 else close + 1
                kept.append(line[pos:stop])
                pos = stop
            else:
                kept.append(line[pos])
                pos += 1
        return "".join(kept)

    def parse_line(self, line):
        """Return the list of token strings found on ``line``.

        Comments are skipped; a blank or comment-only line yields ``[]``.
        String constants are returned as a single token, quotes included.
        """
        return self._TOKEN_RE.findall(self._strip_comments(line))

    def advance(self):
        """Tokenize the whole file into ``self.tokens`` and print the result.

        The file is closed after it has been read; later calls leave
        ``self.tokens`` empty, as re-reading an exhausted file did before.
        """
        self.tokens = []
        if not self.file.closed:
            with self.file:
                for line in self.file:
                    self.tokens += self.parse_line(line)

        print(self.tokens)


class CompilationEngine:
    """Placeholder; no behaviour implemented yet."""

    def __init__(self):
        pass


if __name__ == '__main__':
    jt = JackTokenizer("../projects/10/Square/Square.jack")
    jt.advance()