import re
from enum import Enum
from html import escape

from keywords import Keyword


# Coarse token categories used by assert_type; the Keyword enum (keywords.py)
# carries the fine-grained token values.
class Token(Enum):
    KEYWORD = 1
    SYMBOL = 2


class JackTokenizer:
    SYMBOL_MAP = {
        '{': Keyword.BRACE_OPEN,
        '}': Keyword.BRACE_CLOSE,
        '(': Keyword.PARAN_OPEN,
        ')': Keyword.PARAN_CLOSE,
        '[': Keyword.SQUARE_OPEN,
        ']': Keyword.SQUARE_CLOSE,
        '.': Keyword.DOT,
        ';': Keyword.SEMICOLON,
        '+': Keyword.PLUS,
        '-': Keyword.MINUS,
        '*': Keyword.MUL,
        '/': Keyword.DIV,
        '&': Keyword.AND,
        '|': Keyword.OR,
        '<': Keyword.LT,
        '>': Keyword.GT,
        '=': Keyword.EQ,
        '~': Keyword.NOT,
        ',': Keyword.COMMA,
    }

    KEYWORD_MAP = {
        "class": Keyword.CLASS,
        "method": Keyword.METHOD,
        "function": Keyword.FUNCTION,
        "constructor": Keyword.CONSTRUCTOR,
        "int": Keyword.INT,
        "boolean": Keyword.BOOLEAN,
        "char": Keyword.CHAR,
        "void": Keyword.VOID,
        "var": Keyword.VAR,
        "static": Keyword.STATIC,
        "field": Keyword.FIELD,
        "let": Keyword.LET,
        "do": Keyword.DO,
        "if": Keyword.IF,
        "else": Keyword.ELSE,
        "while": Keyword.WHILE,
        "return": Keyword.RETURN,
        "true": Keyword.TRUE,
        "false": Keyword.FALSE,
        "null": Keyword.NULL,
        "this": Keyword.THIS,
    }

    def tokenType(self):
        """Returns the type of the current token."""
        t = self.current_token()
        if t in JackTokenizer.KEYWORD_MAP:
            return JackTokenizer.KEYWORD_MAP[t]
        elif t in JackTokenizer.SYMBOL_MAP:
            return JackTokenizer.SYMBOL_MAP[t]
        elif re.match(r"\d+", t):
            return Keyword.INTEGERCONSTANT
        elif re.match(r"\".*\"", t):
            return Keyword.STRINGCONSTANT
        else:
            # TODO: Put an assert to ensure valid identifier
            return Keyword.IDENTIFIER

    def printable_token(self):
        """Returns the current token in a form safe to embed in XML."""
        if self.tokenType() == Keyword.STRINGCONSTANT:
            # Drop the surrounding double quotes of a string constant
            return self.current_token()[1:-1]
        else:
            return escape(self.current_token(), True)

    def assert_type(self, t):
        if t == Token.SYMBOL:
            assert self.tokenType() in JackTokenizer.SYMBOL_MAP.values()
        elif t == Token.KEYWORD:
            assert self.tokenType() in JackTokenizer.KEYWORD_MAP.values()
        else:
            assert self.tokenType() == t

    def symbol(self):
        """Returns the character which is the current token."""
        self.assert_type(Token.SYMBOL)
        return self.current_token()

    def identifier(self):
        """Returns the identifier which is the current token."""
        self.assert_type(Keyword.IDENTIFIER)
        return self.current_token()

    def intVal(self):
        """Returns the integer value of the current token."""
        self.assert_type(Keyword.INTEGERCONSTANT)
        return int(self.current_token())

    def parse_line(self, line):
        """Returns a list of tokens for the given line."""
        line = line.strip()

        # If this line has a single-line comment anywhere,
        # strip everything from the start of //
        if line.find("//") != -1:
            line = line[:line.find("//")].strip()

        if self.insideMultiLineComment:
            if line.find("*/") == -1:
                # The comment doesn't end on this line
                return []
            else:
                # The comment ends here, huzzah! Keep whatever follows it
                self.insideMultiLineComment = False
                line = line[line.find("*/") + 2:].strip()
        # Same for the start of a multi-line comment, but this time
        # also set insideMultiLineComment = True
        elif line.find("/*") != -1:
            if line.find("*/") != -1:
                # The comment ends on the same line
                # TODO: this also breaks on /* inside strings :(
                # TODO: this also breaks on multiple multi-line comments on the same line
                line = line[:line.find("/*")] + line[line.find("*/") + 2:].strip()
            else:
                line = line[:line.find("/*")].strip()
                self.insideMultiLineComment = True

        # We don't need no empty lines
        if len(line) == 0:
            return []
        else:
            # Regex contains 4 parts:
            # 1. Keywords (matched as whole words so identifiers like "doSomething" aren't split)
            # 2. Symbols
            # 3. Identifiers
            # 4. Strings
            regex = re.compile(
                r"((?:class|constructor|function|method|field|static|var|int|char|boolean|void"
                r"|true|false|null|this|let|do|if|else|while|return)\b"
                r"|\(|\)|\[|\]|,|\+|-|;|<|>|=|~|&|{|}|\*|\/|\||\."
                r"|[a-zA-Z_]+\w*|\".*\")"
            )
            return [e.strip() for e in regex.split(line) if e is not None and e.strip() != '']

    def has_more_tokens(self):
        return self.ptr < len(self.tokens)

    def current_token(self):
        return self.tokens[self.ptr]

    def advance(self):
        self.ptr += 1

    def __init__(self, filename, print_xml=False):
        self.ptr = 0
        self.insideMultiLineComment = False
        self.tokens = []
        with open(filename, 'r') as f:
            for line in f:
                self.tokens += self.parse_line(line)
        if print_xml:
            self.print_xml(self.xml_file(filename))

    def xml_file(self, jack_file):
        return jack_file + "T.xml"

    def xml_row(self):
        """Returns a single row of XML for the CompilationEngine."""
        t = self.tokenType()
        if t in JackTokenizer.SYMBOL_MAP.values():
            t = 'symbol'
        elif t in JackTokenizer.KEYWORD_MAP.values():
            t = 'keyword'
        else:
            t = t.name.lower()
        return "<{type}> {value} </{type}>\n".format(type=t, value=self.printable_token())

    def print_xml(self, xml_filename):
        with open(xml_filename, 'w') as f:
            f.write("<tokens>\n")
            while self.has_more_tokens():
                f.write(self.xml_row())
                self.advance()
            f.write("</tokens>\n")
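

# A minimal usage sketch, not part of the original module: it assumes a Jack
# source file path is passed on the command line (the file name below is
# hypothetical) and that keywords.py defines the Keyword members referenced
# above. It drives the tokenizer the way a CompilationEngine would:
# inspect the current token, then advance.
if __name__ == "__main__":
    import sys

    # e.g. `python tokenizer.py Main.jack` (module and file names are illustrative)
    tokenizer = JackTokenizer(sys.argv[1], print_xml=False)
    while tokenizer.has_more_tokens():
        print(tokenizer.tokenType(), tokenizer.printable_token())
        tokenizer.advance()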