# nand2tetris/compiler/tokenizer.py
#
# 183 lines
# 5.4 KiB
# Python
# Raw Normal View History

import re
from keywords import Keyword
from html import escape
from enum import Enum
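# Coarse token categories: lets JackTokenizer.assert_type() check for
# "any keyword" or "any symbol" instead of one specific Keyword member.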
class Token(Enum):
    """Coarse token categories accepted by ``JackTokenizer.assert_type``.

    Passing one of these members asks "is the current token any keyword /
    any symbol?" rather than comparing against a single ``Keyword`` value.
    """

    KEYWORD = 1  # matches every value in JackTokenizer.KEYWORD_MAP
    SYMBOL = 2  # matches every value in JackTokenizer.SYMBOL_MAP
class JackTokenizer:
    """Tokenizer for a Jack source file.

    The whole file is tokenized when the object is constructed. Afterwards
    ``has_more_tokens`` / ``current_token`` / ``advance`` act as a cursor
    over ``self.tokens`` (a list of token strings; string constants keep
    their surrounding double quotes).
    """

    SYMBOL_MAP = {
        '{': Keyword.BRACE_OPEN,
        '}': Keyword.BRACE_CLOSE,
        '(': Keyword.PARAN_OPEN,
        ')': Keyword.PARAN_CLOSE,
        '[': Keyword.SQUARE_OPEN,
        ']': Keyword.SQUARE_CLOSE,
        '.': Keyword.DOT,
        ';': Keyword.SEMICOLON,
        '+': Keyword.PLUS,
        '-': Keyword.MINUS,
        '*': Keyword.MUL,
        '/': Keyword.DIV,
        '&': Keyword.AND,
        '|': Keyword.OR,
        '<': Keyword.LT,
        '>': Keyword.GT,
        '=': Keyword.EQ,
        '~': Keyword.NOT,
        ',': Keyword.COMMA,
    }

    KEYWORD_MAP = {
        "class": Keyword.CLASS,
        "method": Keyword.METHOD,
        "function": Keyword.FUNCTION,
        "constructor": Keyword.CONSTRUCTOR,
        "int": Keyword.INT,
        "boolean": Keyword.BOOLEAN,
        "char": Keyword.CHAR,
        "void": Keyword.VOID,
        "var": Keyword.VAR,
        "static": Keyword.STATIC,
        "field": Keyword.FIELD,
        "let": Keyword.LET,
        "do": Keyword.DO,
        "if": Keyword.IF,
        "else": Keyword.ELSE,
        "while": Keyword.WHILE,
        "return": Keyword.RETURN,
        "true": Keyword.TRUE,
        "false": Keyword.FALSE,
        "null": Keyword.NULL,
        "this": Keyword.THIS,
    }

    # Single left-to-right scanner for one source line. Alternatives are
    # ordered so that comments win over the '/' symbol, and a string is
    # consumed whole so that '//' or '/*' inside it is not taken for a
    # comment. Keywords are deliberately NOT listed: a whole word is matched
    # first and tokenType() decides afterwards whether it is a keyword, so an
    # identifier such as 'double' is never split into 'do' + 'uble'.
    # The final catch-all keeps stray characters as tokens instead of
    # silently dropping them.
    _LEXEME_RE = re.compile(
        r'''
          (?P<line_comment>//.*)
        | (?P<block_comment>/\*.*?\*/)
        | (?P<open_comment>/\*.*)
        | (?P<string>"[^"\n]*")
        | (?P<word>[A-Za-z_]\w*)
        | (?P<integer>\d+)
        | (?P<symbol>[{}()\[\].,;+\-*/&|<>=~])
        | (?P<other>\S)
        ''',
        re.VERBOSE,
    )
    _INTEGER_RE = re.compile(r'\d+')
    _STRING_RE = re.compile(r'"[^"\n]*"')

    def __init__(self, filename, print_xml=False):
        """Read and tokenize ``filename``.

        If ``print_xml`` is true, the token stream is also written to
        ``xml_file(filename)``. Note that ``print_xml`` advances the cursor
        to the end of the token list while writing.
        """
        self.ptr = 0
        self.insideMultiLineComment = False
        self.tokens = []
        # The handle is closed as soon as the file has been read; the
        # attribute is kept (now pointing at the closed handle) only so that
        # existing code touching ``self.file`` does not hit AttributeError.
        with open(filename, 'r') as source:
            self.file = source
            for line in source:
                self.tokens += self.parse_line(line)
        if print_xml:
            self.print_xml(self.xml_file(filename))

    def tokenType(self):
        """Return the ``Keyword`` member describing the current token.

        Keywords and symbols map to their specific member; everything else
        is INTEGERCONSTANT, STRINGCONSTANT or IDENTIFIER.
        """
        token = self.current_token()
        if token in JackTokenizer.KEYWORD_MAP:
            return JackTokenizer.KEYWORD_MAP[token]
        if token in JackTokenizer.SYMBOL_MAP:
            return JackTokenizer.SYMBOL_MAP[token]
        if JackTokenizer._INTEGER_RE.fullmatch(token):
            return Keyword.INTEGERCONSTANT
        if JackTokenizer._STRING_RE.fullmatch(token):
            return Keyword.STRINGCONSTANT
        # TODO: Put an assert to ensure valid identifier
        return Keyword.IDENTIFIER

    def printable_token(self):
        """Return the current token as it is written into the XML output.

        String constants lose their surrounding quotes (and are not
        escaped); every other token is HTML-escaped, quotes included.
        """
        if self.tokenType() == Keyword.STRINGCONSTANT:
            return self.current_token()[1:-1]
        return escape(self.current_token(), True)

    def assert_type(self, t):
        """Raise ``AssertionError`` unless the current token is of type ``t``.

        ``t`` is either ``Token.SYMBOL`` / ``Token.KEYWORD`` (any symbol /
        any keyword) or a specific ``Keyword`` member to compare against.
        """
        actual = self.tokenType()
        if t == Token.SYMBOL:
            matches = actual in JackTokenizer.SYMBOL_MAP.values()
        elif t == Token.KEYWORD:
            matches = actual in JackTokenizer.KEYWORD_MAP.values()
        else:
            matches = actual == t
        if not matches:
            # Raised explicitly (rather than via ``assert``) so the check is
            # not stripped when Python runs with -O; the exception type is
            # the same one callers saw before.
            raise AssertionError(
                "expected {} but current token {!r} is {}".format(
                    t, self.current_token(), actual))

    def symbol(self):
        """Return the character which is the current token."""
        self.assert_type(Token.SYMBOL)
        return self.current_token()

    def identifier(self):
        """Return the identifier which is the current token."""
        self.assert_type(Keyword.IDENTIFIER)
        return self.current_token()

    def intVal(self):
        """Return the integer value of the current token."""
        self.assert_type(Keyword.INTEGERCONSTANT)
        return int(self.current_token())

    def parse_line(self, line):
        """Return the list of tokens found on ``line``.

        ``//`` comments and ``/* ... */`` comments are dropped.
        ``self.insideMultiLineComment`` carries the "still inside a block
        comment" state from one call to the next, so lines must be fed in
        file order.
        """
        start = 0
        if self.insideMultiLineComment:
            end = line.find("*/")
            if end == -1:
                # The comment doesn't end in this line
                return []
            self.insideMultiLineComment = False
            # Resume scanning right after the comment terminator.
            start = end + 2

        tokens = []
        for match in JackTokenizer._LEXEME_RE.finditer(line, start):
            kind = match.lastgroup
            if kind == "line_comment":
                break
            if kind == "block_comment":
                continue
            if kind == "open_comment":
                # Block comment that continues on the following line(s).
                self.insideMultiLineComment = True
                break
            tokens.append(match.group())
        return tokens

    def has_more_tokens(self):
        """Return True while the cursor points at a token."""
        return self.ptr < len(self.tokens)

    def current_token(self):
        """Return the token string under the cursor."""
        return self.tokens[self.ptr]

    def advance(self):
        """Move the cursor to the next token."""
        self.ptr += 1

    def xml_file(self, jack_file):
        """Return the XML output path for ``jack_file`` (suffix ``T.xml``)."""
        return jack_file + "T.xml"

    def xml_row(self):
        """Return a single row of XML for the Compilation Engine."""
        kind = self.tokenType()
        if kind in JackTokenizer.SYMBOL_MAP.values():
            tag = 'symbol'
        elif kind in JackTokenizer.KEYWORD_MAP.values():
            tag = 'keyword'
        else:
            # NOTE(review): this yields e.g. 'integerconstant'; the
            # nand2tetris reference files appear to use camelCase tags
            # ('integerConstant') - confirm before comparing output.
            tag = kind.name.lower()
        return "<{type}> {value} </{type}>\n".format(
            type=tag, value=self.printable_token())

    def print_xml(self, xml_filename):
        """Write every remaining token to ``xml_filename`` as XML.

        The cursor is advanced past the last token as a side effect.
        """
        with open(xml_filename, 'w') as f:
            f.write("<tokens>\n")
            while self.has_more_tokens():
                f.write(self.xml_row())
                self.advance()
            f.write("</tokens>\n")