183 lines
5.4 KiB
Python
183 lines
5.4 KiB
Python
import re
|
|
from keywords import Keyword
|
|
from html import escape
|
|
from enum import Enum
|
|
# Superclass in some sense
|
|
class Token(Enum):
|
|
KEYWORD = 1
|
|
SYMBOL = 2
|
|
|
|
class JackTokenizer:
    """Splits a .jack source file into a stream of tokens.

    Comments (``//`` and ``/* */``) are stripped while reading.  Tokens
    are consumed one at a time via ``current_token()`` / ``advance()``,
    and ``tokenType()`` classifies the current token as a ``Keyword``
    enum member.  ``print_xml`` dumps the whole stream in the course's
    ``<tokens>`` XML format.
    """

    # Jack symbol characters -> Keyword members.
    SYMBOL_MAP = {
        '{': Keyword.BRACE_OPEN,
        '}': Keyword.BRACE_CLOSE,
        '(': Keyword.PARAN_OPEN,
        ')': Keyword.PARAN_CLOSE,
        '[': Keyword.SQUARE_OPEN,
        ']': Keyword.SQUARE_CLOSE,
        '.': Keyword.DOT,
        ';': Keyword.SEMICOLON,
        '+': Keyword.PLUS,
        '-': Keyword.MINUS,
        '*': Keyword.MUL,
        '/': Keyword.DIV,
        '&': Keyword.AND,
        '|': Keyword.OR,
        '<': Keyword.LT,
        '>': Keyword.GT,
        '=': Keyword.EQ,
        '~': Keyword.NOT,
        ',': Keyword.COMMA,
    }

    # Jack reserved words -> Keyword members.
    KEYWORD_MAP = {
        "class": Keyword.CLASS,
        "method": Keyword.METHOD,
        "function": Keyword.FUNCTION,
        "constructor": Keyword.CONSTRUCTOR,
        "int": Keyword.INT,
        "boolean": Keyword.BOOLEAN,
        "char": Keyword.CHAR,
        "void": Keyword.VOID,
        "var": Keyword.VAR,
        "static": Keyword.STATIC,
        "field": Keyword.FIELD,
        "let": Keyword.LET,
        "do": Keyword.DO,
        "if": Keyword.IF,
        "else": Keyword.ELSE,
        "while": Keyword.WHILE,
        "return": Keyword.RETURN,
        "true": Keyword.TRUE,
        "false": Keyword.FALSE,
        "null": Keyword.NULL,
        "this": Keyword.THIS,
    }

    # Patterns compiled once at class-creation time; the originals were
    # re-compiled on every tokenType()/parse_line() call.  Raw strings
    # avoid invalid-escape DeprecationWarnings for \d, \(, etc.
    _SYMBOL_RE = re.compile(r"[()\[\],+\-;<>=~&{}*/|.]")
    _INT_RE = re.compile(r"\d+")
    _STRING_RE = re.compile(r"\".*\"")
    # Tokenizing pattern, tried in order: keywords, symbols, identifiers,
    # string constants.  Keywords are word-bounded (BUG FIX: previously
    # "classes" split into "class" + "es" and "int3" into "int" + "3";
    # both now tokenize as single identifiers).
    # TODO: still breaks on comment markers or symbols inside strings.
    _TOKEN_RE = re.compile(
        r"(\b(?:class|constructor|function|method|field|static|var|int"
        r"|char|boolean|void|true|false|null|this|let|do|if|else|while"
        r"|return)\b"
        r"|[()\[\],+\-;<>=~&{}*/|.]"
        r"|[a-zA-Z_]\w*"
        r"|\".*\")"
    )

    def tokenType(self):
        """Return the Keyword member classifying the current token."""
        t = self.current_token()
        if t in JackTokenizer.KEYWORD_MAP:
            return JackTokenizer.KEYWORD_MAP[t]
        elif JackTokenizer._SYMBOL_RE.match(t):
            return JackTokenizer.SYMBOL_MAP[t]
        elif JackTokenizer._INT_RE.match(t):
            return Keyword.INTEGERCONSTANT
        elif JackTokenizer._STRING_RE.match(t):
            return Keyword.STRINGCONSTANT
        else:
            # TODO: assert that the token is a *valid* identifier.
            return Keyword.IDENTIFIER

    def printable_token(self):
        """Return the current token formatted for XML output.

        String constants lose their surrounding quotes; everything else
        is HTML-escaped (quotes included) for safe embedding in XML.
        """
        if self.tokenType() == Keyword.STRINGCONSTANT:
            return self.current_token()[1:-1]
        else:
            return escape(self.current_token(), True)

    def assert_type(self, t):
        """Assert the current token matches ``t``.

        ``t`` is either a coarse category (Token.SYMBOL / Token.KEYWORD)
        or a specific Keyword member.
        """
        # BUG FIX: the original referenced bare SYMBOL_MAP / KEYWORD_MAP,
        # which raised NameError whenever this branch executed.
        if t == Token.SYMBOL:
            assert self.tokenType() in JackTokenizer.SYMBOL_MAP.values()
        elif t == Token.KEYWORD:
            assert self.tokenType() in JackTokenizer.KEYWORD_MAP.values()
        else:
            assert self.tokenType() == t

    def symbol(self):
        """Return the symbol character which is the current token."""
        self.assert_type(Token.SYMBOL)
        return self.current_token()

    def identifier(self):
        """Return the identifier which is the current token."""
        # BUG FIX: was Token.IDENTIFIER, which does not exist on the
        # Token enum (AttributeError); identifiers are classified by
        # tokenType() as Keyword.IDENTIFIER.
        self.assert_type(Keyword.IDENTIFIER)
        return self.current_token()

    def intVal(self):
        """Return the integer value of the current token."""
        self.assert_type(Keyword.INTEGERCONSTANT)
        # BUG FIX: was int(self.token) — no such attribute is ever set.
        return int(self.current_token())

    def parse_line(self, line):
        """Return the list of tokens found on ``line``.

        Strips // comments and /* */ comments; multi-line comment state
        is carried across calls in ``self.insideMultiLineComment``.
        TODO: comment markers inside string constants are mis-handled,
        and only one block comment per line is supported.
        """
        line = line.strip()

        # If this line has a single-line comment anywhere,
        # cut the line at the start of //.
        if line.find("//") != -1:
            line = line[:line.find("//")].strip()

        if self.insideMultiLineComment:
            end = line.find("*/")
            if end == -1:
                # The comment doesn't end on this line.
                return []
            else:
                # Comment ends here, huzzah!
                self.insideMultiLineComment = False
                # BUG FIX: keep the code *after* the terminator; the
                # original kept line[:find("*/")] — the comment text —
                # and threw away any code following "*/".
                line = line[end + 2:].strip()

        # Same for a comment that *opens* on this line; if it doesn't
        # close here too, remember that we are inside one.
        elif line.find("/*") != -1:
            if line.find("*/") != -1:
                # Opens and closes on the same line: splice it out.
                line = (line[:line.find("/*")]
                        + line[line.find("*/") + 2:]).strip()
            else:
                line = line[:line.find("/*")].strip()
                self.insideMultiLineComment = True

        # We don't need no empty lines.
        if len(line) == 0:
            return []
        else:
            # re.split with a capturing group keeps the matched tokens;
            # unmatched gaps come back as None or whitespace and are
            # filtered out.
            return [e.strip()
                    for e in JackTokenizer._TOKEN_RE.split(line)
                    if e is not None and e.strip() != '']

    def has_more_tokens(self):
        """Return True while tokens remain to be consumed."""
        return self.ptr < len(self.tokens)

    def current_token(self):
        """Return the token at the read pointer."""
        return self.tokens[self.ptr]

    def advance(self):
        """Move the read pointer to the next token."""
        self.ptr += 1

    def __init__(self, filename, print_xml=False):
        """Tokenize ``filename``; optionally write <filename>T.xml."""
        self.ptr = 0
        self.insideMultiLineComment = False
        self.tokens = []
        # BUG FIX: the original kept an open handle (self.file) for the
        # object's whole lifetime; a context manager closes it promptly.
        with open(filename, 'r') as f:
            for line in f:
                self.tokens += self.parse_line(line)

        if print_xml:
            self.print_xml(self.xml_file(filename))

    def xml_file(self, jack_file):
        """Return the tokenizer-XML output path for ``jack_file``."""
        return jack_file + "T.xml"

    def xml_row(self):
        """Return a single row of XML for the Compilation Engine."""
        t = self.tokenType()
        if t in JackTokenizer.SYMBOL_MAP.values():
            t = 'symbol'
        elif t in JackTokenizer.KEYWORD_MAP.values():
            t = 'keyword'
        else:
            # e.g. INTEGERCONSTANT -> "integerconstant"
            t = t.name.lower()
        return "<{type}> {value} </{type}>\n".format(type=t, value=self.printable_token())

    def print_xml(self, xml_filename):
        """Write the whole token stream to ``xml_filename`` as XML."""
        with open(xml_filename, 'w') as f:
            f.write("<tokens>\n")
            while self.has_more_tokens():
                f.write(self.xml_row())
                self.advance()
            f.write("</tokens>\n")