mirror of https://github.com/captn3m0/nand2tetris
Break up the file a bit, haven't committed compilation stuff yet

parent cece143368
commit c71dd21128
.gitignore
@@ -1 +1,2 @@
tools/
**/__pycache__/
@@ -1,158 +1,8 @@
from enum import Enum
import re
from tokenizer import JackTokenizer
import sys
from html import escape


class Token(Enum):
    KEYWORD = 1
    SYMBOL = 2
    IDENTIFIER = 3
    INTEGERCONSTANT = 4
    STRINGCONSTANT = 5
    UNKNOWN = 6


class Keyword(Enum):
    CLASS = 1
    METHOD = 2
    FUNCTION = 3
    CONSTRUCTOR = 4
    INT = 5
    BOOLEAN = 6
    CHAR = 7
    VOID = 8
    VAR = 9
    STATIC = 10
    FIELD = 11
    LET = 12
    DO = 13
    IF = 14
    ELSE = 15
    WHILE = 16
    RETURN = 17
    TRUE = 18
    FALSE = 19
    NULL = 20
    THIS = 21


class JackAnalyzer:
    def __init__(self):
        pass


class JackTokenizer:
    """ Returns the type of the current token """
    def tokenType(self):
        t = self.current_token()
        if t in ['class', 'constructor', 'function', 'method', 'field', 'static', 'var', 'int', 'char', 'boolean', 'void', 'true', 'false', 'null', 'this', 'let', 'do', 'if', 'else', 'while', 'return']:
            return Token.KEYWORD
        elif re.compile(r"(\(|\)|\[|\]|,|\+|-|;|<|>|=|~|&|{|}|\*|\/|\||\.)").match(t):
            return Token.SYMBOL
        elif re.compile(r"\d+").match(t):
            return Token.INTEGERCONSTANT
        elif re.compile(r"\".*\"").match(t):
            return Token.STRINGCONSTANT
        else:
            return Token.IDENTIFIER

    def printable_token(self):
        if self.tokenType() == Token.STRINGCONSTANT:
            return self.current_token()[1:-1]
        else:
            return escape(self.current_token(), True)
""" Returns the character which is the current token """
|
||||
def symbol(self):
|
||||
if self.tokenType() != Token.SYMBOL:
|
||||
raise RuntimeError("Should only be called when tokenType is SYMBOL")
|
||||
|
||||
""" Returns the identifier which is the current token """
|
||||
def identifier(self):
|
||||
if self.tokenType() != Token.IDENTIFIER:
|
||||
raise RuntimeError("Should only be called when tokenType is IDENTIFIER")
|
||||
|
||||
""" Returns the integer value of the current token """
|
||||
def intVal(self):
|
||||
if self.tokenType() != Token.INTEGERCONSTANT:
|
||||
raise RuntimeError("Should only be called when tokenType is INTEGERCONSTANT")
|
||||
return int(self.token)
|
||||
|
||||
""" Returns a list of tokens for that line """
|
||||
def parse_line(self, line):
|
||||
line = line.strip()
|
||||
# If this line as a single line comment anywhere
|
||||
# strip the line to start of //
|
||||
if line.find("//") != -1:
|
||||
line = line[:line.find("//")].strip()
|
||||
|
||||
if self.insideMultiLineComment:
|
||||
if line.find("*/") == -1:
|
||||
# The comment doesn't end in this line
|
||||
return []
|
||||
else:
|
||||
self.insideMultiLineComment = False
|
||||
# comments ends here, huzzah!
|
||||
line = line[:line.find("*/")].strip()
|
||||
|
||||
# Same for the multi-line comment, but this time
|
||||
# Also set insideMultiLineComment = true
|
||||
elif line.find("/*") != -1:
|
||||
# The comment ends on the same line
|
||||
if line.find("*/") != -1:
|
||||
# TODO: this also breaks on /* inside strings :(
|
||||
# TODO: This also breaks on multiple multi-line comments on the same line
|
||||
line = line[:line.find("/*")] + line[line.find("*/") + 2:].strip()
|
||||
else:
|
||||
line = line[:line.find("/*")].strip()
|
||||
self.insideMultiLineComment = True
|
||||
|
||||
# We don't need no empty lines
|
||||
if len(line) == 0:
|
||||
return []
|
||||
else:
|
||||
# Regex contains 3 parts:
|
||||
# 1. Keywords
|
||||
# 2. Symbols
|
||||
# 3. Identifiers
|
||||
# 4. Strings
|
||||
regex = re.compile("(class|constructor|function|method|field|static|var|int|char|boolean|void|true|false|null|this|let|do|if|else|while|return|\(|\)|\[|\]|,|\+|-|;|<|>|=|~|&|{|}|\*|\/|\||\.|[a-zA-Z_]+\w*|\".*\")")
|
||||
return [e.strip() for e in regex.split(line) if e != None and e.strip()!='']
|
||||
|
||||

    def has_more_tokens(self):
        return self.ptr < len(self.tokens)

    def current_token(self):
        return self.tokens[self.ptr]

    def advance(self):
        self.ptr += 1

    def __init__(self, filename, print_xml=False):
        self.ptr = 0
        self.insideMultiLineComment = False
        self.tokens = []
        with open(filename, 'r') as f:
            for line in f:
                self.tokens += self.parse_line(line)

        if print_xml:
            self.print_xml(self.xml_file(filename))

    def xml_file(self, jack_file):
        return jack_file + "T.xml"

    def print_xml(self, xml_filename):
        with open(xml_filename, 'w') as f:
            f.write("<tokens>\n")
            while self.has_more_tokens():
                f.write("<{type}> {value} </{type}>\n".format(type=self.tokenType().name.lower(), value=self.printable_token()))
                self.advance()
            f.write("</tokens>\n")


class CompilationEngine:
    def __init__(self):
        pass


if __name__ == '__main__':
    jt = JackTokenizer(sys.argv[1], True)
    # c = CompilationEngine(sys.argv[1])
    # c.CompileClass()

keywords.py
@@ -0,0 +1,48 @@
from enum import Flag, auto


class Keyword(Flag):
    CLASS = auto()
    METHOD = auto()
    FUNCTION = auto()
    CONSTRUCTOR = auto()
    INT = auto()
    BOOLEAN = auto()
    CHAR = auto()
    VOID = auto()
    VAR = auto()
    STATIC = auto()
    FIELD = auto()
    LET = auto()
    DO = auto()
    IF = auto()
    ELSE = auto()
    WHILE = auto()
    RETURN = auto()
    TRUE = auto()
    FALSE = auto()
    NULL = auto()
    THIS = auto()
    # Symbols start here
    BRACE_OPEN = auto()
    BRACE_CLOSE = auto()
    PARAN_OPEN = auto()
    PARAN_CLOSE = auto()
    SQUARE_OPEN = auto()
    SQUARE_CLOSE = auto()
    DOT = auto()
    SEMICOLON = auto()
    PLUS = auto()
    MINUS = auto()
    MUL = auto()
    DIV = auto()
    AND = auto()
    OR = auto()
    LT = auto()
    GT = auto()
    EQ = auto()
    NOT = auto()
    COMMA = auto()
    # Other tokens
    IDENTIFIER = auto()
    INTEGERCONSTANT = auto()
    STRINGCONSTANT = auto()
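
Since Keyword subclasses Flag, its members can be OR-ed into masks. A hypothetical grouping (not part of this commit) sketches how later parser checks could collapse into a single bitwise test:

    from keywords import Keyword

    # Hypothetical masks, for illustration only.
    UNARY_OP = Keyword.MINUS | Keyword.NOT
    TYPE_KEYWORD = Keyword.INT | Keyword.CHAR | Keyword.BOOLEAN

    def is_unary_op(token):
        # A zero Flag is falsy, so this acts as a plain membership test.
        return bool(token & UNARY_OP)

    assert is_unary_op(Keyword.NOT)
    assert not is_unary_op(Keyword.PLUS)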

tokenizer.py
@@ -0,0 +1,182 @@
import re
from keywords import Keyword
from html import escape
from enum import Enum


# Superclass in some sense: coarse token categories used by assert_type
class Token(Enum):
    KEYWORD = 1
    SYMBOL = 2


class JackTokenizer:
    SYMBOL_MAP = {
        '{': Keyword.BRACE_OPEN,
        '}': Keyword.BRACE_CLOSE,
        '(': Keyword.PARAN_OPEN,
        ')': Keyword.PARAN_CLOSE,
        '[': Keyword.SQUARE_OPEN,
        ']': Keyword.SQUARE_CLOSE,
        '.': Keyword.DOT,
        ';': Keyword.SEMICOLON,
        '+': Keyword.PLUS,
        '-': Keyword.MINUS,
        '*': Keyword.MUL,
        '/': Keyword.DIV,
        '&': Keyword.AND,
        '|': Keyword.OR,
        '<': Keyword.LT,
        '>': Keyword.GT,
        '=': Keyword.EQ,
        '~': Keyword.NOT,
        ',': Keyword.COMMA,
    }

    KEYWORD_MAP = {
        "class": Keyword.CLASS,
        "method": Keyword.METHOD,
        "function": Keyword.FUNCTION,
        "constructor": Keyword.CONSTRUCTOR,
        "int": Keyword.INT,
        "boolean": Keyword.BOOLEAN,
        "char": Keyword.CHAR,
        "void": Keyword.VOID,
        "var": Keyword.VAR,
        "static": Keyword.STATIC,
        "field": Keyword.FIELD,
        "let": Keyword.LET,
        "do": Keyword.DO,
        "if": Keyword.IF,
        "else": Keyword.ELSE,
        "while": Keyword.WHILE,
        "return": Keyword.RETURN,
        "true": Keyword.TRUE,
        "false": Keyword.FALSE,
        "null": Keyword.NULL,
        "this": Keyword.THIS
    }
""" Returns the type of the current token """
|
||||
def tokenType(self):
|
||||
t = self.current_token()
|
||||
if t in ['class','constructor','function','method','field','static','var','int','char','boolean','void','true','false','null','this','let','do','if','else','while','return']:
|
||||
return JackTokenizer.KEYWORD_MAP[t]
|
||||
elif re.compile("(\(|\)|\[|\]|,|\+|-|;|<|>|=|~|&|{|}|\*|\/|\||\.)").match(t):
|
||||
return JackTokenizer.SYMBOL_MAP[t]
|
||||
elif re.compile("\d+").match(t):
|
||||
return Keyword.INTEGERCONSTANT
|
||||
elif re.compile("\".*\"").match(t):
|
||||
return Keyword.STRINGCONSTANT
|
||||
else:
|
||||
# TODO: Put an assert to ensure valid identifier
|
||||
return Keyword.IDENTIFIER
|
||||
pass
|
||||
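    # For example (illustrative): 'let' -> Keyword.LET, '(' -> Keyword.PARAN_OPEN,
    # '42' -> Keyword.INTEGERCONSTANT, '"hi"' -> Keyword.STRINGCONSTANT,
    # anything else -> Keyword.IDENTIFIER.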

    def printable_token(self):
        if self.tokenType() == Keyword.STRINGCONSTANT:
            return self.current_token()[1:-1]
        else:
            return escape(self.current_token(), True)

    def assert_type(self, t):
        if t == Token.SYMBOL:
            assert self.tokenType() in JackTokenizer.SYMBOL_MAP.values()
        elif t == Token.KEYWORD:
            assert self.tokenType() in JackTokenizer.KEYWORD_MAP.values()
        else:
            assert self.tokenType() == t
""" Returns the character which is the current token """
|
||||
def symbol(self):
|
||||
self.assert_type(Token.SYMBOL)
|
||||
return self.current_token()
|
||||
|
||||
""" Returns the identifier which is the current token """
|
||||
def identifier(self):
|
||||
self.assert_type(Token.IDENTIFIER)
|
||||
return self.current_token()
|
||||
|
||||
""" Returns the integer value of the current token """
|
||||
def intVal(self):
|
||||
self.assert_type(Keyword.INTEGERCONSTANT)
|
||||
return int(self.token)
|
||||
|
||||
""" Returns a list of tokens for that line """
|
||||
def parse_line(self, line):
|
||||
line = line.strip()
|
||||
# If this line as a single line comment anywhere
|
||||
# strip the line to start of //
|
||||
if line.find("//") != -1:
|
||||
line = line[:line.find("//")].strip()
|
||||
|
||||
if self.insideMultiLineComment:
|
||||
if line.find("*/") == -1:
|
||||
# The comment doesn't end in this line
|
||||
return []
|
||||
else:
|
||||
self.insideMultiLineComment = False
|
||||
# comments ends here, huzzah!
|
||||
line = line[:line.find("*/")].strip()
|
||||
|
||||
# Same for the multi-line comment, but this time
|
||||
# Also set insideMultiLineComment = true
|
||||
elif line.find("/*") != -1:
|
||||
# The comment ends on the same line
|
||||
if line.find("*/") != -1:
|
||||
# TODO: this also breaks on /* inside strings :(
|
||||
# TODO: This also breaks on multiple multi-line comments on the same line
|
||||
line = line[:line.find("/*")] + line[line.find("*/") + 2:].strip()
|
||||
else:
|
||||
line = line[:line.find("/*")].strip()
|
||||
self.insideMultiLineComment = True
|
||||
|
||||
# We don't need no empty lines
|
||||
if len(line) == 0:
|
||||
return []
|
||||
else:
|
||||
# Regex contains 3 parts:
|
||||
# 1. Keywords
|
||||
# 2. Symbols
|
||||
# 3. Identifiers
|
||||
# 4. Strings
|
||||
regex = re.compile("(class|constructor|function|method|field|static|var|int|char|boolean|void|true|false|null|this|let|do|if|else|while|return|\(|\)|\[|\]|,|\+|-|;|<|>|=|~|&|{|}|\*|\/|\||\.|[a-zA-Z_]+\w*|\".*\")")
|
||||
return [e.strip() for e in regex.split(line) if e != None and e.strip()!='']
|
||||
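    # Illustrative example: with insideMultiLineComment False,
    #   parse_line('let x = Math.max(a, 10); // larger')
    # returns ['let', 'x', '=', 'Math', '.', 'max', '(', 'a', ',', '10', ')', ';'].
    # Integers have no alternative in the regex; they survive as the
    # unmatched text between symbol matches that re.split() preserves.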

    def has_more_tokens(self):
        return self.ptr < len(self.tokens)

    def current_token(self):
        return self.tokens[self.ptr]

    def advance(self):
        self.ptr += 1

    def __init__(self, filename, print_xml=False):
        self.ptr = 0
        self.insideMultiLineComment = False
        self.tokens = []
        with open(filename, 'r') as f:
            for line in f:
                self.tokens += self.parse_line(line)

        if print_xml:
            self.print_xml(self.xml_file(filename))

    def xml_file(self, jack_file):
        return jack_file + "T.xml"
""" Returns a single row of XML for the Compilation Engine """
|
||||
def xml_row(self):
|
||||
t = self.tokenType()
|
||||
if t in JackTokenizer.SYMBOL_MAP.values():
|
||||
t = 'symbol'
|
||||
elif t in JackTokenizer.KEYWORD_MAP.values():
|
||||
t = 'keyword'
|
||||
else:
|
||||
t = t.name.lower()
|
||||
return "<{type}> {value} </{type}>\n".format(type=t, value=self.printable_token())
|
||||
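    # e.g. a '(' token renders as: <symbol> ( </symbol>
    # and a 'let' token as: <keyword> let </keyword>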

    def print_xml(self, xml_filename):
        with open(xml_filename, 'w') as f:
            f.write("<tokens>\n")
            while self.has_more_tokens():
                f.write(self.xml_row())
                self.advance()
            f.write("</tokens>\n")
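
A minimal driver sketch (the .jack path here is hypothetical). Note that print_xml=True consumes the token stream while writing <file>T.xml, so a fresh instance is used for manual iteration:

    from tokenizer import JackTokenizer

    # Writes Square/Main.jackT.xml as a side effect of construction.
    JackTokenizer("Square/Main.jack", print_xml=True)

    # Walk the token stream by hand with a second tokenizer.
    jt = JackTokenizer("Square/Main.jack")
    while jt.has_more_tokens():
        print(jt.tokenType(), jt.current_token())
        jt.advance()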