Break up the file a bit, haven't committed compilation stuff yet

master
Nemo 2 years ago
parent cece143368
commit c71dd21128
  1. 1
      .gitignore
  2. 6
      NOTES.md
  3. 5
      compiler/README.md
  4. 158
      compiler/__init__.py
  5. 48
      compiler/keywords.py
  6. 182
      compiler/tokenizer.py

1
.gitignore vendored

@ -1 +1,2 @@
tools/
**/__pycache__/

@ -105,3 +105,9 @@ I can definitely solve it, but I want to do it properly. I've also realized why
2. PHP's language documentation is aimed at users, while Python throws so much useless stuff at you. I have yet to find a language documentation that rivals PHP, to be fair - but Python gets so much wrong. Searching for "condition" on Python docs gets you: a page on something called Condition Objects, Conditional Expressions, and "More on conditions" - none of which actually details what the conditional statements are and how they work. Look at the [control structures](https://www.php.net/manual/en/language.control-structures.php) page on the PHP website instead. Python docs also like talking about language implementation details too much. For example, BNF notation is peppered throughout the docs. PHP on the other hand uses only one language in its docs - PHP.
3. Lack of examples in documentation. You're left to figure out so many things. PHP gets this correct, for every function in the standard library. If examples are missing, the comments will usually have them.
4. Static Typing
## Compilation Engine
I'm hard-coding stuff a lot, with a lot of asserts
It would be nice, once I have some structure in place, to actually generate the rules from the GRAMMAR.

@ -251,7 +251,7 @@ let do if else while return
#### `symbol`
`{}()[].m;+-*/&|<>=~`
`{}()[].;+-*/&|<>=~`
#### `integerConstant`
@ -296,6 +296,7 @@ int | char | boolean | className
(void | type)
subRoutineName
( parameterList )
subroutineBody
```
#### `parameterList`
@ -367,5 +368,3 @@ TODO
|keyword,symbol,integerConstant,stringConstant,identifier|class,classVarDec,subroutineDec,parameterList,subroutineBody,varDec|
||statements,whileStatement,ifStatement,returnStatement,letStatement,doStatement|
||expression,term,expressionList|

@ -1,158 +1,8 @@
from enum import Enum
import re
from tokenizer import JackTokenizer
import sys
from html import escape
class Token(Enum):
    """Lexical categories a Jack token can belong to."""
    KEYWORD = 1
    SYMBOL = 2
    IDENTIFIER = 3
    INTEGERCONSTANT = 4
    STRINGCONSTANT = 5
    # Fallback for tokens matching no known category; never returned by
    # JackTokenizer.tokenType (its else-branch returns IDENTIFIER).
    UNKNOWN = 6
class Keyword(Enum):
    """The 21 reserved keywords of the Jack language, one member each."""
    CLASS = 1
    METHOD = 2
    FUNCTION = 3
    CONSTRUCTOR = 4
    INT = 5
    BOOLEAN = 6
    CHAR = 7
    VOID = 8
    VAR = 9
    STATIC = 10
    FIELD = 11
    LET = 12
    DO = 13
    IF = 14
    ELSE = 15
    WHILE = 16
    RETURN = 17
    TRUE = 18
    FALSE = 19
    NULL = 20
    THIS = 21
class JackAnalyzer:
    """Top-level driver for the Jack analyzer.

    Placeholder: not yet implemented.
    """
    def __init__(self):
        pass
class JackTokenizer:
    """Tokenizes a .jack source file into a flat list of Jack tokens.

    The whole file is tokenized eagerly in __init__; callers then walk the
    token stream with current_token() / advance() / has_more_tokens().
    """

    # All 21 Jack keywords; frozenset gives O(1) membership tests.
    _KEYWORDS = frozenset([
        'class', 'constructor', 'function', 'method', 'field', 'static',
        'var', 'int', 'char', 'boolean', 'void', 'true', 'false', 'null',
        'this', 'let', 'do', 'if', 'else', 'while', 'return',
    ])
    # Compiled once at class level instead of on every call; raw strings
    # avoid invalid-escape warnings for \d etc.
    _SYMBOL_RE = re.compile(r"[{}()\[\].,;+\-*/&|<>=~]")
    _INT_RE = re.compile(r"\d+")
    _STRING_RE = re.compile(r"\".*\"")
    # Capturing group so re.split keeps the matched tokens. Identifiers and
    # keywords share one alternative and are classified later by tokenType;
    # this also fixes the old bug where an identifier with a keyword prefix
    # (e.g. "classy") was split into "class" + "y".
    _TOKEN_RE = re.compile(r"([{}()\[\].,;+\-*/&|<>=~]|[a-zA-Z_]\w*|\".*\")")

    def __init__(self, filename, print_xml=False):
        """Tokenize *filename*; optionally write the <filename>T.xml dump.

        :param filename: path to the .jack source file
        :param print_xml: when True, immediately emit the token XML file
        """
        self.ptr = 0
        self.insideMultiLineComment = False
        self.tokens = []
        # Context manager fixes the old file-handle leak (file was opened
        # and never closed).
        with open(filename, 'r') as f:
            for line in f:
                self.tokens += self.parse_line(line)
        if print_xml:
            self.print_xml(self.xml_file(filename))

    def tokenType(self):
        """Return the Token category of the current token."""
        t = self.current_token()
        if t in JackTokenizer._KEYWORDS:
            return Token.KEYWORD
        elif JackTokenizer._SYMBOL_RE.match(t):
            return Token.SYMBOL
        elif JackTokenizer._INT_RE.match(t):
            return Token.INTEGERCONSTANT
        elif JackTokenizer._STRING_RE.match(t):
            return Token.STRINGCONSTANT
        else:
            return Token.IDENTIFIER

    def printable_token(self):
        """Return the current token as XML-safe text.

        String constants lose their surrounding double quotes; everything
        else is HTML-escaped (quotes included) for the XML output.
        """
        if self.tokenType() == Token.STRINGCONSTANT:
            return self.current_token()[1:-1]
        else:
            return escape(self.current_token(), True)

    def symbol(self):
        """Return the symbol character which is the current token."""
        if self.tokenType() != Token.SYMBOL:
            raise RuntimeError("Should only be called when tokenType is SYMBOL")
        # Bug fix: the original validated the type but never returned the token.
        return self.current_token()

    def identifier(self):
        """Return the identifier which is the current token."""
        if self.tokenType() != Token.IDENTIFIER:
            raise RuntimeError("Should only be called when tokenType is IDENTIFIER")
        # Bug fix: the original validated the type but never returned the token.
        return self.current_token()

    def intVal(self):
        """Return the integer value of the current token."""
        if self.tokenType() != Token.INTEGERCONSTANT:
            raise RuntimeError("Should only be called when tokenType is INTEGERCONSTANT")
        # Bug fix: self.token never existed; use the current token instead.
        return int(self.current_token())

    def parse_line(self, line):
        """Return the list of tokens on *line*, handling // and /* */ comments.

        Multi-line comment state is carried across calls in
        self.insideMultiLineComment.
        """
        line = line.strip()
        # If this line has a single-line comment anywhere,
        # strip the line from the start of //.
        if line.find("//") != -1:
            line = line[:line.find("//")].strip()
        if self.insideMultiLineComment:
            if line.find("*/") == -1:
                # The comment doesn't end on this line.
                return []
            else:
                self.insideMultiLineComment = False
                # Bug fix: the original kept the text BEFORE */ (which is
                # still comment) and dropped the code after it; keep the
                # code that follows the comment close instead.
                line = line[line.find("*/") + 2:].strip()
        # Same for a multi-line comment opening, but this time
        # also set insideMultiLineComment = True.
        elif line.find("/*") != -1:
            if line.find("*/") != -1:
                # The comment ends on the same line.
                # TODO: this also breaks on /* inside strings :(
                # TODO: this also breaks on multiple multi-line comments on the same line
                line = (line[:line.find("/*")] + line[line.find("*/") + 2:]).strip()
            else:
                line = line[:line.find("/*")].strip()
                self.insideMultiLineComment = True
        # We don't need no empty lines.
        if len(line) == 0:
            return []
        # The regex captures, in order:
        # 1. Symbols
        # 2. Identifiers and keywords (classified later by tokenType)
        # 3. String constants
        return [e.strip() for e in JackTokenizer._TOKEN_RE.split(line)
                if e is not None and e.strip() != '']

    def has_more_tokens(self):
        """True while the pointer has not run past the token list."""
        return self.ptr < len(self.tokens)

    def current_token(self):
        """Return the token at the current pointer position."""
        return self.tokens[self.ptr]

    def advance(self):
        """Move the pointer to the next token."""
        self.ptr += 1

    def xml_file(self, jack_file):
        """Return the token-XML filename for *jack_file* (appends "T.xml")."""
        return jack_file + "T.xml"

    def print_xml(self, xml_filename):
        """Write every remaining token as one XML row to *xml_filename*.

        Consumes the token stream: the pointer is at the end afterwards.
        """
        with open(xml_filename, 'w') as f:
            f.write("<tokens>\n")
            while self.has_more_tokens():
                f.write("<{type}> {value} </{type}>\n".format(
                    type=self.tokenType().name.lower(),
                    value=self.printable_token()))
                self.advance()
            f.write("</tokens>\n")
class CompilationEngine:
    """Recursive-descent parser over the token stream.

    Placeholder: not yet implemented (see the commented-out calls in main).
    """
    def __init__(self):
        pass
if __name__ == '__main__':
    # Tokenize the .jack file given on the command line and emit its
    # <file>T.xml token listing.
    # Bug fix: the tokenizer was constructed twice with identical arguments,
    # tokenizing the file and writing the XML dump twice; once is enough.
    jt = JackTokenizer(sys.argv[1], True)
    # c = CompilationEngine(sys.argv[1])
    # c.CompileClass()

@ -0,0 +1,48 @@
from enum import Enum,Flag,auto
class Keyword(Flag):
    """One member per terminal the tokenizer can classify.

    NOTE(review): despite the name, this Flag covers symbols and the generic
    token categories (IDENTIFIER, INTEGERCONSTANT, STRINGCONSTANT) as well as
    the 21 language keywords. Flag (rather than Enum) was presumably chosen
    so members can be OR-ed into acceptance sets - confirm against the
    compilation engine's usage.
    """
    # The 21 Jack language keywords.
    CLASS = auto()
    METHOD = auto()
    FUNCTION = auto()
    CONSTRUCTOR = auto()
    INT = auto()
    BOOLEAN = auto()
    CHAR = auto()
    VOID = auto()
    VAR = auto()
    STATIC = auto()
    FIELD = auto()
    LET = auto()
    DO = auto()
    IF = auto()
    ELSE = auto()
    WHILE = auto()
    RETURN = auto()
    TRUE = auto()
    FALSE = auto()
    NULL = auto()
    THIS = auto()
    # Symbols start here.
    BRACE_OPEN = auto()
    BRACE_CLOSE = auto()
    # NOTE(review): "PARAN" is a misspelling of "PAREN"; renaming would break
    # JackTokenizer.SYMBOL_MAP in tokenizer.py, so it is left as-is.
    PARAN_OPEN = auto()
    PARAN_CLOSE = auto()
    SQUARE_OPEN = auto()
    SQUARE_CLOSE = auto()
    DOT = auto()
    SEMICOLON = auto()
    PLUS = auto()
    MINUS = auto()
    MUL = auto()
    DIV = auto()
    AND = auto()
    OR = auto()
    LT = auto()
    GT = auto()
    EQ = auto()
    NOT = auto()
    COMMA = auto()
    # Other token categories (not keywords or symbols).
    IDENTIFIER = auto()
    INTEGERCONSTANT = auto()
    STRINGCONSTANT = auto()

@ -0,0 +1,182 @@
import re
from keywords import Keyword
from html import escape
from enum import Enum
# Superclass in some sense
# Superclass in some sense
class Token(Enum):
    """Coarse token categories, used to test a token against a whole class
    of Keyword members (any keyword / any symbol) rather than a single one.
    """
    KEYWORD = 1
    SYMBOL = 2
class JackTokenizer:
    """Tokenizes a .jack source file into a flat list of Jack tokens.

    The whole file is tokenized eagerly in __init__; callers then walk the
    token stream with current_token() / advance() / has_more_tokens().
    Token classification returns Keyword members (see keywords.py).
    """

    # Maps each one-character Jack symbol to its Keyword member.
    SYMBOL_MAP = {
        '{': Keyword.BRACE_OPEN,
        '}': Keyword.BRACE_CLOSE,
        '(': Keyword.PARAN_OPEN,
        ')': Keyword.PARAN_CLOSE,
        '[': Keyword.SQUARE_OPEN,
        ']': Keyword.SQUARE_CLOSE,
        '.': Keyword.DOT,
        ';': Keyword.SEMICOLON,
        '+': Keyword.PLUS,
        '-': Keyword.MINUS,
        '*': Keyword.MUL,
        '/': Keyword.DIV,
        '&': Keyword.AND,
        '|': Keyword.OR,
        '<': Keyword.LT,
        '>': Keyword.GT,
        '=': Keyword.EQ,
        '~': Keyword.NOT,
        ',': Keyword.COMMA,
    }
    # Maps each reserved word to its Keyword member.
    KEYWORD_MAP = {
        "class": Keyword.CLASS,
        "method": Keyword.METHOD,
        "function": Keyword.FUNCTION,
        "constructor": Keyword.CONSTRUCTOR,
        "int": Keyword.INT,
        "boolean": Keyword.BOOLEAN,
        "char": Keyword.CHAR,
        "void": Keyword.VOID,
        "var": Keyword.VAR,
        "static": Keyword.STATIC,
        "field": Keyword.FIELD,
        "let": Keyword.LET,
        "do": Keyword.DO,
        "if": Keyword.IF,
        "else": Keyword.ELSE,
        "while": Keyword.WHILE,
        "return": Keyword.RETURN,
        "true": Keyword.TRUE,
        "false": Keyword.FALSE,
        "null": Keyword.NULL,
        "this": Keyword.THIS,
    }

    # Compiled once at class level instead of on every call; raw strings
    # avoid invalid-escape warnings for \d etc.
    _INT_RE = re.compile(r"\d+")
    _STRING_RE = re.compile(r"\".*\"")
    # Capturing group so re.split keeps the matched tokens. Identifiers and
    # keywords share one alternative and are classified later by tokenType;
    # this also fixes the old bug where an identifier with a keyword prefix
    # (e.g. "classy") was split into "class" + "y".
    _TOKEN_RE = re.compile(r"([{}()\[\].,;+\-*/&|<>=~]|[a-zA-Z_]\w*|\".*\")")

    def __init__(self, filename, print_xml=False):
        """Tokenize *filename*; optionally write the <filename>T.xml dump.

        :param filename: path to the .jack source file
        :param print_xml: when True, immediately emit the token XML file
        """
        self.ptr = 0
        self.insideMultiLineComment = False
        self.tokens = []
        # Context manager fixes the old file-handle leak (file was opened
        # and never closed).
        with open(filename, 'r') as f:
            for line in f:
                self.tokens += self.parse_line(line)
        if print_xml:
            self.print_xml(self.xml_file(filename))

    def tokenType(self):
        """Return the Keyword member classifying the current token."""
        t = self.current_token()
        # Dict membership replaces the old duplicated keyword list and the
        # per-call symbol regex; the dicts are the single source of truth.
        if t in JackTokenizer.KEYWORD_MAP:
            return JackTokenizer.KEYWORD_MAP[t]
        elif t in JackTokenizer.SYMBOL_MAP:
            return JackTokenizer.SYMBOL_MAP[t]
        elif JackTokenizer._INT_RE.match(t):
            return Keyword.INTEGERCONSTANT
        elif JackTokenizer._STRING_RE.match(t):
            return Keyword.STRINGCONSTANT
        else:
            # TODO: Put an assert to ensure valid identifier
            return Keyword.IDENTIFIER

    def printable_token(self):
        """Return the current token as XML-safe text.

        String constants lose their surrounding double quotes; everything
        else is HTML-escaped (quotes included) for the XML output.
        """
        if self.tokenType() == Keyword.STRINGCONSTANT:
            return self.current_token()[1:-1]
        else:
            return escape(self.current_token(), True)

    def assert_type(self, t):
        """Assert the current token matches *t*.

        *t* may be Token.SYMBOL / Token.KEYWORD (any symbol / any keyword)
        or a specific Keyword member.
        """
        if t == Token.SYMBOL:
            # Bug fix: bare SYMBOL_MAP/KEYWORD_MAP raised NameError inside a
            # method body; they must be qualified with the class.
            assert self.tokenType() in JackTokenizer.SYMBOL_MAP.values()
        elif t == Token.KEYWORD:
            assert self.tokenType() in JackTokenizer.KEYWORD_MAP.values()
        else:
            assert self.tokenType() == t

    def symbol(self):
        """Return the symbol character which is the current token."""
        self.assert_type(Token.SYMBOL)
        return self.current_token()

    def identifier(self):
        """Return the identifier which is the current token."""
        self.assert_type(Token.IDENTIFIER)
        return self.current_token()

    def intVal(self):
        """Return the integer value of the current token."""
        self.assert_type(Keyword.INTEGERCONSTANT)
        # Bug fix: self.token never existed; use the current token instead.
        return int(self.current_token())

    def parse_line(self, line):
        """Return the list of tokens on *line*, handling // and /* */ comments.

        Multi-line comment state is carried across calls in
        self.insideMultiLineComment.
        """
        line = line.strip()
        # If this line has a single-line comment anywhere,
        # strip the line from the start of //.
        if line.find("//") != -1:
            line = line[:line.find("//")].strip()
        if self.insideMultiLineComment:
            if line.find("*/") == -1:
                # The comment doesn't end on this line.
                return []
            else:
                self.insideMultiLineComment = False
                # Bug fix: the original kept the text BEFORE */ (which is
                # still comment) and dropped the code after it; keep the
                # code that follows the comment close instead.
                line = line[line.find("*/") + 2:].strip()
        # Same for a multi-line comment opening, but this time
        # also set insideMultiLineComment = True.
        elif line.find("/*") != -1:
            if line.find("*/") != -1:
                # The comment ends on the same line.
                # TODO: this also breaks on /* inside strings :(
                # TODO: this also breaks on multiple multi-line comments on the same line
                line = (line[:line.find("/*")] + line[line.find("*/") + 2:]).strip()
            else:
                line = line[:line.find("/*")].strip()
                self.insideMultiLineComment = True
        # We don't need no empty lines.
        if len(line) == 0:
            return []
        # The regex captures, in order:
        # 1. Symbols
        # 2. Identifiers and keywords (classified later by tokenType)
        # 3. String constants
        return [e.strip() for e in JackTokenizer._TOKEN_RE.split(line)
                if e is not None and e.strip() != '']

    def has_more_tokens(self):
        """True while the pointer has not run past the token list."""
        return self.ptr < len(self.tokens)

    def current_token(self):
        """Return the token at the current pointer position."""
        return self.tokens[self.ptr]

    def advance(self):
        """Move the pointer to the next token."""
        self.ptr += 1

    def xml_file(self, jack_file):
        """Return the token-XML filename for *jack_file* (appends "T.xml")."""
        return jack_file + "T.xml"

    def xml_row(self):
        """Return a single row of XML for the Compilation Engine.

        Symbols and keywords are collapsed back to the generic 'symbol' /
        'keyword' tags expected by the XML format.
        """
        t = self.tokenType()
        if t in JackTokenizer.SYMBOL_MAP.values():
            t = 'symbol'
        elif t in JackTokenizer.KEYWORD_MAP.values():
            t = 'keyword'
        else:
            t = t.name.lower()
        return "<{type}> {value} </{type}>\n".format(type=t, value=self.printable_token())

    def print_xml(self, xml_filename):
        """Write every remaining token as one XML row to *xml_filename*.

        Consumes the token stream: the pointer is at the end afterwards.
        """
        with open(xml_filename, 'w') as f:
            f.write("<tokens>\n")
            while self.has_more_tokens():
                f.write(self.xml_row())
                self.advance()
            f.write("</tokens>\n")
Loading…
Cancel
Save