commit 8eaddccf913f02b26586d6fd42fdbe527c719bc0
Author: Zev Averbach
Date:   Sat Jul 26 14:25:29 2025 +0200

    first

diff --git a/TODO.md b/TODO.md
new file mode 100644
index 0000000..f84efeb
--- /dev/null
+++ b/TODO.md
@@ -0,0 +1,69 @@
+# write a compiler driver - ✅
+
+## Options
+
+- `--lex` -> run the lexer, but don't parse
+- `--parse` -> lex and parse, but don't generate assembly
+- `--codegen` -> lex, parse, and generate assembly, but don't emit code
+
+## Steps
+
+### Preprocess
+
+```bash
+> gcc -E -P <input_file> -o <preprocessed_file>.i
+```
+
+### Compile (stub for now)
+
+### Assemble and Link
+
+```bash
+> gcc <assembly_file>.s -o <output_file>
+```
+
+# write a lexer - ✅
+
+```python
+WORD_BOUNDARY = r"\b"
+IDENTIFIER = "IDENTIFIER"
+CONSTANT = "CONSTANT"
+KW_INT = "KW_INT"
+KW_VOID = "KW_VOID"
+KW_RETURN = "KW_RETURN"
+PAREN_OPEN = "PAREN_OPEN"
+PAREN_CLOSE = "PAREN_CLOSE"
+BRACE_OPEN = "BRACE_OPEN"
+BRACE_CLOSE = "BRACE_CLOSE"
+SEMICOLON = "SEMICOLON"
+
+REGEX_TABLE = (
+    (r"[a-zA-Z_]\w*\b", IDENTIFIER),
+    (r"[0-9]+\b", CONSTANT),
+    (r"int\b", KW_INT),
+    (r"void\b", KW_VOID),
+    (r"return\b", KW_RETURN),
+    (r"\(", PAREN_OPEN),
+    (r"\)", PAREN_CLOSE),
+    (r"{", BRACE_OPEN),
+    (r"}", BRACE_CLOSE),
+    (r";", SEMICOLON),
+)
+```
+
+```pseudocode
+while input isn't empty:
+    if input starts with whitespace:
+        trim whitespace from start of input
+    else:
+        find longest match at start of input for any regex in REGEX_TABLE
+        if no match is found, raise an error
+        convert matching substring into a token
+        remove matching substring from start of input
+```
+
+We must return a list of tokens. Some carry values (identifiers, constants); others do not (keywords, PAREN_OPEN, SEMICOLON).
+
+# write a parser

diff --git a/compiler_driver.py b/compiler_driver.py
new file mode 100644
index 0000000..aef3ab0
--- /dev/null
+++ b/compiler_driver.py
@@ -0,0 +1,76 @@
+from pathlib import Path
+import subprocess
+import sys
+from typing import Sequence
+
+from lexer import Token, TokenWithValue, lex as _lex
+
+
+def preprocess(fn: str) -> str:
+    # -E stops gcc after preprocessing; -P omits linemarkers from the output
+    output_fn = fn + ".i"
+    subprocess.run(f"gcc -E -P {fn} -o {output_fn}", shell=True)
+    return output_fn
+
+
+def lex(fn: str) -> Sequence[Token | TokenWithValue]:
+    return _lex(Path(fn).read_text())
+
+
+def parse(tokens: Sequence[Token | TokenWithValue]):
+    pass
+
+
+def generate_assembly():
+    pass
+
+
+def emit_code(fn):
+    # assemble and link; gcc treats the .s extension as assembler input
+    subprocess.run(f"gcc {fn} -o {fn.split('.')[0]}", shell=True)
+    # clean up the intermediate assembly file
+    Path(fn).unlink()
+
+
+FLAGS = "--lex, --parse, --codegen".split(", ")
+
+
+def main():
+    args = sys.argv
+    len_args = len(args)
+    skip_parse, skip_codegen, skip_emit = False, False, False
+    if len_args < 2:
+        print("please provide an input filename")
+        sys.exit(1)
+    elif len_args > 3:
+        print("provide one of --lex, --parse, --codegen, or no flags")
+        sys.exit(1)
+    elif len_args == 3:
+        flag = args[2].strip()
+        if flag not in FLAGS:
+            print("provide one of --lex, --parse, --codegen, or no flags")
+            sys.exit(1)
+        match flag:
+            case "--lex":
+                skip_parse = True
+                skip_codegen = True
+                skip_emit = True
+            case "--parse":
+                skip_codegen = True
+                skip_emit = True
+            case "--codegen":
+                skip_emit = True
+    fn = args[1]
+    print("preprocessing")
+    fni = preprocess(fn)
+    print("lexing")
+    tokens = lex(fni)
+    if not skip_parse:
+        print("parsing")
+        parse(tokens)
+    if not skip_codegen:
+        print("generating assembly (codegen)")
+        generate_assembly()
+    if not skip_emit:
+        # generate_assembly() is a stub for now; once it writes a .s file,
+        # that path is what belongs here
+        emit_code(fn.split(".")[0] + ".s")
+
+
+if __name__ == "__main__":
+    main()
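As a usage sketch for the driver above (a hypothetical `main.c`; the flag handling is as in `main()`):

```bash
python compiler_driver.py main.c --lex    # preprocess and lex, then stop
python compiler_driver.py main.c --parse  # lex and parse, then stop
python compiler_driver.py main.c          # no flag: run the full (stubbed) pipeline
```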
diff --git a/lexer.py b/lexer.py
new file mode 100644
index 0000000..fb2f5f7
--- /dev/null
+++ b/lexer.py
@@ -0,0 +1,180 @@
+import re
+from typing import Sequence
+
+
+class Token:
+    def __eq__(self, other) -> bool:
+        return self.__class__ == other.__class__
+
+    def __repr__(self):
+        return self.__class__.__name__
+
+
+class TokenWithValue:
+    value: str | int
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}({self.value})"
+
+    def __eq__(self, other) -> bool:
+        return self.__class__ == other.__class__ and self.value == other.value
+
+
+class Identifier(TokenWithValue):
+    def __init__(self, value: str):
+        self.value = value
+
+
+class Constant(TokenWithValue):
+    def __init__(self, value: int | str):
+        self.value = int(value)
+
+
+class KeywordInt(Token):
+    pass
+
+
+class KeywordVoid(Token):
+    pass
+
+
+class KeywordReturn(Token):
+    pass
+
+
+class ParenOpen(Token):
+    pass
+
+
+class ParenClose(Token):
+    pass
+
+
+class BraceOpen(Token):
+    pass
+
+
+class BraceClose(Token):
+    pass
+
+
+class Semicolon(Token):
+    pass
+
+
+EXPECTED = [
+    KeywordInt(),
+    Identifier("main"),
+    ParenOpen(),
+    KeywordVoid(),
+    ParenClose(),
+    BraceOpen(),
+    KeywordReturn(),
+    Constant(2),
+    Semicolon(),
+    BraceClose(),
+]
+
+IDENTIFIER = r"[a-zA-Z_]\w*\b"
+CONSTANT = r"[0-9]+\b"
+KW_INT = r"int\b"
+KW_VOID = r"void\b"
+KW_RETURN = r"return\b"
+PAREN_OPEN = r"\("
+PAREN_CLOSE = r"\)"
+BRACE_OPEN = r"{"
+BRACE_CLOSE = r"}"
+SEMICOLON = r";"
+REGEX_TABLE = (
+    ("IDENTIFIER", IDENTIFIER),
+    ("CONSTANT", CONSTANT),
+    ("KW_INT", KW_INT),
+    ("KW_VOID", KW_VOID),
+    ("KW_RETURN", KW_RETURN),
+    ("PAREN_OPEN", PAREN_OPEN),
+    ("PAREN_CLOSE", PAREN_CLOSE),
+    ("BRACE_OPEN", BRACE_OPEN),
+    ("BRACE_CLOSE", BRACE_CLOSE),
+    ("SEMICOLON", SEMICOLON),
+)
+
+KWS = {
+    "KW_INT": KeywordInt,
+    "KW_VOID": KeywordVoid,
+    "KW_RETURN": KeywordReturn,
+}
+
+TOK_NAME_TO_CLASS = KWS | {
+    "IDENTIFIER": Identifier,
+    "CONSTANT": Constant,
+    "PAREN_OPEN": ParenOpen,
+    "PAREN_CLOSE": ParenClose,
+    "BRACE_OPEN": BraceOpen,
+    "BRACE_CLOSE": BraceClose,
+    "SEMICOLON": Semicolon,
+}
+
+KEYWORDS = ("return", "void", "int")
+
+INPUT = """int main(void) {
+    return 2;
+}
+"""
+
+
+def lex(program_str: str) -> Sequence[Token | TokenWithValue]:
+    tokens = []
+    idx = 0
+
+    # while input isn't empty:
+    while program_str[idx:]:
+        # if input starts with whitespace:
+        if program_str[idx].isspace():
+            # trim whitespace from start of input
+            idx += 1
+            continue
+
+        all_matches = []
+
+        for name, regex in REGEX_TABLE:
+            match = re.match(regex, program_str[idx:])
+            if match:
+                all_matches.append((name, match[0]))
+
+        if not all_matches:
+            # if no match is found, raise an error
+            raise SyntaxError(program_str[idx:])
+
+        # find longest match at start of input for any regex in REGEX_TABLE
+        longest_match = None
+        for kw, val in all_matches:
+            if kw.startswith("KW_"):
+                # keywords take precedence over identifiers
+                longest_match = kw, val
+                break
+
+            if longest_match is None or len(val) > len(longest_match[1]):
+                longest_match = kw, val
+
+        kw, val = longest_match  # type: ignore
+        # convert matching substring into a token
+        klass = TOK_NAME_TO_CLASS[kw]
+        if issubclass(klass, Token):
+            tok = klass()
+        else:
+            tok = klass(val)
+
+        tokens.append(tok)
+        # remove matching substring from start of input
+        idx += len(val)
+
+    return tokens
+
+
+if __name__ == "__main__":
+    res = lex(INPUT)
+    assert res == EXPECTED, str(res) + "\n\nEXPECTED:\n" + str(EXPECTED)
+    print("success")
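Beyond the built-in `__main__` assertion, here is a minimal sanity check of the lexer, assuming `lexer.py` above is importable; the inputs are throwaway snippets chosen to exercise the keyword-precedence and word-boundary rules:

```python
from lexer import lex, Constant, Identifier, KeywordInt, KeywordReturn, Semicolon

# "return" matches both the IDENTIFIER and KW_RETURN regexes at the same
# length; the keyword-precedence branch picks KeywordReturn
assert lex("return 2;") == [KeywordReturn(), Constant(2), Semicolon()]

# "returnx" fails KW_RETURN's trailing \b, so it lexes as a plain Identifier
assert lex("int returnx;") == [KeywordInt(), Identifier("returnx"), Semicolon()]

print("ok")
```

The trailing `\b` in each keyword regex is what keeps `returnx` from half-matching as the keyword `return` followed by a stray `x`.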