first

2025-07-26 14:25:29 +02:00
commit 8eaddccf91
3 changed files with 325 additions and 0 deletions
--- a/TODO.md
+++ b/TODO.md
@@ -0,0 +1,69 @@
 # write a compiler driver - ✅
 ## Options
 - `--lex`   -> run the lexer, don't parse
 - `--parse` -> lex and parse, don't generate assembly
 - `--codegen` -> lex, parse, generate assembly, but don't emit code
 ## Steps
 ### Preprocess
 ```bash
 > gcc -E -P <input-file> -o <preprocessed-file>.i
 ```
 ### Compile (stub for now)
 ### Assemble and Link
 ```bash
 > gcc <assembly-file>.s -o <output-file>
 ```
 # write a lexer - ✅
 ```python
 WORD_BOUNDARY = '\b'
 IDENTIFIER = "IDENTIFIER"
 CONSTANT = "CONSTANT"
 KW_INT = "KW_INT"
 KW_VOID = "KW_VOID"
 KW_RETURN = "KW_RETURN"
 PAREN_OPEN = "PAREN_OPEN"
 PAREN_CLOSE = "PARENT_CLOSE"
 BRACE_OPEN = "BRACE_OPEN"
 BRACE_CLOSE = "BRACE_CLOSE"
 SEMICOLON = "SEMICOLON"
 REGEX_TABLE = (
    (r"[a-zA-Z_]\w*\b", IDENTIFIER),
    (r"[0-9]+\b", CONSTANT),
    (r"int\b", KW_INT),
    (r"void\b", KW_VOID),
    (r"return\b", KW_RETURN),
    (r"\(", PAREN_OPEN),
    (r"\)", PAREN_CLOSE),
    (r"{", BRACE_OPEN),
    (r"}", BRACE_CLOSE),
    (r";", SEMICOLON),
 )
 ```
 ```pseudocode
 while input isn't empty:
  if input starts with whitespace:
    trim whitespace from start of input
  else:
    find longest match at start of input for any regex in REGEX_TABLE
    if no match is found, raise an error
    convert matching substring into a token
    remove matching substring from start of input
 ```
 We must return a list of tokens. Some have values (constants, keywords); others do not (PAREN_OPEN, SEMICOLON).
 # Write a parser
--- a/compiler_driver.py
+++ b/compiler_driver.py
@@ -0,0 +1,76 @@
 from pathlib import Path
 import subprocess
 import sys
 from typing import Sequence
 from lexer import Token, TokenWithValue, lex as _lex
 def preprocess(fn: str) -> str:
    output_fn = fn + ".i"
    subprocess.run(f"gcc -E -P {fn} -o {output_fn}", shell=True)
    return output_fn
 def lex(fn) -> Sequence[Token | TokenWithValue]:
    return _lex(Path(fn).read_text())
 def parse(tokens: list[Token | TokenWithValue]):
    pass
 def generate_assembly():
    pass
 def emit_code(fn):
    subprocess.run(f"gcc {fn} -o {fn.split('.')[0]}")
    Path(fn).unlink()
 FLAGS = "--lex, --parse, --codegen".split(", ")
 def main():
    args = sys.argv
    len_args = len(args)
    skip_parse, skip_codegen, skip_emit = False, False, False
    if len_args < 2:
        print("please provide an input filename")
        sys.exit(1)
    elif len_args > 3:
        print("provide one of --lex, --parse, --codegen, or no flags")
        sys.exit(1)
    elif len_args == 3:
        flag = args[2].strip()
        if flag not in FLAGS:
            print("provide one of --lex, --parse, --codegen, or no flags")
            sys.exit(1)
        match flag:
            case "--lex":
                skip_parse = True
                skip_codegen = True
                skip_emit = True
            case "--parse":
                skip_codegen = True
                skip_emit = True
            case "--codegen":
                skip_emit = True
    fn = args[1]
    print("preprocessing")
    fni = preprocess(fn)
    print("lexing")
    tokens = lex(fni)
    if not skip_parse:
        print("parsing")
        parse(tokens)
    if not skip_codegen:
        print("generating assembly (codegen)")
        generate_assembly()
    if not skip_emit:
        emit_code()
 if __name__ == "__main__":
    main()
--- a/lexer.py
+++ b/lexer.py
@@ -0,0 +1,180 @@
 import re
 from typing import Sequence
 class Token:
    pass
    def __eq__(self, other) -> bool:
        return self.__class__ == other.__class__
 class TokenWithValue:
    value: str | int
    def __repr__(self):
        return f"{self.__class__}({self.value})"
    def __eq__(self, other) -> bool:
        return self.__class__ == other.__class__ and self.value == other.value
 class Identifier(TokenWithValue):
    def __init__(self, value: str):
        self.value = value
 class Constant(TokenWithValue):
    def __init__(self, value: int | str):
        self.value = int(value)
 class KeywordInt(Token):
    pass
 class KeywordVoid(Token):
    pass
 class KeywordReturn(Token):
    pass
 class ParenOpen(Token):
    pass
 class ParenClose(Token):
    pass
 class BraceOpen(Token):
    pass
 class BraceClose(Token):
    pass
 class Semicolon(Token):
    pass
 EXPECTED = [
    KeywordInt(),
    Identifier("main"),
    ParenOpen(),
    KeywordVoid(),
    ParenClose(),
    BraceOpen(),
    KeywordReturn(),
    Constant(2),
    Semicolon(),
    BraceClose(),
 ]
 IDENTIFIER = r"[a-zA-Z_]\w*\b"
 CONSTANT = r"[0-9]+\b"
 KW_INT = r"int\b"
 KW_VOID = r"void\b"
 KW_RETURN = r"return\b"
 PAREN_OPEN = r"\("
 PAREN_CLOSE = r"\)"
 BRACE_OPEN = r"{"
 BRACE_CLOSE = r"}"
 SEMICOLON = r";"
 REGEX_TABLE = (
    ("IDENTIFIER", IDENTIFIER),
    ("CONSTANT", CONSTANT),
    ("KW_INT", KW_INT),
    ("KW_VOID", KW_VOID),
    ("KW_RETURN", KW_RETURN),
    ("PAREN_OPEN", PAREN_OPEN),
    ("PAREN_CLOSE", PAREN_CLOSE),
    ("BRACE_OPEN", BRACE_OPEN),
    ("BRACE_CLOSE", BRACE_CLOSE),
    ("SEMICOLON", SEMICOLON),
 )
 KWS = {
    "KW_INT": KeywordInt,
    "KW_VOID": KeywordVoid,
    "KW_RETURN": KeywordReturn,
 }
 TOK_NAME_TO_CLASS = KWS | {
    "IDENTIFIER": Identifier,
    "CONSTANT": Constant,
    "PAREN_OPEN": ParenOpen,
    "PAREN_CLOSE": ParenClose,
    "BRACE_OPEN": BraceOpen,
    "BRACE_CLOSE": BraceClose,
    "SEMICOLON": Semicolon,
 }
 KEYWORDS = ("return", "void", "int")
 INPUT = """int main(void) {
    return 2;
 }
 """
 def lex(program_str) -> Sequence[Token]:
    tokens = []
    idx = 0
    # while input isn't empty:
    while program_str[idx:]:
        # if input starts with whitespace:
        if program_str[idx].isspace():
            # trim whitespace from start of input
            idx += 1
            continue
        all_matches = []
        for name, regex in REGEX_TABLE:
            matches = re.match(regex, program_str[idx:])
            if matches:
                all_matches.append((name, matches[0]))
        if not all_matches:
            # if no match is found, raise an error
            raise SyntaxError(program_str[idx:])
        # find longest match at start of input for any regex in REGEX_TABLE
        longest_match = None
        for kw, val in all_matches:
            if kw.startswith("KW_"):
                # keywords taken precedence
                longest_match = kw, val
                break
            if longest_match is None or val > longest_match[1]:
                longest_match = kw, val
        kw, val = longest_match  # type: ignore
        # convert matching substring into a token
        klass = TOK_NAME_TO_CLASS[kw]
        if issubclass(klass, Token):
            tok = klass()
        else:
            tok = klass(val)
        substr = val
        if substr is None:
            raise SyntaxError
        tokens.append(tok)
        # remove matching substring from start of input
        idx += len(substr)
    return tokens
 if __name__ == "__main__":
    res = lex(INPUT)
    assert res == EXPECTED, str(res) + "\n\nEXPECTED:\n" + str(EXPECTED)
    print("success")