added 'making a C compiler' dir

2025-07-27 19:40:14 +02:00
parent ea5a29c918
commit 174226384a
5 changed files with 392 additions and 1 deletion

Submodule 2025/making_a_c_compiler deleted from 8eaddccf91

View File

@@ -0,0 +1,72 @@
# write a compiler driver - ✅
## Options
- `--lex` -> run the lexer, don't parse
- `--parse` -> lex and parse, don't generate assembly
- `--codegen` -> lex, parse, generate assembly, but don't emit code
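For example (assuming the driver script is saved as `driver.py`; `return_2.c` is an arbitrary input file):
```bash
> python driver.py return_2.c --lex
> python driver.py return_2.c
```
The first call stops after lexing; the second runs every stage implemented so far.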
## Steps
### Preprocess
`-E` stops gcc after preprocessing; `-P` omits linemarkers from the output.
```bash
> gcc -E -P <input-file> -o <preprocessed-file>.i
```
### Compile (stub for now)
### Assemble and Link
```bash
> gcc <assembly-file>.s -o <output-file>
```
# write a lexer - ✅
```python
WORD_BOUNDARY = r"\b"  # raw string: '\b' on its own is a backspace character
IDENTIFIER = "IDENTIFIER"
CONSTANT = "CONSTANT"
KW_INT = "KW_INT"
KW_VOID = "KW_VOID"
KW_RETURN = "KW_RETURN"
PAREN_OPEN = "PAREN_OPEN"
PAREN_CLOSE = "PAREN_CLOSE"
BRACE_OPEN = "BRACE_OPEN"
BRACE_CLOSE = "BRACE_CLOSE"
SEMICOLON = "SEMICOLON"
REGEX_TABLE = (
(r"[a-zA-Z_]\w*\b", IDENTIFIER),
(r"[0-9]+\b", CONSTANT),
(r"int\b", KW_INT),
(r"void\b", KW_VOID),
(r"return\b", KW_RETURN),
(r"\(", PAREN_OPEN),
(r"\)", PAREN_CLOSE),
(r"{", BRACE_OPEN),
(r"}", BRACE_CLOSE),
(r";", SEMICOLON),
)
```
```pseudocode
while input isn't empty:
if input starts with whitespace:
trim whitespace from start of input
else:
find longest match at start of input for any regex in REGEX_TABLE
if no match is found, raise an error
convert matching substring into a token
remove matching substring from start of input
```
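A minimal sketch of the longest-match rule with a two-entry table (standalone; the full table above has ten entries). The tie-break in favour of keywords matters because `int` also matches the identifier regex:
```python
import re

TABLE = (
    (r"[a-zA-Z_]\w*\b", "IDENTIFIER"),
    (r"int\b", "KW_INT"),
)

def longest_match(s: str) -> tuple[str, str]:
    # collect every regex that matches at the start of s...
    matches = [(m[0], name) for regex, name in TABLE if (m := re.match(regex, s))]
    if not matches:
        raise SyntaxError(s)
    # ...then keep the longest match, breaking ties in favour of keywords
    return max(matches, key=lambda p: (len(p[0]), p[1].startswith("KW_")))

print(longest_match("int main(void)"))  # ('int', 'KW_INT')
print(longest_match("integer"))         # ('integer', 'IDENTIFIER')
```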
We must return a list of tokens. Some carry values (identifiers and constants); others do not (keywords and punctuation such as PAREN_OPEN or SEMICOLON).
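For example, `int main(void) { return 2; }` lexes to:
```pseudocode
KW_INT, IDENTIFIER("main"), PAREN_OPEN, KW_VOID, PAREN_CLOSE,
BRACE_OPEN, KW_RETURN, CONSTANT(2), SEMICOLON, BRACE_CLOSE
```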
# write a parser
- `program: Program` nodes can have only one child, a `function_definition`
- `function_definition` has one child, `body: Statement`, and one attribute `name: Identifier`
- `body` has one child, `return_value: Expression`
- we only have one `Expression` for now, which is a `Constant`
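
The same tree shape written compactly, ASDL-style (a restatement of the bullets above, not extra grammar):
```pseudocode
program             = Program(function_definition)
function_definition = Function(identifier name, statement body)
statement           = Return(exp)
exp                 = Constant(int)
```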

View File

@@ -0,0 +1,76 @@
from pathlib import Path
import subprocess
import sys
from typing import Sequence

from lexer import Token, TokenWithValue, lex as _lex


def preprocess(fn: str) -> str:
    output_fn = fn + ".i"
    subprocess.run(f"gcc -E -P {fn} -o {output_fn}", shell=True, check=True)
    return output_fn


def lex(fn: str) -> Sequence[Token | TokenWithValue]:
    return _lex(Path(fn).read_text())


def parse(tokens: list[Token | TokenWithValue]):
    pass


def generate_assembly():
    pass


def emit_code(fn):
    # assemble and link, then remove the intermediate assembly file
    subprocess.run(f"gcc {fn} -o {fn.split('.')[0]}", shell=True, check=True)
    Path(fn).unlink()


FLAGS = "--lex, --parse, --codegen".split(", ")


def main():
    args = sys.argv
    len_args = len(args)
    skip_parse, skip_codegen, skip_emit = False, False, False
    if len_args < 2:
        print("please provide an input filename")
        sys.exit(1)
    elif len_args > 3:
        print("provide one of --lex, --parse, --codegen, or no flags")
        sys.exit(1)
    elif len_args == 3:
        flag = args[2].strip()
        if flag not in FLAGS:
            print("provide one of --lex, --parse, --codegen, or no flags")
            sys.exit(1)
        match flag:
            case "--lex":
                skip_parse = True
                skip_codegen = True
                skip_emit = True
            case "--parse":
                skip_codegen = True
                skip_emit = True
            case "--codegen":
                skip_emit = True
    fn = args[1]
    print("preprocessing")
    fni = preprocess(fn)
    print("lexing")
    tokens = lex(fni)
    if not skip_parse:
        print("parsing")
        parse(tokens)
    if not skip_codegen:
        print("generating assembly (codegen)")
        generate_assembly()
    if not skip_emit:
        # codegen is still a stub, so the .s file doesn't exist yet;
        # once it does, assemble and link it
        emit_code(fn.rsplit(".", 1)[0] + ".s")


if __name__ == "__main__":
    main()
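
# Example session (illustrative; assumes this script is saved as driver.py):
#   $ python driver.py return_2.c --lex
#   preprocessing
#   lexing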

View File

@@ -0,0 +1,180 @@
import re
from typing import Sequence
class Token:
    def __eq__(self, other) -> bool:
        return self.__class__ == other.__class__

    def __repr__(self):
        # name-only repr keeps the assertion message at the bottom readable
        return self.__class__.__name__


class TokenWithValue:
    value: str | int

    def __repr__(self):
        return f"{self.__class__.__name__}({self.value!r})"

    def __eq__(self, other) -> bool:
        return self.__class__ == other.__class__ and self.value == other.value


class Identifier(TokenWithValue):
    def __init__(self, value: str):
        self.value = value


class Constant(TokenWithValue):
    def __init__(self, value: int | str):
        self.value = int(value)


class KeywordInt(Token):
    pass


class KeywordVoid(Token):
    pass


class KeywordReturn(Token):
    pass


class ParenOpen(Token):
    pass


class ParenClose(Token):
    pass


class BraceOpen(Token):
    pass


class BraceClose(Token):
    pass


class Semicolon(Token):
    pass
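
# the token stream the lexer should produce for INPUT (defined further down)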
EXPECTED = [
KeywordInt(),
Identifier("main"),
ParenOpen(),
KeywordVoid(),
ParenClose(),
BraceOpen(),
KeywordReturn(),
Constant(2),
Semicolon(),
BraceClose(),
]
IDENTIFIER = r"[a-zA-Z_]\w*\b"
CONSTANT = r"[0-9]+\b"
KW_INT = r"int\b"
KW_VOID = r"void\b"
KW_RETURN = r"return\b"
PAREN_OPEN = r"\("
PAREN_CLOSE = r"\)"
BRACE_OPEN = r"{"
BRACE_CLOSE = r"}"
SEMICOLON = r";"
REGEX_TABLE = (
("IDENTIFIER", IDENTIFIER),
("CONSTANT", CONSTANT),
("KW_INT", KW_INT),
("KW_VOID", KW_VOID),
("KW_RETURN", KW_RETURN),
("PAREN_OPEN", PAREN_OPEN),
("PAREN_CLOSE", PAREN_CLOSE),
("BRACE_OPEN", BRACE_OPEN),
("BRACE_CLOSE", BRACE_CLOSE),
("SEMICOLON", SEMICOLON),
)
KWS = {
"KW_INT": KeywordInt,
"KW_VOID": KeywordVoid,
"KW_RETURN": KeywordReturn,
}
TOK_NAME_TO_CLASS = KWS | {
"IDENTIFIER": Identifier,
"CONSTANT": Constant,
"PAREN_OPEN": ParenOpen,
"PAREN_CLOSE": ParenClose,
"BRACE_OPEN": BraceOpen,
"BRACE_CLOSE": BraceClose,
"SEMICOLON": Semicolon,
}
KEYWORDS = ("return", "void", "int")
INPUT = """int main(void) {
return 2;
}
"""
def lex(program_str) -> Sequence[Token | TokenWithValue]:
tokens = []
idx = 0
# while input isn't empty:
while program_str[idx:]:
# if input starts with whitespace:
if program_str[idx].isspace():
# trim whitespace from start of input
idx += 1
continue
        all_matches = []
        for name, regex in REGEX_TABLE:
            m = re.match(regex, program_str[idx:])
            if m:
                all_matches.append((name, m[0]))
        if not all_matches:
            # if no match is found, raise an error
            raise SyntaxError(program_str[idx:])
        # find longest match at start of input for any regex in REGEX_TABLE
        longest_match = None
        for kw, val in all_matches:
            if kw.startswith("KW_"):
                # keywords take precedence over the identifier match of the
                # same spelling ("int" is KW_INT, not IDENTIFIER)
                longest_match = kw, val
                break
            if longest_match is None or len(val) > len(longest_match[1]):
                # compare match lengths, not the strings themselves
                longest_match = kw, val
        kw, val = longest_match  # type: ignore
        # convert matching substring into a token
        klass = TOK_NAME_TO_CLASS[kw]
        if issubclass(klass, Token):
            tok = klass()
        else:
            tok = klass(val)
        tokens.append(tok)
        # remove matching substring from start of input
        idx += len(val)
    return tokens
if __name__ == "__main__":
res = lex(INPUT)
assert res == EXPECTED, str(res) + "\n\nEXPECTED:\n" + str(EXPECTED)
print("success")

View File

@@ -0,0 +1,64 @@
from __future__ import annotations
from enum import Enum
from lexer import Token, TokenWithValue
class Node:
def __init__(self, value: str | int):
self._value = value
self._children = []
def add_child(self, child: Node):
self._children.append(child)
type Identifier = str
class FunctionDefinition:
name: Identifier
body: Statement
class Expression(Enum):
Constant = "Constant"
class Statement:
return_value: Expression
class Program:
function_definition: FunctionDefinition
def parse(tokens: list[Token | TokenWithValue]) -> Node:
    """
    - `program: Program` nodes can have only one child, a `function_definition`
    - `function_definition` has one child, `body: Statement`, and one attribute `name: Identifier`
    - `body` has one child, `return_value: Expression`
    - we only have one `Expression`, which is a `Constant`
    """
    ...
if __name__ == "__main__":
# TODO: test that the tree is what it should be
...