added 'making a C compiler' dir

2025-07-27 19:40:14 +02:00
parent ea5a29c918
commit 174226384a
5 changed files with 392 additions and 1 deletion

Submodule 2025/making_a_c_compiler deleted from 8eaddccf91

View File

@@ -0,0 +1,72 @@
# write a compiler driver - ✅
## Options
- `--lex` -> run the lexer, don't parse
- `--parse` -> lex and parse, don't generate assembly
- `--codegen` -> lex, parse, generate assembly, but don't emit code
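For example (assuming the driver script is saved as `driver.py`; `return_2.c` is an arbitrary input file):
```bash
> python driver.py return_2.c --lex
> python driver.py return_2.c
```
The first call stops after lexing; the second runs every stage implemented so far.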
## Steps
### Preprocess
`-E` stops gcc after preprocessing; `-P` omits linemarkers from the output.
```bash
> gcc -E -P <input-file> -o <preprocessed-file>.i
```
### Compile (stub for now)
### Assemble and Link
```bash
> gcc <assembly-file>.s -o <output-file>
```
# write a lexer - ✅
```python
WORD_BOUNDARY = r"\b"  # raw string: '\b' on its own is a backspace character
IDENTIFIER = "IDENTIFIER"
CONSTANT = "CONSTANT"
KW_INT = "KW_INT"
KW_VOID = "KW_VOID"
KW_RETURN = "KW_RETURN"
PAREN_OPEN = "PAREN_OPEN"
PAREN_CLOSE = "PAREN_CLOSE"
BRACE_OPEN = "BRACE_OPEN"
BRACE_CLOSE = "BRACE_CLOSE"
SEMICOLON = "SEMICOLON"
REGEX_TABLE = (
(r"[a-zA-Z_]\w*\b", IDENTIFIER),
(r"[0-9]+\b", CONSTANT),
(r"int\b", KW_INT),
(r"void\b", KW_VOID),
(r"return\b", KW_RETURN),
(r"\(", PAREN_OPEN),
(r"\)", PAREN_CLOSE),
(r"{", BRACE_OPEN),
(r"}", BRACE_CLOSE),
(r";", SEMICOLON),
)
```
```pseudocode
while input isn't empty:
if input starts with whitespace:
trim whitespace from start of input
else:
find longest match at start of input for any regex in REGEX_TABLE
if no match is found, raise an error
convert matching substring into a token
remove matching substring from start of input
```
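A minimal sketch of the longest-match rule with a two-entry table (standalone; the full table above has ten entries). The tie-break in favour of keywords matters because `int` also matches the identifier regex:
```python
import re

TABLE = (
    (r"[a-zA-Z_]\w*\b", "IDENTIFIER"),
    (r"int\b", "KW_INT"),
)

def longest_match(s: str) -> tuple[str, str]:
    # collect every regex that matches at the start of s...
    matches = [(m[0], name) for regex, name in TABLE if (m := re.match(regex, s))]
    if not matches:
        raise SyntaxError(s)
    # ...then keep the longest match, breaking ties in favour of keywords
    return max(matches, key=lambda p: (len(p[0]), p[1].startswith("KW_")))

print(longest_match("int main(void)"))  # ('int', 'KW_INT')
print(longest_match("integer"))         # ('integer', 'IDENTIFIER')
```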
We must return a list of tokens. Some carry values (identifiers and constants); others do not (keywords and punctuation such as PAREN_OPEN or SEMICOLON).
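For example, `int main(void) { return 2; }` lexes to:
```pseudocode
KW_INT, IDENTIFIER("main"), PAREN_OPEN, KW_VOID, PAREN_CLOSE,
BRACE_OPEN, KW_RETURN, CONSTANT(2), SEMICOLON, BRACE_CLOSE
```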
# write a parser
- `program: Program` nodes can have only one child, a `function_definition`
- `function_definition` has one child, `body: Statement`, and one attribute `name: Identifier`
- `body` has one child, `return_value: Expression`
- we only have one `Expression` for now, which is a `Constant`
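
The same tree shape written compactly, ASDL-style (a restatement of the bullets above, not extra grammar):
```pseudocode
program             = Program(function_definition)
function_definition = Function(identifier name, statement body)
statement           = Return(exp)
exp                 = Constant(int)
```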

View File

@@ -0,0 +1,76 @@
from pathlib import Path
import subprocess
import sys
from typing import Sequence

from lexer import Token, TokenWithValue, lex as _lex


def preprocess(fn: str) -> str:
    output_fn = fn + ".i"
    subprocess.run(f"gcc -E -P {fn} -o {output_fn}", shell=True, check=True)
    return output_fn


def lex(fn: str) -> Sequence[Token | TokenWithValue]:
    return _lex(Path(fn).read_text())


def parse(tokens: list[Token | TokenWithValue]):
    pass


def generate_assembly():
    pass


def emit_code(fn):
    # assemble and link, then remove the intermediate assembly file
    subprocess.run(f"gcc {fn} -o {fn.split('.')[0]}", shell=True, check=True)
    Path(fn).unlink()


FLAGS = "--lex, --parse, --codegen".split(", ")


def main():
    args = sys.argv
    len_args = len(args)
    skip_parse, skip_codegen, skip_emit = False, False, False
    if len_args < 2:
        print("please provide an input filename")
        sys.exit(1)
    elif len_args > 3:
        print("provide one of --lex, --parse, --codegen, or no flags")
        sys.exit(1)
    elif len_args == 3:
        flag = args[2].strip()
        if flag not in FLAGS:
            print("provide one of --lex, --parse, --codegen, or no flags")
            sys.exit(1)
        match flag:
            case "--lex":
                skip_parse = True
                skip_codegen = True
                skip_emit = True
            case "--parse":
                skip_codegen = True
                skip_emit = True
            case "--codegen":
                skip_emit = True
    fn = args[1]
    print("preprocessing")
    fni = preprocess(fn)
    print("lexing")
    tokens = lex(fni)
    if not skip_parse:
        print("parsing")
        parse(tokens)
    if not skip_codegen:
        print("generating assembly (codegen)")
        generate_assembly()
    if not skip_emit:
        # codegen is still a stub, so the .s file doesn't exist yet;
        # once it does, assemble and link it
        emit_code(fn.rsplit(".", 1)[0] + ".s")


if __name__ == "__main__":
    main()
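
# Example session (illustrative; assumes this script is saved as driver.py):
#   $ python driver.py return_2.c --lex
#   preprocessing
#   lexing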

View File

@@ -0,0 +1,180 @@
import re
from typing import Sequence
class Token:
    def __eq__(self, other) -> bool:
        return self.__class__ == other.__class__

    def __repr__(self):
        # name-only repr keeps the assertion message at the bottom readable
        return self.__class__.__name__


class TokenWithValue:
    value: str | int

    def __repr__(self):
        return f"{self.__class__.__name__}({self.value!r})"

    def __eq__(self, other) -> bool:
        return self.__class__ == other.__class__ and self.value == other.value


class Identifier(TokenWithValue):
    def __init__(self, value: str):
        self.value = value


class Constant(TokenWithValue):
    def __init__(self, value: int | str):
        self.value = int(value)


class KeywordInt(Token):
    pass


class KeywordVoid(Token):
    pass


class KeywordReturn(Token):
    pass


class ParenOpen(Token):
    pass


class ParenClose(Token):
    pass


class BraceOpen(Token):
    pass


class BraceClose(Token):
    pass


class Semicolon(Token):
    pass
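
# the token stream the lexer should produce for INPUT (defined further down)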
EXPECTED = [
KeywordInt(),
Identifier("main"),
ParenOpen(),
KeywordVoid(),
ParenClose(),
BraceOpen(),
KeywordReturn(),
Constant(2),
Semicolon(),
BraceClose(),
]
IDENTIFIER = r"[a-zA-Z_]\w*\b"
CONSTANT = r"[0-9]+\b"
KW_INT = r"int\b"
KW_VOID = r"void\b"
KW_RETURN = r"return\b"
PAREN_OPEN = r"\("
PAREN_CLOSE = r"\)"
BRACE_OPEN = r"{"
BRACE_CLOSE = r"}"
SEMICOLON = r";"
REGEX_TABLE = (
("IDENTIFIER", IDENTIFIER),
("CONSTANT", CONSTANT),
("KW_INT", KW_INT),
("KW_VOID", KW_VOID),
("KW_RETURN", KW_RETURN),
("PAREN_OPEN", PAREN_OPEN),
("PAREN_CLOSE", PAREN_CLOSE),
("BRACE_OPEN", BRACE_OPEN),
("BRACE_CLOSE", BRACE_CLOSE),
("SEMICOLON", SEMICOLON),
)
KWS = {
"KW_INT": KeywordInt,
"KW_VOID": KeywordVoid,
"KW_RETURN": KeywordReturn,
}
TOK_NAME_TO_CLASS = KWS | {
"IDENTIFIER": Identifier,
"CONSTANT": Constant,
"PAREN_OPEN": ParenOpen,
"PAREN_CLOSE": ParenClose,
"BRACE_OPEN": BraceOpen,
"BRACE_CLOSE": BraceClose,
"SEMICOLON": Semicolon,
}
KEYWORDS = ("return", "void", "int")
INPUT = """int main(void) {
return 2;
}
"""
def lex(program_str) -> Sequence[Token | TokenWithValue]:
tokens = []
idx = 0
# while input isn't empty:
while program_str[idx:]:
# if input starts with whitespace:
if program_str[idx].isspace():
# trim whitespace from start of input
idx += 1
continue
        all_matches = []
        for name, regex in REGEX_TABLE:
            m = re.match(regex, program_str[idx:])
            if m:
                all_matches.append((name, m[0]))
        if not all_matches:
            # if no match is found, raise an error
            raise SyntaxError(program_str[idx:])
        # find longest match at start of input for any regex in REGEX_TABLE
        longest_match = None
        for kw, val in all_matches:
            if kw.startswith("KW_"):
                # keywords take precedence over the identifier match of the
                # same spelling ("int" is KW_INT, not IDENTIFIER)
                longest_match = kw, val
                break
            if longest_match is None or len(val) > len(longest_match[1]):
                # compare match lengths, not the strings themselves
                longest_match = kw, val
        kw, val = longest_match  # type: ignore
        # convert matching substring into a token
        klass = TOK_NAME_TO_CLASS[kw]
        if issubclass(klass, Token):
            tok = klass()
        else:
            tok = klass(val)
        tokens.append(tok)
        # remove matching substring from start of input
        idx += len(val)
    return tokens
if __name__ == "__main__":
res = lex(INPUT)
assert res == EXPECTED, str(res) + "\n\nEXPECTED:\n" + str(EXPECTED)
print("success")

View File

@@ -0,0 +1,64 @@
from __future__ import annotations
from enum import Enum
from lexer import Token, TokenWithValue
class Node:
def __init__(self, value: str | int):
self._value = value
self._children = []
def add_child(self, child: Node):
self._children.append(child)
type Identifier = str
class FunctionDefinition:
name: Identifier
body: Statement
class Expression(Enum):
Constant = "Constant"
class Statement:
return_value: Expression
class Program:
function_definition: FunctionDefinition
def parse(tokens: list[Token | TokenWithValue]) -> Node:
    """
    - `program: Program` nodes can have only one child, a `function_definition`
    - `function_definition` has one child, `body: Statement`, and one attribute `name: Identifier`
    - `body` has one child, `return_value: Expression`
    - we only have one `Expression`, which is a `Constant`
    """
    ...
if __name__ == "__main__":
# TODO: test that the tree is what it should be
...