Files
making-a-c-compiler/lexer.py
2025-07-26 14:25:29 +02:00

181 lines
3.6 KiB
Python

import re
from typing import Sequence
class Token:
pass
def __eq__(self, other) -> bool:
return self.__class__ == other.__class__
class TokenWithValue:
value: str | int
def __repr__(self):
return f"{self.__class__}({self.value})"
def __eq__(self, other) -> bool:
return self.__class__ == other.__class__ and self.value == other.value
class Identifier(TokenWithValue):
def __init__(self, value: str):
self.value = value
class Constant(TokenWithValue):
def __init__(self, value: int | str):
self.value = int(value)
class KeywordInt(Token):
pass
class KeywordVoid(Token):
pass
class KeywordReturn(Token):
pass
class ParenOpen(Token):
pass
class ParenClose(Token):
pass
class BraceOpen(Token):
pass
class BraceClose(Token):
pass
class Semicolon(Token):
pass
EXPECTED = [
KeywordInt(),
Identifier("main"),
ParenOpen(),
KeywordVoid(),
ParenClose(),
BraceOpen(),
KeywordReturn(),
Constant(2),
Semicolon(),
BraceClose(),
]
IDENTIFIER = r"[a-zA-Z_]\w*\b"
CONSTANT = r"[0-9]+\b"
KW_INT = r"int\b"
KW_VOID = r"void\b"
KW_RETURN = r"return\b"
PAREN_OPEN = r"\("
PAREN_CLOSE = r"\)"
BRACE_OPEN = r"{"
BRACE_CLOSE = r"}"
SEMICOLON = r";"
REGEX_TABLE = (
("IDENTIFIER", IDENTIFIER),
("CONSTANT", CONSTANT),
("KW_INT", KW_INT),
("KW_VOID", KW_VOID),
("KW_RETURN", KW_RETURN),
("PAREN_OPEN", PAREN_OPEN),
("PAREN_CLOSE", PAREN_CLOSE),
("BRACE_OPEN", BRACE_OPEN),
("BRACE_CLOSE", BRACE_CLOSE),
("SEMICOLON", SEMICOLON),
)
KWS = {
"KW_INT": KeywordInt,
"KW_VOID": KeywordVoid,
"KW_RETURN": KeywordReturn,
}
TOK_NAME_TO_CLASS = KWS | {
"IDENTIFIER": Identifier,
"CONSTANT": Constant,
"PAREN_OPEN": ParenOpen,
"PAREN_CLOSE": ParenClose,
"BRACE_OPEN": BraceOpen,
"BRACE_CLOSE": BraceClose,
"SEMICOLON": Semicolon,
}
KEYWORDS = ("return", "void", "int")
INPUT = """int main(void) {
return 2;
}
"""
def lex(program_str) -> Sequence[Token]:
tokens = []
idx = 0
# while input isn't empty:
while program_str[idx:]:
# if input starts with whitespace:
if program_str[idx].isspace():
# trim whitespace from start of input
idx += 1
continue
all_matches = []
for name, regex in REGEX_TABLE:
matches = re.match(regex, program_str[idx:])
if matches:
all_matches.append((name, matches[0]))
if not all_matches:
# if no match is found, raise an error
raise SyntaxError(program_str[idx:])
# find longest match at start of input for any regex in REGEX_TABLE
longest_match = None
for kw, val in all_matches:
if kw.startswith("KW_"):
# keywords taken precedence
longest_match = kw, val
break
if longest_match is None or val > longest_match[1]:
longest_match = kw, val
kw, val = longest_match # type: ignore
# convert matching substring into a token
klass = TOK_NAME_TO_CLASS[kw]
if issubclass(klass, Token):
tok = klass()
else:
tok = klass(val)
substr = val
if substr is None:
raise SyntaxError
tokens.append(tok)
# remove matching substring from start of input
idx += len(substr)
return tokens
if __name__ == "__main__":
res = lex(INPUT)
assert res == EXPECTED, str(res) + "\n\nEXPECTED:\n" + str(EXPECTED)
print("success")