""" S-Expression Parser Parses S-expressions into Python data structures: - Lists become Python lists - Symbols become Symbol objects - Numbers become int/float - Strings become str - Keywords (:foo) become Keyword objects """ import re from dataclasses import dataclass from typing import Any, List, Union @dataclass(frozen=True) class Symbol: """A symbol (identifier) in the S-expression.""" name: str def __repr__(self): return self.name @dataclass(frozen=True) class Keyword: """A keyword like :foo in the S-expression.""" name: str def __repr__(self): return f":{self.name}" # Token patterns TOKEN_PATTERNS = [ (r'\s+', None), # Whitespace (skip) (r';[^\n]*', None), # Comments (skip) (r'\(', 'LPAREN'), (r'\)', 'RPAREN'), (r'\[', 'LBRACKET'), (r'\]', 'RBRACKET'), (r"'", 'QUOTE'), (r'"([^"\\]|\\.)*"', 'STRING'), (r':[a-zA-Z_][a-zA-Z0-9_\-]*', 'KEYWORD'), (r'-?[0-9]+\.[0-9]+', 'FLOAT'), (r'-?[0-9]+', 'INT'), (r'#t|#f|true|false', 'BOOL'), (r'[a-zA-Z_+\-*/<>=!?][a-zA-Z0-9_+\-*/<>=!?]*', 'SYMBOL'), ] TOKEN_REGEX = '|'.join(f'(?P<{name}>{pattern})' if name else f'(?:{pattern})' for pattern, name in TOKEN_PATTERNS) def tokenize(source: str) -> List[tuple]: """Tokenize S-expression source code.""" tokens = [] for match in re.finditer(TOKEN_REGEX, source): kind = match.lastgroup value = match.group() if kind: tokens.append((kind, value)) return tokens def parse(source: str) -> Any: """Parse S-expression source into Python data structures.""" tokens = tokenize(source) pos = [0] # Use list for mutability in nested function def parse_expr(): if pos[0] >= len(tokens): raise SyntaxError("Unexpected end of input") kind, value = tokens[pos[0]] if kind == 'LPAREN': pos[0] += 1 items = [] while pos[0] < len(tokens) and tokens[pos[0]][0] != 'RPAREN': items.append(parse_expr()) if pos[0] >= len(tokens): raise SyntaxError("Missing closing parenthesis") pos[0] += 1 # Skip RPAREN return items if kind == 'LBRACKET': pos[0] += 1 items = [] while pos[0] < len(tokens) and tokens[pos[0]][0] != 'RBRACKET': items.append(parse_expr()) if pos[0] >= len(tokens): raise SyntaxError("Missing closing bracket") pos[0] += 1 # Skip RBRACKET return items elif kind == 'RPAREN': raise SyntaxError("Unexpected closing parenthesis") elif kind == 'QUOTE': pos[0] += 1 return [Symbol('quote'), parse_expr()] elif kind == 'STRING': pos[0] += 1 # Remove quotes and unescape return value[1:-1].replace('\\"', '"').replace('\\n', '\n') elif kind == 'INT': pos[0] += 1 return int(value) elif kind == 'FLOAT': pos[0] += 1 return float(value) elif kind == 'BOOL': pos[0] += 1 return value in ('#t', 'true') elif kind == 'KEYWORD': pos[0] += 1 return Keyword(value[1:]) # Remove leading : elif kind == 'SYMBOL': pos[0] += 1 return Symbol(value) else: raise SyntaxError(f"Unknown token: {kind} {value}") result = parse_expr() # Check for multiple top-level expressions if pos[0] < len(tokens): # Allow multiple top-level expressions, return as list results = [result] while pos[0] < len(tokens): results.append(parse_expr()) return results return result def parse_file(path: str) -> Any: """Parse an S-expression file.""" with open(path, 'r') as f: return parse(f.read()) # Convenience for pretty-printing def to_sexp(obj: Any) -> str: """Convert Python object back to S-expression string.""" if isinstance(obj, list): return '(' + ' '.join(to_sexp(x) for x in obj) + ')' elif isinstance(obj, Symbol): return obj.name elif isinstance(obj, Keyword): return f':{obj.name}' elif isinstance(obj, str): return f'"{obj}"' elif isinstance(obj, bool): return '#t' if obj else '#f' elif isinstance(obj, (int, float)): return str(obj) else: return repr(obj)