"""
S-expression parser.

Supports:
- Lists: (a b c)
- Vectors: [a b c] (sugar for lists)
- Maps: {:key1 val1 :key2 val2}
- Symbols: foo, bar-baz, ->, ~card
- Keywords: :class, :id
- Strings: "hello world" (with \\n, \\t, \\", \\\\ escapes)
- Numbers: 42, 3.14, -1.5, 1e-3
- Comments: ; to end of line
- Fragment: <> (empty-tag symbol for fragment groups)
"""

from __future__ import annotations

import re
from typing import Any

from .types import Keyword, Symbol, NIL


# ---------------------------------------------------------------------------
# SxExpr — pre-built sx source marker
# ---------------------------------------------------------------------------

class SxExpr:
    """Pre-built sx source that serialize() outputs unquoted.

    Use this to nest sx call strings inside other sx_call() invocations
    without them being quoted as strings::

        sx_call("parent", child=SxExpr(sx_call("child", x=1)))
        # => (~parent :child (~child :x 1))
    """

    __slots__ = ("source",)

    def __init__(self, source: str):
        self.source = source

    def __repr__(self) -> str:
        return f"SxExpr({self.source!r})"

    def __str__(self) -> str:
        return self.source

    def __add__(self, other: object) -> "SxExpr":
        # Concatenation keeps the "already serialized" marker on the result.
        return SxExpr(self.source + str(other))

    def __radd__(self, other: object) -> "SxExpr":
        return SxExpr(str(other) + self.source)


# ---------------------------------------------------------------------------
# Errors
# ---------------------------------------------------------------------------

class ParseError(Exception):
    """Error during s-expression parsing.

    Attributes:
        position: 0-based character offset into the source text.
        line: 1-based line number.
        col: 1-based column number.

    The exception message is ``"<message> at line <line>, column <col>"``.
    """

    def __init__(self, message: str, position: int = 0, line: int = 1, col: int = 1):
        self.position = position
        self.line = line
        self.col = col
        super().__init__(f"{message} at line {line}, column {col}")


# ---------------------------------------------------------------------------
# Tokenizer
# ---------------------------------------------------------------------------

class Tokenizer:
    """Stateful tokenizer that walks an s-expression string.

    ``pos`` is the 0-based offset of the next unread character; ``line`` and
    ``col`` are the matching 1-based coordinates used in error messages.
    """

    WHITESPACE = re.compile(r"\s+")
    COMMENT = re.compile(r";[^\n]*")
    STRING = re.compile(r'"(?:[^"\\]|\\.)*"')
    NUMBER = re.compile(r"-?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?")
    KEYWORD = re.compile(r":[a-zA-Z_][a-zA-Z0-9_>:-]*")
    # Symbols may start with alpha, _, or common operator chars, plus ~ for components,
    # <> for the fragment symbol, and & for &key/&rest.
    SYMBOL = re.compile(r"[a-zA-Z_~*+\-><=/!?&][a-zA-Z0-9_~*+\-><=/!?.:&]*")
    # One backslash escape inside a string literal, and what each known one
    # decodes to.  Escapes not listed here are kept verbatim.
    ESCAPE = re.compile(r"\\(.)")
    ESCAPE_CHARS = {"n": "\n", "t": "\t", '"': '"', "/": "/", "\\": "\\"}

    def __init__(self, text: str):
        self.text = text
        self.pos = 0
        self.line = 1
        self.col = 1

    def _advance(self, count: int = 1):
        """Consume up to *count* characters, keeping ``line``/``col`` in sync.

        Stops at the end of the text; a non-positive *count* is a no-op.
        """
        if count <= 0:
            return
        end = min(self.pos + count, len(self.text))
        consumed = self.text[self.pos:end]
        newlines = consumed.count("\n")
        if newlines:
            self.line += newlines
            # Column restarts at 1 right after the last newline consumed.
            self.col = len(consumed) - consumed.rfind("\n")
        else:
            self.col += len(consumed)
        self.pos = end

    def _skip_whitespace_and_comments(self):
        """Advance past any run of whitespace and ``;`` line comments."""
        while self.pos < len(self.text):
            m = self.WHITESPACE.match(self.text, self.pos)
            if m is None:
                m = self.COMMENT.match(self.text, self.pos)
            if m is None:
                break
            self._advance(m.end() - self.pos)

    def peek(self) -> str | None:
        """Return the next significant raw character without consuming it.

        Leading whitespace/comments are skipped (and consumed).  Returns
        ``None`` at end of input.
        """
        self._skip_whitespace_and_comments()
        if self.pos >= len(self.text):
            return None
        return self.text[self.pos]

    @classmethod
    def _unescape(cls, match: "re.Match[str]") -> str:
        """Decode one backslash escape; unknown escapes are left untouched."""
        return cls.ESCAPE_CHARS.get(match.group(1), match.group(0))

    def next_token(self) -> Any:
        """Consume and return the next token, or ``None`` at end of input.

        Returns one of: a delimiter character from ``()[]{}`` (as ``str``),
        a decoded string literal (``str``), a ``Keyword``, an ``int`` or
        ``float``, ``True``/``False``/``NIL`` for the literal symbols
        ``true``/``false``/``nil``, or a ``Symbol``.

        Raises:
            ParseError: on an unterminated string, a ``:`` that does not
                start a valid keyword, or any character that starts no token.
        """
        self._skip_whitespace_and_comments()
        if self.pos >= len(self.text):
            return None

        text = self.text
        char = text[self.pos]

        # Delimiters
        if char in "()[]{}":
            self._advance()
            return char

        # String
        if char == '"':
            m = self.STRING.match(text, self.pos)
            if not m:
                raise ParseError("Unterminated string", self.pos, self.line, self.col)
            self._advance(m.end() - self.pos)
            # Decode escapes in a single left-to-right pass.  Chained
            # str.replace() calls are order-dependent: they would turn the
            # source text \\n (escaped backslash followed by "n") into a
            # backslash plus a newline.
            return self.ESCAPE.sub(self._unescape, m.group()[1:-1])

        # Keyword
        if char == ":":
            m = self.KEYWORD.match(text, self.pos)
            if m:
                self._advance(m.end() - self.pos)
                return Keyword(m.group()[1:])
            raise ParseError("Invalid keyword", self.pos, self.line, self.col)

        # Number (check before symbol because of leading -)
        if char.isdigit() or (
            char == "-"
            and self.pos + 1 < len(text)
            and (text[self.pos + 1].isdigit() or text[self.pos + 1] == ".")
        ):
            m = self.NUMBER.match(text, self.pos)
            if m:
                self._advance(m.end() - self.pos)
                num_str = m.group()
                if "." in num_str or "e" in num_str or "E" in num_str:
                    return float(num_str)
                return int(num_str)
            # No match (e.g. "-." not followed by a digit): fall through and
            # let the symbol rule try.

        # Symbol
        m = self.SYMBOL.match(text, self.pos)
        if m:
            self._advance(m.end() - self.pos)
            name = m.group()
            # Built-in literal symbols
            if name == "true":
                return True
            if name == "false":
                return False
            if name == "nil":
                return NIL
            return Symbol(name)

        raise ParseError(f"Unexpected character: {char!r}", self.pos, self.line, self.col)


# ---------------------------------------------------------------------------
# Parsing
# ---------------------------------------------------------------------------

def parse(text: str) -> Any:
    """Parse a single s-expression from *text*.

    Raises:
        ParseError: if *text* is empty, malformed, or has trailing content
            after the first expression.

    >>> parse('(div :class "main" (p "hello"))')
    [Symbol('div'), Keyword('class'), 'main', [Symbol('p'), 'hello']]
    """
    tok = Tokenizer(text)
    result = _parse_expr(tok)
    if tok.peek() is not None:
        raise ParseError("Unexpected content after expression", tok.pos, tok.line, tok.col)
    return result


def parse_all(text: str) -> list[Any]:
    """Parse zero or more s-expressions from *text*."""
    tok = Tokenizer(text)
    results: list[Any] = []
    while tok.peek() is not None:
        results.append(_parse_expr(tok))
    return results


def _parse_expr(tok: Tokenizer) -> Any:
    """Parse one expression starting at the tokenizer's current position.

    Raises:
        ParseError: at end of input or on a closing delimiter with no
            matching opener.
    """
    # Use peek() (raw character) for structural decisions so that string
    # values like ")" or "(" don't get confused with actual delimiters.
    raw = tok.peek()
    if raw is None:
        raise ParseError("Unexpected end of input", tok.pos, tok.line, tok.col)
    if raw in ")]}":
        # Record where the stray delimiter is *before* consuming it, so the
        # error points at the delimiter rather than one character past it.
        pos, line, col = tok.pos, tok.line, tok.col
        tok.next_token()  # consume the delimiter
        raise ParseError(f"Unexpected {raw!r}", pos, line, col)
    if raw == "(":
        tok.next_token()  # consume the '('
        return _parse_list(tok, ")")
    if raw == "[":
        tok.next_token()  # consume the '['
        return _parse_list(tok, "]")
    if raw == "{":
        tok.next_token()  # consume the '{'
        return _parse_map(tok)
    # Everything else: strings, keywords, symbols, numbers
    return tok.next_token()


def _parse_list(tok: Tokenizer, closer: str) -> list[Any]:
    """Parse items until *closer*; the opening delimiter is already consumed.

    Raises:
        ParseError: if input ends before *closer* is seen.
    """
    items: list[Any] = []
    while True:
        c = tok.peek()
        if c is None:
            raise ParseError(f"Unterminated list, expected {closer!r}", tok.pos, tok.line, tok.col)
        if c == closer:
            tok.next_token()
            return items
        items.append(_parse_expr(tok))


def _parse_map(tok: Tokenizer) -> dict[str, Any]:
    """Parse ``key value`` pairs until ``}``; the ``{`` is already consumed.

    Keys must be keywords (stored by name) or strings.

    Raises:
        ParseError: on end of input, a key of another type, or a key that
            has no value before the closing ``}``.
    """
    result: dict[str, Any] = {}
    while True:
        c = tok.peek()
        if c is None:
            raise ParseError("Unterminated map, expected '}'", tok.pos, tok.line, tok.col)
        if c == "}":
            tok.next_token()
            return result

        # peek() left the tokenizer on the key's first character.
        key_pos, key_line, key_col = tok.pos, tok.line, tok.col
        key_token = _parse_expr(tok)
        if isinstance(key_token, Keyword):
            key = key_token.name
        elif isinstance(key_token, str):
            key = key_token
        else:
            raise ParseError(
                f"Map key must be keyword or string, got {type(key_token).__name__}",
                key_pos, key_line, key_col,
            )

        if tok.peek() == "}":
            raise ParseError(f"Map key {key!r} has no value", tok.pos, tok.line, tok.col)
        result[key] = _parse_expr(tok)


# ---------------------------------------------------------------------------
# Serialization
# ---------------------------------------------------------------------------

def serialize(expr: Any, indent: int = 0, pretty: bool = False) -> str:
    """Serialize a value back to s-expression text.

    Args:
        expr: value to serialize.
        indent: current nesting depth; only affects pretty output.
        pretty: when true, lists too long for one line are broken across
            lines by ``_serialize_pretty``.
    """
    if isinstance(expr, SxExpr):
        return expr.source

    if isinstance(expr, list):
        if not expr:
            return "()"
        if pretty:
            return _serialize_pretty(expr, indent)
        items = [serialize(item, indent, False) for item in expr]
        return "(" + " ".join(items) + ")"

    if isinstance(expr, Symbol):
        return expr.name

    if isinstance(expr, Keyword):
        return f":{expr.name}"

    if isinstance(expr, str):
        escaped = (
            expr.replace("\\", "\\\\")  # must run first so later escapes aren't doubled
            .replace('"', '\\"')
            .replace("\n", "\\n")
            .replace("\t", "\\t")
            # NOTE(review): the argument of this last replace was unreadable
            # in the reviewed copy.  Reconstructed as the "</script" guard that
            # pairs with the tokenizer's "\/" unescape -- confirm.
            .replace("</script", "<\\/script")
        )
        return f'"{escaped}"'

    # NOTE(review): every branch from here to the end of this function was
    # unreadable in the reviewed copy and is reconstructed so that values
    # produced by parse() round-trip.  Verify against the original before
    # merging; any extra cases it handled must be restored.
    if isinstance(expr, bool):  # before the number check: bool is an int subclass
        return "true" if expr else "false"

    if isinstance(expr, (int, float)):
        return str(expr)

    if expr is None or expr is NIL:
        return "nil"

    if isinstance(expr, dict):
        parts: list[str] = []
        for key, value in expr.items():
            key_text = f":{key}"
            # Keys that are not valid keyword syntax are written as strings,
            # which _parse_map also accepts.
            if not (isinstance(key, str) and Tokenizer.KEYWORD.fullmatch(key_text)):
                key_text = serialize(str(key), indent, False)
            parts.append(key_text)
            parts.append(serialize(value, indent, pretty))
        return "{" + " ".join(parts) + "}"

    return repr(expr)


def _serialize_pretty(expr: list, indent: int) -> str:
    """Serialize a list across multiple lines when it is too long for one.

    The compact form is used if it is shorter than 72 characters and has no
    newline.  Otherwise the head goes on the first line and each following
    item on its own indented line; a keyword is kept on the same line as the
    value that follows it.
    """
    if not expr:
        return "()"

    inner_prefix = " " * (indent + 1)

    # Try compact first
    compact = serialize(expr, indent, False)
    if len(compact) < 72 and "\n" not in compact:
        return compact

    head = serialize(expr[0], indent + 1, False)
    parts = [f"({head}"]

    i = 1
    while i < len(expr):
        item = expr[i]
        if isinstance(item, Keyword) and i + 1 < len(expr):
            key = serialize(item, 0, False)
            val = serialize(expr[i + 1], indent + 1, False)
            if len(val) >= 50 or "\n" in val:
                # Value too long for one line: re-serialize it in pretty mode.
                val = serialize(expr[i + 1], indent + 1, True)
            parts.append(f"{inner_prefix}{key} {val}")
            i += 2
        else:
            item_str = serialize(item, indent + 1, True)
            parts.append(f"{inner_prefix}{item_str}")
            i += 1

    return "\n".join(parts) + ")"