Files
rose-ash/shared/sx/parser.py
giles 03ba8e58e5
All checks were successful
Build and Deploy / build-and-deploy (push) Successful in 11m13s
Implement reader macros (#;, #|...|, #', #name) and #z3 demo
Reader macros in parser.sx spec, Python parser.py, and hand-written sx.js:
- #; datum comment: read and discard next expression
- #|...|  raw string: no escape processing
- #' quote shorthand: (quote expr)
- #name extensible dispatch: registered handler transforms next expression

#z3 reader macro demo (reader_z3.py): translates define-primitive
declarations from primitives.sx into SMT-LIB verification conditions.
Same source, two interpretations — bootstrappers compile to executable
code, #z3 extracts proof obligations.

48 parser tests (SX spec + Python), all passing. Rebootstrapped JS+Python.
Demo page at /plans/reader-macro-demo with side-by-side examples.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 20:21:40 +00:00

503 lines
16 KiB
Python

"""
S-expression parser.
Supports:
- Lists: (a b c)
- Vectors: [a b c] (sugar for lists)
- Maps: {:key1 val1 :key2 val2}
- Symbols: foo, bar-baz, ->, ~card
- Keywords: :class, :id
- Strings: "hello world" (with \\n, \\t, \\", \\\\ escapes)
- Numbers: 42, 3.14, -1.5, 1e-3
- Comments: ; to end of line
- Fragment: <> (empty-tag symbol for fragment groups)
"""
from __future__ import annotations
import re
from typing import Any
from .types import Keyword, Symbol, NIL
# ---------------------------------------------------------------------------
# Reader macro registry
# ---------------------------------------------------------------------------
# Registry of #name → handler used by the reader-macro dispatch in _parse_expr.
_READER_MACROS: dict[str, Any] = {}


def register_reader_macro(name: str, handler: Any) -> None:
    """Register a reader macro: ``#name expr`` will parse as ``handler(expr)``.

    Registering an already-used *name* replaces the previous handler.
    """
    _READER_MACROS[name] = handler
# ---------------------------------------------------------------------------
# SxExpr — pre-built sx source marker
# ---------------------------------------------------------------------------
class SxExpr(str):
    """A ``str`` subclass marking pre-built sx source.

    Because it *is* a string, it works everywhere a plain string does
    (join, startswith, f-strings, isinstance checks).  The one difference
    is that ``serialize()`` emits it verbatim instead of wrapping it in
    double quotes.

    Use it to nest sx call strings inside other ``sx_call()`` invocations
    without them being quoted as string literals::

        sx_call("parent", child=sx_call("child", x=1))
        # => (~parent :child (~child :x 1))
    """

    def __new__(cls, source: str = "") -> "SxExpr":
        return str.__new__(cls, source)

    @property
    def source(self) -> str:
        """Raw SX source text (kept for backward compatibility)."""
        return str.__str__(self)

    def __repr__(self) -> str:
        return f"SxExpr({str.__repr__(self)})"

    # Concatenation keeps the SxExpr marker, so built-up source stays
    # unquoted by serialize().  str.__add__ is called explicitly to avoid
    # re-dispatching through this subclass.
    def __add__(self, other: object) -> "SxExpr":
        return SxExpr(str.__add__(self, str(other)))

    def __radd__(self, other: object) -> "SxExpr":
        return SxExpr(str.__add__(str(other), self))
# ---------------------------------------------------------------------------
# Errors
# ---------------------------------------------------------------------------
# Recognized backslash escapes inside double-quoted string literals.
_ESCAPE_MAP = {"n": "\n", "t": "\t", "r": "\r", "0": "\0", '"': '"', "\\": "\\", "/": "/"}


def _unescape_string(s: str) -> str:
    """Resolve backslash escape sequences in *s*.

    Unknown escapes drop the backslash and keep the following character;
    a lone trailing backslash is kept as-is.
    """
    out: list[str] = []
    i, n = 0, len(s)
    while i < n:
        ch = s[i]
        if ch == "\\" and i + 1 < n:
            follow = s[i + 1]
            out.append(_ESCAPE_MAP.get(follow, follow))
            i += 2
        else:
            out.append(ch)
            i += 1
    return "".join(out)
class ParseError(Exception):
    """Raised when s-expression parsing fails.

    Carries the absolute character ``position`` plus human-friendly
    1-based ``line``/``col`` coordinates of the failure.
    """

    def __init__(self, message: str, position: int = 0, line: int = 1, col: int = 1):
        self.position = position
        self.line = line
        self.col = col
        # The rendered message embeds the location for readable tracebacks.
        super().__init__(f"{message} at line {line}, column {col}")
# ---------------------------------------------------------------------------
# Tokenizer
# ---------------------------------------------------------------------------
class Tokenizer:
"""Stateful tokenizer that walks an s-expression string."""
WHITESPACE = re.compile(r"\s+")
COMMENT = re.compile(r";[^\n]*")
STRING = re.compile(r'"(?:[^"\\]|\\.)*"')
NUMBER = re.compile(r"-?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?")
KEYWORD = re.compile(r":[a-zA-Z_~*+\-><=/!?&\[]{1}[a-zA-Z0-9_~*+\-><=/!?.:&/\[\]#,]*")
# Symbols may start with alpha, _, or common operator chars, plus ~ for components,
# <> for the fragment symbol, and & for &key/&rest.
SYMBOL = re.compile(r"[a-zA-Z_~*+\-><=/!?&][a-zA-Z0-9_~*+\-><=/!?.:&]*")
def __init__(self, text: str):
self.text = text
self.pos = 0
self.line = 1
self.col = 1
def _advance(self, count: int = 1):
for _ in range(count):
if self.pos < len(self.text):
if self.text[self.pos] == "\n":
self.line += 1
self.col = 1
else:
self.col += 1
self.pos += 1
def _skip_whitespace_and_comments(self):
while self.pos < len(self.text):
m = self.WHITESPACE.match(self.text, self.pos)
if m:
self._advance(m.end() - self.pos)
continue
m = self.COMMENT.match(self.text, self.pos)
if m:
self._advance(m.end() - self.pos)
continue
break
def peek(self) -> str | None:
self._skip_whitespace_and_comments()
if self.pos >= len(self.text):
return None
return self.text[self.pos]
def next_token(self) -> Any:
self._skip_whitespace_and_comments()
if self.pos >= len(self.text):
return None
char = self.text[self.pos]
# Delimiters
if char in "()[]{}":
self._advance()
return char
# String
if char == '"':
m = self.STRING.match(self.text, self.pos)
if not m:
raise ParseError("Unterminated string", self.pos, self.line, self.col)
self._advance(m.end() - self.pos)
content = m.group()[1:-1]
content = _unescape_string(content)
return content
# Keyword
if char == ":":
m = self.KEYWORD.match(self.text, self.pos)
if m:
self._advance(m.end() - self.pos)
return Keyword(m.group()[1:])
raise ParseError("Invalid keyword", self.pos, self.line, self.col)
# Number (check before symbol because of leading -)
if char.isdigit() or (
char == "-"
and self.pos + 1 < len(self.text)
and (self.text[self.pos + 1].isdigit() or self.text[self.pos + 1] == ".")
):
m = self.NUMBER.match(self.text, self.pos)
if m:
self._advance(m.end() - self.pos)
num_str = m.group()
if "." in num_str or "e" in num_str or "E" in num_str:
return float(num_str)
return int(num_str)
# Ellipsis (... as a symbol, used in spec declarations)
if char == "." and self.text[self.pos:self.pos + 3] == "...":
self._advance(3)
return Symbol("...")
# Symbol
m = self.SYMBOL.match(self.text, self.pos)
if m:
self._advance(m.end() - self.pos)
name = m.group()
# Built-in literal symbols
if name == "true":
return True
if name == "false":
return False
if name == "nil":
return NIL
return Symbol(name)
# Reader macro dispatch: #
if char == "#":
return "#"
raise ParseError(f"Unexpected character: {char!r}", self.pos, self.line, self.col)
def _read_raw_string(self) -> str:
"""Read raw string literal until closing |."""
buf: list[str] = []
while self.pos < len(self.text):
ch = self.text[self.pos]
if ch == "|":
self._advance(1)
return "".join(buf)
buf.append(ch)
self._advance(1)
raise ParseError("Unterminated raw string", self.pos, self.line, self.col)
def _read_ident(self) -> str:
"""Read an identifier (for reader macro names)."""
import re
m = self.SYMBOL.match(self.text, self.pos)
if m:
self._advance(m.end() - self.pos)
return m.group()
raise ParseError("Expected identifier after #", self.pos, self.line, self.col)
# ---------------------------------------------------------------------------
# Parsing
# ---------------------------------------------------------------------------
def parse(text: str) -> Any:
    """Parse exactly one s-expression from *text*.

    Raises ParseError if anything other than whitespace/comments follows
    the first expression.

    >>> parse('(div :class "main" (p "hello"))')
    [Symbol('div'), Keyword('class'), 'main', [Symbol('p'), 'hello']]
    """
    tok = Tokenizer(text)
    expr = _parse_expr(tok)
    # Trailing content (anything peek() still sees) is an error here.
    if tok.peek() is not None:
        raise ParseError("Unexpected content after expression", tok.pos, tok.line, tok.col)
    return expr
def parse_all(text: str) -> list[Any]:
    """Parse zero or more s-expressions from *text* into a list."""
    tok = Tokenizer(text)
    out: list[Any] = []
    # peek() returns None once only whitespace/comments remain.
    while tok.peek() is not None:
        out.append(_parse_expr(tok))
    return out
def _parse_expr(tok: Tokenizer) -> Any:
    """Parse one expression from *tok*: a delimited form, quasiquote sugar,
    a ``#``-dispatched reader macro, or a single atom token.

    Raises ParseError at end of input or on a stray closing delimiter.
    """
    # Use peek() (raw character) for structural decisions so that string
    # values like ")" or "(" don't get confused with actual delimiters.
    raw = tok.peek()
    if raw is None:
        raise ParseError("Unexpected end of input", tok.pos, tok.line, tok.col)
    if raw in ")]}":
        tok.next_token()  # consume the delimiter
        raise ParseError(f"Unexpected {raw!r}", tok.pos, tok.line, tok.col)
    if raw == "(":
        tok.next_token()  # consume the '('
        return _parse_list(tok, ")")
    if raw == "[":
        tok.next_token()  # consume the '[' — vectors are sugar for lists
        return _parse_list(tok, "]")
    if raw == "{":
        tok.next_token()  # consume the '{'
        return _parse_map(tok)
    # Quasiquote syntax: ` , ,@
    if raw == "`":
        tok._advance(1)  # consume the backtick
        inner = _parse_expr(tok)
        return [Symbol("quasiquote"), inner]
    if raw == ",":
        tok._advance(1)  # consume the comma
        # Check for splice-unquote (,@) — no whitespace between , and @
        if tok.pos < len(tok.text) and tok.text[tok.pos] == "@":
            tok._advance(1)  # consume the @
            inner = _parse_expr(tok)
            return [Symbol("splice-unquote"), inner]
        inner = _parse_expr(tok)
        return [Symbol("unquote"), inner]
    # Reader macro dispatch: #
    if raw == "#":
        tok._advance(1)  # consume the # (next_token never consumes it)
        if tok.pos >= len(tok.text):
            raise ParseError("Unexpected end of input after #",
                             tok.pos, tok.line, tok.col)
        dispatch = tok.text[tok.pos]
        if dispatch == ";":
            # Datum comment: #; expr — read and throw away the next form.
            # NOTE(review): `#; x` immediately before a closing delimiter
            # raises "Unexpected ')'" instead of ending the list — confirm
            # whether that edge case is intended.
            tok._advance(1)
            _parse_expr(tok)  # read and discard
            return _parse_expr(tok)  # return next
        if dispatch == "|":
            # Raw string: #|...| — verbatim up to the first '|', no escapes.
            tok._advance(1)
            return tok._read_raw_string()
        if dispatch == "'":
            # Quote shorthand: #'expr → (quote expr)
            tok._advance(1)
            return [Symbol("quote"), _parse_expr(tok)]
        # Extensible dispatch: #name expr — handler transforms the next form.
        if dispatch.isalpha() or dispatch in "_~":
            macro_name = tok._read_ident()
            handler = _READER_MACROS.get(macro_name)
            if handler is None:
                raise ParseError(f"Unknown reader macro: #{macro_name}",
                                 tok.pos, tok.line, tok.col)
            return handler(_parse_expr(tok))
        raise ParseError(f"Unknown reader macro: #{dispatch}",
                         tok.pos, tok.line, tok.col)
    # Everything else: strings, keywords, symbols, numbers
    token = tok.next_token()
    return token
def _parse_list(tok: Tokenizer, closer: str) -> list[Any]:
    """Collect expressions until *closer* (``)`` or ``]``) is consumed."""
    items: list[Any] = []
    while True:
        ahead = tok.peek()
        if ahead is None:
            raise ParseError(f"Unterminated list, expected {closer!r}", tok.pos, tok.line, tok.col)
        if ahead == closer:
            tok.next_token()  # consume the closer
            return items
        items.append(_parse_expr(tok))
def _parse_map(tok: Tokenizer) -> dict[str, Any]:
    """Parse ``{:key val ...}`` into a plain dict.

    Keys must parse to Keyword or str; later duplicate keys overwrite
    earlier ones.
    """
    result: dict[str, Any] = {}
    while True:
        ahead = tok.peek()
        if ahead is None:
            raise ParseError("Unterminated map, expected '}'", tok.pos, tok.line, tok.col)
        if ahead == "}":
            tok.next_token()  # consume the '}'
            return result
        key_expr = _parse_expr(tok)
        if isinstance(key_expr, Keyword):
            key = key_expr.name
        elif isinstance(key_expr, str):
            key = key_expr
        else:
            raise ParseError(
                f"Map key must be keyword or string, got {type(key_expr).__name__}",
                tok.pos, tok.line, tok.col,
            )
        result[key] = _parse_expr(tok)
# ---------------------------------------------------------------------------
# Serialization
# ---------------------------------------------------------------------------
def serialize(expr: Any, indent: int = 0, pretty: bool = False) -> str:
    """Serialize a value back to s-expression text.

    Type dispatch order (first match wins):
    - ``SxExpr`` → emitted unquoted (pre-built sx source)
    - ``list``   → ``(head ...)`` (s-expression list)
    - ``Symbol`` → bare name
    - ``Keyword``→ ``:name``
    - ``str``    → ``"quoted"`` (with escapes)
    - ``bool``   → ``true`` / ``false``
    - ``int/float`` → numeric literal
    - ``None/NIL`` → ``nil``
    - ``dict``   → ``{:key val ...}``

    List serialization conventions (for ``sx_call`` kwargs):
    - ``(list ...)`` — data array: client gets iterable for map/filter
    - ``(<> ...)``   — rendered content: client treats as DocumentFragment
    - ``(head ...)`` — AST: head is called as function (never use for data)
    """
    if isinstance(expr, SxExpr):
        # Pre-built sx source passes through verbatim.
        return expr.source
    if isinstance(expr, list):
        if not expr:
            return "()"
        # Quasiquote sugar: [Symbol("quasiquote"), x] → `x
        if (len(expr) == 2 and isinstance(expr[0], Symbol)):
            name = expr[0].name
            if name == "quasiquote":
                return "`" + serialize(expr[1], indent, pretty)
            if name == "unquote":
                return "," + serialize(expr[1], indent, pretty)
            if name == "splice-unquote":
                return ",@" + serialize(expr[1], indent, pretty)
        if pretty:
            return _serialize_pretty(expr, indent)
        items = [serialize(item, indent, False) for item in expr]
        return "(" + " ".join(items) + ")"
    if isinstance(expr, Symbol):
        return expr.name
    if isinstance(expr, Keyword):
        return f":{expr.name}"
    if isinstance(expr, str):
        # Backslash must be escaped first so later escapes aren't doubled.
        escaped = (
            expr.replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
            .replace("\r", "\\r")
            .replace("\t", "\\t")
            .replace("\0", "\\0")
            .replace("</script", "<\\/script")
        )
        return f'"{escaped}"'
    # bool is checked before int/float: bool is an int subclass in Python.
    if isinstance(expr, bool):
        return "true" if expr else "false"
    if isinstance(expr, (int, float)):
        return str(expr)
    if expr is None or isinstance(expr, type(NIL)):
        return "nil"
    if isinstance(expr, dict):
        # Keys are emitted as :key keywords; values recurse with same flags.
        items: list[str] = []
        for k, v in expr.items():
            items.append(f":{k}")
            items.append(serialize(v, indent, pretty))
        return "{" + " ".join(items) + "}"
    # _RawHTML — pre-rendered HTML; wrap as (raw! "...") for SX wire format
    from .html import _RawHTML
    if isinstance(expr, _RawHTML):
        # NOTE(review): unlike the str branch above, tabs are not escaped
        # here — confirm whether that asymmetry is intentional.
        escaped = (
            expr.html.replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
            .replace("\r", "\\r")
            .replace("\0", "\\0")
            .replace("</script", "<\\/script")
        )
        return f'(raw! "{escaped}")'
    # Catch callables (Python functions leaked into sx data)
    if callable(expr):
        import logging
        logging.getLogger("sx").error(
            "serialize: callable leaked into sx data: %r", expr)
        return "nil"
    # Fallback for Lambda/Component — show repr
    return repr(expr)
def _serialize_pretty(expr: list, indent: int) -> str:
    """Render *expr* for humans: a single line when short, otherwise the
    head followed by one item per line, keeping ``:key value`` pairs
    together on the same line when the value is short."""
    if not expr:
        return "()"
    pad = " " * (indent + 1)
    # Short forms stay compact on one line.
    one_line = serialize(expr, indent, False)
    if len(one_line) < 72 and "\n" not in one_line:
        return one_line
    lines = ["(" + serialize(expr[0], indent + 1, False)]
    idx = 1
    total = len(expr)
    while idx < total:
        current = expr[idx]
        if isinstance(current, Keyword) and idx + 1 < total:
            # Keyword argument: keep key and value on one line if the
            # compact value fits, else pretty-print the value.
            key_txt = serialize(current, 0, False)
            flat_val = serialize(expr[idx + 1], indent + 1, False)
            if len(flat_val) < 50 and "\n" not in flat_val:
                lines.append(f"{pad}{key_txt} {flat_val}")
            else:
                pretty_val = serialize(expr[idx + 1], indent + 1, True)
                lines.append(f"{pad}{key_txt} {pretty_val}")
            idx += 2
        else:
            lines.append(f"{pad}{serialize(current, indent + 1, True)}")
            idx += 1
    return "\n".join(lines) + ")"