rose-ash/shared/sx/parser.py

"""
S-expression parser.

Supports:
- Lists:    (a b c)
- Vectors:  [a b c]  (sugar for lists)
- Maps:     {:key1 val1 :key2 val2}
- Symbols:  foo, bar-baz, ->, ~card
- Keywords: :class, :id
- Strings:  "hello world"  (with \\n, \\t, \\", \\\\ escapes)
- Numbers:  42, 3.14, -1.5, 1e-3
- Comments: ; to end of line
- Fragment: <>  (empty-tag symbol for fragment groups)
"""

from __future__ import annotations

import re
from typing import Any

from .types import Keyword, Symbol, NIL


# ---------------------------------------------------------------------------
# SxExpr — pre-built sx source marker
# ---------------------------------------------------------------------------

class SxExpr:
    """Marker wrapper for sx source text that is already serialized.

    ``serialize()`` emits the wrapped ``source`` verbatim instead of quoting
    it as a string literal, which lets one sx call be nested in another::

        sx_call("parent", child=SxExpr(sx_call("child", x=1)))
        # => (~parent :child (~child :x 1))

    Concatenating with ``+`` (on either side) stringifies the other operand
    and yields a new ``SxExpr``.
    """

    # Only one attribute is ever stored, so skip the per-instance __dict__.
    __slots__ = ("source",)

    def __init__(self, source: str):
        self.source = source

    def __str__(self) -> str:
        return self.source

    def __repr__(self) -> str:
        return "SxExpr({!r})".format(self.source)

    def __add__(self, other: object) -> "SxExpr":
        # self + other: our source comes first.
        joined = self.source + str(other)
        return SxExpr(joined)

    def __radd__(self, other: object) -> "SxExpr":
        # other + self: the foreign operand comes first.
        joined = str(other) + self.source
        return SxExpr(joined)


# ---------------------------------------------------------------------------
# Errors
# ---------------------------------------------------------------------------

class ParseError(Exception):
    """Raised when s-expression text cannot be parsed.

    The exception message is ``"<message> at line <line>, column <col>"``.
    The raw ``position``, ``line`` and ``col`` values passed in are also kept
    as attributes.
    """

    def __init__(self, message: str, position: int = 0, line: int = 1, col: int = 1):
        location = f"line {line}, column {col}"
        super().__init__(f"{message} at {location}")
        self.position, self.line, self.col = position, line, col


# ---------------------------------------------------------------------------
# Tokenizer
# ---------------------------------------------------------------------------

class Tokenizer:
    """Stateful tokenizer that walks an s-expression string."""

    WHITESPACE = re.compile(r"\s+")
    COMMENT = re.compile(r";[^\n]*")
    STRING = re.compile(r'"(?:[^"\\]|\\.)*"')
    NUMBER = re.compile(r"-?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?")
    KEYWORD = re.compile(r":[a-zA-Z_][a-zA-Z0-9_>:-]*")
    # Symbols may start with alpha, _, or common operator chars, plus ~ for components,
    # <> for the fragment symbol, and & for &key/&rest.
    SYMBOL = re.compile(r"[a-zA-Z_~*+\-><=/!?&][a-zA-Z0-9_~*+\-><=/!?.:&]*")

    def __init__(self, text: str):
        self.text = text
        self.pos = 0
        self.line = 1
        self.col = 1

    def _advance(self, count: int = 1):
        for _ in range(count):
            if self.pos < len(self.text):
                if self.text[self.pos] == "\n":
                    self.line += 1
                    self.col = 1
                else:
                    self.col += 1
                self.pos += 1

    def _skip_whitespace_and_comments(self):
        while self.pos < len(self.text):
            m = self.WHITESPACE.match(self.text, self.pos)
            if m:
                self._advance(m.end() - self.pos)
                continue
            m = self.COMMENT.match(self.text, self.pos)
            if m:
                self._advance(m.end() - self.pos)
                continue
            break

    def peek(self) -> str | None:
        self._skip_whitespace_and_comments()
        if self.pos >= len(self.text):
            return None
        return self.text[self.pos]

    def next_token(self) -> Any:
        self._skip_whitespace_and_comments()
        if self.pos >= len(self.text):
            return None

        char = self.text[self.pos]

        # Delimiters
        if char in "()[]{}":
            self._advance()
            return char

        # String
        if char == '"':
            m = self.STRING.match(self.text, self.pos)
            if not m:
                raise ParseError("Unterminated string", self.pos, self.line, self.col)
            self._advance(m.end() - self.pos)
            content = m.group()[1:-1]
            content = content.replace("\\n", "\n")
            content = content.replace("\\t", "\t")
            content = content.replace('\\"', '"')
            content = content.replace("\\/", "/")
            content = content.replace("\\\\", "\\")
            return content

        # Keyword
        if char == ":":
            m = self.KEYWORD.match(self.text, self.pos)
            if m:
                self._advance(m.end() - self.pos)
                return Keyword(m.group()[1:])
            raise ParseError("Invalid keyword", self.pos, self.line, self.col)

        # Number (check before symbol because of leading -)
        if char.isdigit() or (
            char == "-"
            and self.pos + 1 < len(self.text)
            and (self.text[self.pos + 1].isdigit() or self.text[self.pos + 1] == ".")
        ):
            m = self.NUMBER.match(self.text, self.pos)
            if m:
                self._advance(m.end() - self.pos)
                num_str = m.group()
                if "." in num_str or "e" in num_str or "E" in num_str:
                    return float(num_str)
                return int(num_str)

        # Symbol
        m = self.SYMBOL.match(self.text, self.pos)
        if m:
            self._advance(m.end() - self.pos)
            name = m.group()
            # Built-in literal symbols
            if name == "true":
                return True
            if name == "false":
                return False
            if name == "nil":
                return NIL
            return Symbol(name)

        raise ParseError(f"Unexpected character: {char!r}", self.pos, self.line, self.col)


# ---------------------------------------------------------------------------
# Parsing
# ---------------------------------------------------------------------------

def parse(text: str) -> Any:
    """Parse exactly one s-expression from *text* and return it.

    Raises ParseError if the input is empty or if anything other than
    whitespace/comments follows the first expression.

    >>> parse('(div :class "main" (p "hello"))')
    [Symbol('div'), Keyword('class'), 'main', [Symbol('p'), 'hello']]
    """
    tokenizer = Tokenizer(text)
    value = _parse_expr(tokenizer)
    # peek() skips trailing whitespace/comments; anything left is an error.
    if tokenizer.peek() is None:
        return value
    raise ParseError(
        "Unexpected content after expression",
        tokenizer.pos, tokenizer.line, tokenizer.col,
    )


def parse_all(text: str) -> list[Any]:
    """Parse every s-expression in *text* and return them in source order.

    Empty (or whitespace/comment-only) input yields an empty list.
    """
    tokenizer = Tokenizer(text)
    forms: list[Any] = []
    while True:
        # peek() returns None once only whitespace/comments remain.
        if tokenizer.peek() is None:
            return forms
        forms.append(_parse_expr(tokenizer))


def _parse_expr(tok: Tokenizer) -> Any:
    """Parse one expression starting at the tokenizer's current position.

    Handles lists ``(...)``, vectors ``[...]``, maps ``{...}``, the
    quasiquote reader macros (backtick, ``,`` and ``,@``) and atoms.

    Returns:
        A list for lists/vectors and reader macros (the latter as
        ``[Symbol(name), inner]``), a dict for maps, otherwise whatever
        ``tok.next_token()`` returns for the atom.

    Raises:
        ParseError: at end of input or on a stray closing delimiter; the
            error location is that of the offending character.
    """
    # Use peek() (raw character) for structural decisions so that string
    # values like ")" or "(" don't get confused with actual delimiters.
    raw = tok.peek()
    if raw is None:
        raise ParseError("Unexpected end of input", tok.pos, tok.line, tok.col)
    if raw in ")]}":
        # Capture the location *before* consuming so the error points at the
        # stray delimiter itself rather than at whatever follows it.
        pos, line, col = tok.pos, tok.line, tok.col
        tok.next_token()  # consume the delimiter
        raise ParseError(f"Unexpected {raw!r}", pos, line, col)
    if raw == "(":
        tok.next_token()  # consume the '('
        return _parse_list(tok, ")")
    if raw == "[":
        tok.next_token()  # consume the '['
        return _parse_list(tok, "]")
    if raw == "{":
        tok.next_token()  # consume the '{'
        return _parse_map(tok)
    # Quasiquote syntax: ` , ,@
    if raw == "`":
        tok._advance(1)  # consume the backtick
        inner = _parse_expr(tok)
        return [Symbol("quasiquote"), inner]
    if raw == ",":
        tok._advance(1)  # consume the comma
        # Check for splice-unquote (,@) — no whitespace between , and @
        if tok.pos < len(tok.text) and tok.text[tok.pos] == "@":
            tok._advance(1)  # consume the @
            inner = _parse_expr(tok)
            return [Symbol("splice-unquote"), inner]
        inner = _parse_expr(tok)
        return [Symbol("unquote"), inner]
    # Everything else: strings, keywords, symbols, numbers
    return tok.next_token()


def _parse_list(tok: Tokenizer, closer: str) -> list[Any]:
    """Collect expressions until *closer* is reached, then consume it.

    The opening delimiter must already have been consumed by the caller.
    Raises ParseError if the input ends before *closer* appears.
    """
    elements: list[Any] = []
    upcoming = tok.peek()
    while upcoming != closer:
        if upcoming is None:
            raise ParseError(f"Unterminated list, expected {closer!r}", tok.pos, tok.line, tok.col)
        elements.append(_parse_expr(tok))
        upcoming = tok.peek()
    tok.next_token()  # swallow the closing delimiter
    return elements


def _parse_map(tok: Tokenizer) -> dict[str, Any]:
    """Read key/value pairs until ``}`` and return them as a dict.

    The opening ``{`` must already have been consumed by the caller.  Keys
    may be keywords (stored under their name) or strings; a later duplicate
    key overwrites an earlier one.

    Raises ParseError if the input ends before ``}`` or if a key is neither
    a keyword nor a string.
    """
    mapping: dict[str, Any] = {}
    upcoming = tok.peek()
    while upcoming != "}":
        if upcoming is None:
            raise ParseError("Unterminated map, expected '}'", tok.pos, tok.line, tok.col)

        raw_key = _parse_expr(tok)
        if not isinstance(raw_key, (Keyword, str)):
            raise ParseError(
                f"Map key must be keyword or string, got {type(raw_key).__name__}",
                tok.pos, tok.line, tok.col,
            )
        name = raw_key.name if isinstance(raw_key, Keyword) else raw_key

        mapping[name] = _parse_expr(tok)
        upcoming = tok.peek()

    tok.next_token()  # swallow the closing '}'
    return mapping


# ---------------------------------------------------------------------------
# Serialization
# ---------------------------------------------------------------------------

# Matches the "</" that opens a closing script tag.  HTML compares tag names
# ASCII case-insensitively, so "</SCRIPT" must be caught as well as "</script".
_SCRIPT_CLOSE_RE = re.compile(r"</(?=script)", re.IGNORECASE)


def _escape_sx_string(text: str, escape_tabs: bool = True) -> str:
    """Escape *text* for use between double quotes in sx source.

    Backslashes are doubled first so that the escapes added afterwards are
    not themselves re-escaped.  ``</script`` (any letter case) becomes
    ``<\\/script`` so the literal closing-tag sequence never appears in the
    output; the tokenizer decodes ``\\/`` back to ``/``.
    """
    escaped = text.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
    if escape_tabs:
        escaped = escaped.replace("\t", "\\t")
    # In the replacement template "\\\\" denotes one literal backslash.
    return _SCRIPT_CLOSE_RE.sub(r"<\\/", escaped)


def _serialize_map_key(key: Any) -> str:
    """Render a dict key as ``:key``, or as a quoted string when it must be.

    A string key that is not a valid keyword name under the tokenizer's
    KEYWORD grammar (spaces, leading digit, empty, ...) cannot be read back
    as ``:key``, so it is written as a string literal, which map parsing
    also accepts.  Non-string keys keep the plain ``:key`` rendering.
    """
    if isinstance(key, str) and not Tokenizer.KEYWORD.fullmatch(f":{key}"):
        return f'"{_escape_sx_string(key)}"'
    return f":{key}"


def serialize(expr: Any, indent: int = 0, pretty: bool = False) -> str:
    """Serialize a value back to s-expression text.

    Args:
        expr: value to serialize.  Handled kinds, in order: ``SxExpr``
            (emitted verbatim), list, ``Symbol``, ``Keyword``, ``str``,
            ``bool``, ``int``/``float``, ``None``/``NIL``, ``dict``,
            ``StyleValue``, ``_RawHTML``, callables (logged, emitted as
            ``nil``); anything else falls back to ``repr(expr)``.
        indent: current nesting depth, used only for pretty output.
        pretty: when true, long lists are broken across indented lines.

    Returns:
        The sx source text for *expr*.
    """
    if isinstance(expr, SxExpr):
        return expr.source

    if isinstance(expr, list):
        if not expr:
            return "()"
        # Quasiquote sugar: [Symbol("quasiquote"), x] → `x
        if len(expr) == 2 and isinstance(expr[0], Symbol):
            name = expr[0].name
            if name == "quasiquote":
                return "`" + serialize(expr[1], indent, pretty)
            if name == "unquote":
                return "," + serialize(expr[1], indent, pretty)
            if name == "splice-unquote":
                return ",@" + serialize(expr[1], indent, pretty)
        if pretty:
            return _serialize_pretty(expr, indent)
        return "(" + " ".join(serialize(item, indent, False) for item in expr) + ")"

    if isinstance(expr, Symbol):
        return expr.name

    if isinstance(expr, Keyword):
        return f":{expr.name}"

    if isinstance(expr, str):
        return f'"{_escape_sx_string(expr)}"'

    # bool must be tested before int: bool is a subclass of int.
    if isinstance(expr, bool):
        return "true" if expr else "false"

    if isinstance(expr, (int, float)):
        return str(expr)

    if expr is None or isinstance(expr, type(NIL)):
        return "nil"

    if isinstance(expr, dict):
        items: list[str] = []
        for k, v in expr.items():
            items.append(_serialize_map_key(k))
            items.append(serialize(v, indent, pretty))
        return "{" + " ".join(items) + "}"

    # StyleValue — serialize as class name string
    from .types import StyleValue
    if isinstance(expr, StyleValue):
        return f'"{expr.class_name}"'

    # _RawHTML — pre-rendered HTML; wrap as (raw! "...") for SX wire format
    from .html import _RawHTML
    if isinstance(expr, _RawHTML):
        # Tabs are left raw here, as before; only \\, ", newline and
        # </script are escaped.
        return f'(raw! "{_escape_sx_string(expr.html, escape_tabs=False)}")'

    # Catch callables (Python functions leaked into sx data)
    if callable(expr):
        import logging
        logging.getLogger("sx").error(
            "serialize: callable leaked into sx data: %r", expr)
        return "nil"

    # Fallback for Lambda/Component — show repr
    return repr(expr)


def _serialize_pretty(expr: list, indent: int) -> str:
    """Render a list across multiple lines when it is too long for one.

    A list whose compact form is under 72 characters (and has no newline)
    is returned compact.  Otherwise the head goes on the first line and each
    following item on its own line, indented two spaces per level.  A
    keyword followed by a value shares a line with that value; the value is
    itself pretty-printed only when its compact form is 50+ characters or
    contains a newline.
    """
    if not expr:
        return "()"

    # Prefer the single-line form whenever it is short enough.
    one_line = serialize(expr, indent, False)
    if len(one_line) < 72 and "\n" not in one_line:
        return one_line

    child_indent = indent + 1
    pad = "  " * child_indent
    lines = ["(" + serialize(expr[0], child_indent, False)]

    total = len(expr)
    idx = 1
    while idx < total:
        current = expr[idx]
        has_value = idx + 1 < total

        if not (isinstance(current, Keyword) and has_value):
            # Positional child (or a trailing keyword with no value).
            lines.append(pad + serialize(current, child_indent, True))
            idx += 1
            continue

        # Keyword/value pair: keep both on the same line.
        label = serialize(current, 0, False)
        value = expr[idx + 1]
        rendered = serialize(value, child_indent, False)
        if len(rendered) >= 50 or "\n" in rendered:
            rendered = serialize(value, child_indent, True)
        lines.append(f"{pad}{label} {rendered}")
        idx += 2

    return "\n".join(lines) + ")"