""" HTML → sx source converter. Converts an HTML string to an equivalent s-expression source string so that raw HTML can be eliminated from the sx tree. html_to_sx('
Hello world
') # → '(p :class "intro" "Hello " (em "world"))' Uses only stdlib ``html.parser`` — no extra dependencies. """ from __future__ import annotations from html.parser import HTMLParser from .html import VOID_ELEMENTS, BOOLEAN_ATTRS def html_to_sx(html: str) -> str: """Convert an HTML string to sx source.""" if not html or not html.strip(): return '""' parser = _SxBuilder() parser.feed(html) nodes = parser.finish() if not nodes: return '""' if len(nodes) == 1: return _serialize(nodes[0]) return "(<> " + " ".join(_serialize(n) for n in nodes) + ")" # --------------------------------------------------------------------------- # Internal tree builder # --------------------------------------------------------------------------- class _TextNode: __slots__ = ("text",) def __init__(self, text: str): self.text = text class _ElementNode: __slots__ = ("tag", "attrs", "children") def __init__(self, tag: str, attrs: list[tuple[str, str | None]]): self.tag = tag self.attrs = attrs self.children: list[_TextNode | _ElementNode] = [] class _SxBuilder(HTMLParser): def __init__(self): super().__init__(convert_charrefs=True) self._roots: list[_TextNode | _ElementNode] = [] self._stack: list[_ElementNode] = [] def _append(self, node: _TextNode | _ElementNode): if self._stack: self._stack[-1].children.append(node) else: self._roots.append(node) def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]): node = _ElementNode(tag, attrs) self._append(node) if tag not in VOID_ELEMENTS: self._stack.append(node) def handle_endtag(self, tag: str): # Pop back to the matching open tag (tolerant of mismatches) for i in range(len(self._stack) - 1, -1, -1): if self._stack[i].tag == tag: self._stack[i + 1:] = [] self._stack.pop(i) return def handle_data(self, data: str): if data: self._append(_TextNode(data)) def handle_comment(self, data: str): pass # skip HTML comments def finish(self) -> list[_TextNode | _ElementNode]: # Strip whitespace-only text nodes at root level return [n for n in self._roots if not (isinstance(n, _TextNode) and not n.text.strip())] # --------------------------------------------------------------------------- # Serializer # --------------------------------------------------------------------------- def _esc(s: str) -> str: """Escape a string for sx double-quoted literals.""" return s.replace("\\", "\\\\").replace('"', '\\"') def _serialize(node: _TextNode | _ElementNode) -> str: if isinstance(node, _TextNode): return f'"{_esc(node.text)}"' parts = [node.tag] for name, value in node.attrs: if name in BOOLEAN_ATTRS: if value is None or value == "" or value == name: parts.append(f":{name} true") else: parts.append(f':{name} "{_esc(value)}"') elif value is None: # Attribute without value (non-boolean) — treat as boolean true parts.append(f":{name} true") else: parts.append(f':{name} "{_esc(value)}"') if node.tag in VOID_ELEMENTS: return "(" + " ".join(parts) + ")" if node.children: child_parts = [_serialize(c) for c in node.children] return "(" + " ".join(parts) + " " + " ".join(child_parts) + ")" return "(" + " ".join(parts) + ")"