diff --git a/blog/bp/blog/ghost/lexical_to_sx.py b/blog/bp/blog/ghost/lexical_to_sx.py index 84cc673..d13c317 100644 --- a/blog/bp/blog/ghost/lexical_to_sx.py +++ b/blog/bp/blog/ghost/lexical_to_sx.py @@ -16,6 +16,8 @@ from typing import Callable import mistune +from shared.sx.html_to_sx import html_to_sx + # --------------------------------------------------------------------------- # Registry @@ -249,7 +251,7 @@ def _image(node: dict) -> str: if alt: parts.append(f':alt "{_esc(alt)}"') if caption: - parts.append(f':caption "{_esc(caption)}"') + parts.append(f":caption {html_to_sx(caption)}") if width: parts.append(f':width "{_esc(width)}"') if href: @@ -273,20 +275,21 @@ def _gallery(node: dict) -> str: if img.get("alt"): item_parts.append(f'"alt" "{_esc(img["alt"])}"') if img.get("caption"): - item_parts.append(f'"caption" "{_esc(img["caption"])}"') + item_parts.append(f'"caption" {html_to_sx(img["caption"])}') row_items.append("(dict " + " ".join(item_parts) + ")") rows.append("(list " + " ".join(row_items) + ")") images_sx = "(list " + " ".join(rows) + ")" caption = node.get("caption", "") - caption_attr = f' :caption "{_esc(caption)}"' if caption else "" + caption_attr = f" :caption {html_to_sx(caption)}" if caption else "" return f"(~kg-gallery :images {images_sx}{caption_attr})" @_converter("html") def _html_card(node: dict) -> str: raw = node.get("html", "") - return f'(~kg-html :html "{_esc(raw)}")' + inner = html_to_sx(raw) + return f"(~kg-html {inner})" @_converter("embed") @@ -295,7 +298,7 @@ def _embed(node: dict) -> str: caption = node.get("caption", "") parts = [f':html "{_esc(embed_html)}"'] if caption: - parts.append(f':caption "{_esc(caption)}"') + parts.append(f":caption {html_to_sx(caption)}") return "(~kg-embed " + " ".join(parts) + ")" @@ -325,7 +328,7 @@ def _bookmark(node: dict) -> str: parts.append(f':thumbnail "{_esc(thumbnail)}"') caption = node.get("caption", "") if caption: - parts.append(f':caption "{_esc(caption)}"') + 
parts.append(f":caption {html_to_sx(caption)}") return "(~kg-bookmark " + " ".join(parts) + ")" @@ -390,7 +393,7 @@ def _video(node: dict) -> str: parts = [f':src "{_esc(src)}"'] if caption: - parts.append(f':caption "{_esc(caption)}"') + parts.append(f":caption {html_to_sx(caption)}") if width: parts.append(f':width "{_esc(width)}"') if thumbnail: @@ -425,7 +428,7 @@ def _file(node: dict) -> str: if size_str: parts.append(f':filesize "{size_str}"') if caption: - parts.append(f':caption "{_esc(caption)}"') + parts.append(f":caption {html_to_sx(caption)}") return "(~kg-file " + " ".join(parts) + ")" @@ -438,4 +441,5 @@ def _paywall(_node: dict) -> str: def _markdown(node: dict) -> str: md_text = node.get("markdown", "") rendered = mistune.html(md_text) - return f'(~kg-html :html "{_esc(rendered)}")' + inner = html_to_sx(rendered) + return f"(~kg-md {inner})" diff --git a/blog/scripts/migrate_sx_html.py b/blog/scripts/migrate_sx_html.py new file mode 100644 index 0000000..68aeb1a --- /dev/null +++ b/blog/scripts/migrate_sx_html.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Re-convert sx_content from lexical JSON to eliminate ~kg-html wrappers and +raw caption strings. + +The updated lexical_to_sx converter now produces native sx expressions instead +of (1) wrapping HTML/markdown cards in (~kg-html :html "...") and (2) storing +captions as escaped HTML strings. This script re-runs the conversion on all +posts with lexical content (with or without sx_content), overwriting old output.
+ +Usage: + cd blog && python3 scripts/migrate_sx_html.py [--dry-run] +""" +from __future__ import annotations + +import argparse +import asyncio +import sys + +from sqlalchemy import select, and_ + + +async def migrate(dry_run: bool = False) -> int: + from shared.db.session import get_session + from models.ghost_content import Post + from bp.blog.ghost.lexical_to_sx import lexical_to_sx + + converted = 0 + skipped = 0 + errors = 0 + + async with get_session() as sess: + # All posts with lexical content (whether or not sx_content exists) + stmt = select(Post).where( + and_( + Post.lexical.isnot(None), + Post.lexical != "", + ) + ) + result = await sess.execute(stmt) + posts = result.scalars().all() + + print(f"Found {len(posts)} posts with lexical content") + + for post in posts: + try: + new_sx = lexical_to_sx(post.lexical) + if post.sx_content == new_sx: + skipped += 1 + continue + + if dry_run: + old_has_kg = "~kg-html" in (post.sx_content or "") + old_has_raw = "raw! caption" in (post.sx_content or "") + markers = [] + if old_has_kg: + markers.append("~kg-html") + if old_has_raw: + markers.append("raw-caption") + tag = f" [{', '.join(markers)}]" if markers else "" + print(f" [DRY RUN] {post.slug}: {len(new_sx)} chars{tag}") + else: + post.sx_content = new_sx + print(f" Converted: {post.slug} ({len(new_sx)} chars)") + converted += 1 + except Exception as e: + print(f" ERROR: {post.slug}: {e}", file=sys.stderr) + errors += 1 + + if not dry_run and converted: + await sess.commit() + + print(f"\nDone: {converted} converted, {skipped} unchanged, {errors} errors") + return converted + + +def main(): + parser = argparse.ArgumentParser( + description="Re-convert sx_content to eliminate ~kg-html and raw captions" + ) + parser.add_argument("--dry-run", action="store_true", + help="Preview changes without writing to database") + args = parser.parse_args() + + asyncio.run(migrate(dry_run=args.dry_run)) + + +if __name__ == "__main__": + main() diff --git 
a/blog/sx/kg_cards.sx b/blog/sx/kg_cards.sx index b55a929..56eab5c 100644 --- a/blog/sx/kg_cards.sx +++ b/blog/sx/kg_cards.sx @@ -2,7 +2,7 @@ ;; Produces same HTML structure as lexical_renderer.py so cards.css works unchanged. ;; Used by both display pipeline and block editor. -;; @css kg-card kg-image-card kg-width-wide kg-width-full kg-gallery-card kg-gallery-container kg-gallery-row kg-gallery-image kg-embed-card kg-bookmark-card kg-bookmark-container kg-bookmark-content kg-bookmark-title kg-bookmark-description kg-bookmark-metadata kg-bookmark-icon kg-bookmark-author kg-bookmark-publisher kg-bookmark-thumbnail kg-callout-card kg-callout-emoji kg-callout-text kg-button-card kg-btn kg-btn-accent kg-toggle-card kg-toggle-heading kg-toggle-heading-text kg-toggle-card-icon kg-toggle-content kg-audio-card kg-audio-thumbnail kg-audio-player-container kg-audio-title kg-audio-player kg-audio-play-icon kg-audio-current-time kg-audio-time kg-audio-seek-slider kg-audio-playback-rate kg-audio-unmute-icon kg-audio-volume-slider kg-video-card kg-video-container kg-file-card kg-file-card-container kg-file-card-contents kg-file-card-title kg-file-card-filesize kg-file-card-icon kg-file-card-caption kg-align-center kg-align-left kg-callout-card-grey kg-callout-card-white kg-callout-card-blue kg-callout-card-green kg-callout-card-yellow kg-callout-card-red kg-callout-card-pink kg-callout-card-purple kg-callout-card-accent placeholder +;; @css kg-card kg-image-card kg-width-wide kg-width-full kg-gallery-card kg-gallery-container kg-gallery-row kg-gallery-image kg-embed-card kg-bookmark-card kg-bookmark-container kg-bookmark-content kg-bookmark-title kg-bookmark-description kg-bookmark-metadata kg-bookmark-icon kg-bookmark-author kg-bookmark-publisher kg-bookmark-thumbnail kg-callout-card kg-callout-emoji kg-callout-text kg-button-card kg-btn kg-btn-accent kg-toggle-card kg-toggle-heading kg-toggle-heading-text kg-toggle-card-icon kg-toggle-content kg-audio-card kg-audio-thumbnail 
kg-audio-player-container kg-audio-title kg-audio-player kg-audio-play-icon kg-audio-current-time kg-audio-time kg-audio-seek-slider kg-audio-playback-rate kg-audio-unmute-icon kg-audio-volume-slider kg-video-card kg-video-container kg-file-card kg-file-card-container kg-file-card-contents kg-file-card-title kg-file-card-filesize kg-file-card-icon kg-file-card-caption kg-align-center kg-align-left kg-callout-card-grey kg-callout-card-white kg-callout-card-blue kg-callout-card-green kg-callout-card-yellow kg-callout-card-red kg-callout-card-pink kg-callout-card-purple kg-callout-card-accent kg-html-card kg-md-card placeholder ;; --------------------------------------------------------------------------- ;; Image card @@ -33,10 +33,17 @@ (when caption (figcaption caption)))) ;; --------------------------------------------------------------------------- -;; HTML card (raw HTML injection) +;; HTML card — wraps user-pasted HTML so the editor can identify the block. +;; Content is native sx children (no longer an opaque HTML string). ;; --------------------------------------------------------------------------- -(defcomp ~kg-html (&key html) - (~rich-text :html html)) +(defcomp ~kg-html (&rest children) + (div :class "kg-card kg-html-card" children)) + +;; --------------------------------------------------------------------------- +;; Markdown card — rendered markdown content, editor can identify the block. +;; --------------------------------------------------------------------------- +(defcomp ~kg-md (&rest children) + (div :class "kg-card kg-md-card" children)) ;; --------------------------------------------------------------------------- ;; Embed card diff --git a/blog/tests/test_lexical_to_sx.py b/blog/tests/test_lexical_to_sx.py index 973b0e7..ce3f2c6 100644 --- a/blog/tests/test_lexical_to_sx.py +++ b/blog/tests/test_lexical_to_sx.py @@ -5,8 +5,9 @@ import sys import os import pytest -# The lexical_to_sx module is standalone (only depends on json). 
-# Import it directly to avoid pulling in the full blog app. +# Add project root so shared.sx.html_to_sx resolves, plus the ghost dir. +_project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) +sys.path.insert(0, _project_root) sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "bp", "blog", "ghost")) from lexical_to_sx import lexical_to_sx @@ -176,6 +177,13 @@ class TestCards: assert ':width "wide"' in result assert ':caption "Fig 1"' in result + def test_image_html_caption(self): + result = lexical_to_sx(_doc({ + "type": "image", "src": "p.jpg", "alt": "", + "caption": 'Photo by Author' + })) + assert ':caption (<> "Photo by " (a :href "https://x.com" "Author"))' in result + def test_bookmark(self): result = lexical_to_sx(_doc({ "type": "bookmark", "url": "https://example.com", @@ -214,7 +222,7 @@ class TestCards: result = lexical_to_sx(_doc({ "type": "html", "html": "
custom
" })) - assert "(~kg-html " in result + assert result == '(~kg-html (div "custom"))' def test_embed(self): result = lexical_to_sx(_doc({ @@ -224,6 +232,14 @@ class TestCards: assert "(~kg-embed " in result assert ':caption "Video"' in result + def test_markdown(self): + result = lexical_to_sx(_doc({ + "type": "markdown", "markdown": "**bold** text" + })) + assert result.startswith("(~kg-md ") + assert "(p " in result + assert "(strong " in result + def test_video(self): result = lexical_to_sx(_doc({ "type": "video", "src": "v.mp4", "cardWidth": "wide" diff --git a/shared/static/scripts/sx-editor.js b/shared/static/scripts/sx-editor.js index 1d5254c..a67c01b 100644 --- a/shared/static/scripts/sx-editor.js +++ b/shared/static/scripts/sx-editor.js @@ -98,6 +98,106 @@ return d.innerHTML; } + // Void elements that have no closing tag + var VOID_ELEMENTS = { + area:1, base:1, br:1, col:1, embed:1, hr:1, img:1, input:1, + link:1, meta:1, param:1, source:1, track:1, wbr:1 + }; + // Boolean HTML attributes + var BOOLEAN_ATTRS = { + async:1, autofocus:1, autoplay:1, checked:1, controls:1, + default:1, defer:1, disabled:1, formnovalidate:1, hidden:1, + inert:1, ismap:1, loop:1, multiple:1, muted:1, nomodule:1, + novalidate:1, open:1, playsinline:1, readonly:1, required:1, + reversed:1, selected:1 + }; + + /** + * Convert an HTML string to sx source using the browser's DOM parser. 
+ */ + function htmlToSx(html) { + if (!html || !html.trim()) return '""'; + var doc = new DOMParser().parseFromString("" + html + "", "text/html"); + var body = doc.body; + // Collect non-whitespace-only root nodes + var roots = []; + for (var i = 0; i < body.childNodes.length; i++) { + var n = body.childNodes[i]; + if (n.nodeType === 3 && !n.textContent.trim()) continue; // skip ws-only text at root + roots.push(n); + } + if (!roots.length) return '""'; + if (roots.length === 1) return nodeToSx(roots[0]); + var parts = []; + for (var i = 0; i < roots.length; i++) parts.push(nodeToSx(roots[i])); + return "(<> " + parts.join(" ") + ")"; + } + + function nodeToSx(node) { + if (node.nodeType === 3) { + return '"' + escSx(node.textContent) + '"'; + } + if (node.nodeType === 8) return ""; // comment + if (node.nodeType !== 1) return ""; + var tag = node.tagName.toLowerCase(); + var parts = [tag]; + // Attributes + for (var i = 0; i < node.attributes.length; i++) { + var a = node.attributes[i]; + if (BOOLEAN_ATTRS[a.name]) { + parts.push(":" + a.name + " true"); + } else { + parts.push(':' + a.name + ' "' + escSx(a.value) + '"'); + } + } + if (VOID_ELEMENTS[tag]) return "(" + parts.join(" ") + ")"; + // Children + var children = []; + for (var i = 0; i < node.childNodes.length; i++) { + var s = nodeToSx(node.childNodes[i]); + if (s) children.push(s); + } + if (children.length) return "(" + parts.join(" ") + " " + children.join(" ") + ")"; + return "(" + parts.join(" ") + ")"; + } + + /** + * Render sx children expressions back to an HTML string (for the HTML card textarea). 
+ */ + function sxChildrenToHtml(childrenSx) { + if (!childrenSx || !childrenSx.trim()) return ""; + try { + var rendered = Sx.render(childrenSx); + if (rendered instanceof Node) { + var div = document.createElement("div"); + div.appendChild(rendered); + return div.innerHTML; + } + return String(rendered); + } catch (e) { + return childrenSx; + } + } + + /** + * Serialize a parsed sx expression (from Sx.parse) back to sx source string. + * Used to capture children of ~kg-html/~kg-md cards from the parsed tree. + */ + function serializeExpr(expr) { + if (typeof expr === "string") return '"' + escSx(expr) + '"'; + if (expr && expr.constructor === Sx.Keyword) return ":" + expr.name; + if (expr && expr.name !== undefined && expr.constructor !== Sx.Keyword) return expr.name; + if (Array.isArray(expr)) { + var parts = []; + for (var i = 0; i < expr.length; i++) parts.push(serializeExpr(expr[i])); + return "(" + parts.join(" ") + ")"; + } + if (expr === true) return "true"; + if (expr === false) return "false"; + if (expr === null || expr === undefined) return "nil"; + return String(expr); + } + function closestBlock(node, container) { while (node && node !== container) { if (node.hasAttribute && node.hasAttribute("data-sx-block")) return node; @@ -223,28 +323,47 @@ return "(<>\n " + parts.join("\n ") + ")"; } + // Cards whose children are positional sx args (not kwargs) + var CHILDREN_CARDS = { "kg-html": true, "kg-md": true }; + function serializeCard(block) { var cardType = block.getAttribute("data-sx-card"); var attrsJson = block.getAttribute("data-sx-attrs") || "{}"; var attrs; try { attrs = JSON.parse(attrsJson); } catch (e) { attrs = {}; } + // Serialize caption as inline sx from the contenteditable var captionEl = block.querySelector("[data-sx-caption]"); if (captionEl) { - var captionText = captionEl.textContent.trim(); - if (captionText) attrs.caption = captionText; - else delete attrs.caption; + var captionHtml = captionEl.innerHTML.trim(); + if (captionHtml && 
captionEl.textContent.trim()) { + attrs.caption = serializeInline(captionEl); + } else { + delete attrs.caption; + } } var parts = ["(~" + cardType]; for (var k in attrs) { if (attrs[k] === null || attrs[k] === undefined || attrs[k] === false) continue; + if (k === "caption") { + // Caption is already an sx expression string + parts.push(":caption " + attrs[k]); + continue; + } + if (k === "_childrenSx") continue; // handled below if (attrs[k] === true) { parts.push(":" + k + " true"); } else { parts.push(':' + k + ' "' + escSx(String(attrs[k])) + '"'); } } + + // Append children for cards like ~kg-html, ~kg-md + if (CHILDREN_CARDS[cardType] && attrs._childrenSx) { + parts.push(attrs._childrenSx); + } + parts.push(")"); return parts.join(" "); } @@ -305,7 +424,21 @@ } if (tag.charAt(0) === "~") { var cardType = tag.slice(1); - var attrs = extractKwargs(expr.slice(1)); + var args = expr.slice(1); + var attrs = extractKwargs(args); + // For children-based cards, capture positional children as sx source + if (CHILDREN_CARDS[cardType]) { + var children = extractChildren(args); + if (children.length) { + var childParts = []; + for (var ci = 0; ci < children.length; ci++) childParts.push(serializeExpr(children[ci])); + attrs._childrenSx = childParts.join(" "); + } + } + // Convert caption from parsed sx expression back to sx source string + if (attrs.caption !== undefined && attrs.caption !== null) { + attrs.caption = serializeExpr(attrs.caption); + } return createCardBlock(cardType, attrs); } return null; @@ -535,7 +668,7 @@ // Add caption for applicable card types var captionTypes = { "kg-image": true, "kg-gallery": true, "kg-embed": true, - "kg-bookmark": true, "kg-video": true + "kg-bookmark": true, "kg-video": true, "kg-file": true }; if (captionTypes[cardType]) { var captionEl = el("div", { @@ -544,7 +677,10 @@ "data-sx-caption": "true", "data-placeholder": "Type caption for image (optional)" }); - if (attrs.caption) captionEl.textContent = attrs.caption; + if 
(attrs.caption) { + // Caption is an sx expression string — render to HTML for the contenteditable + captionEl.innerHTML = sxChildrenToHtml(attrs.caption); + } wrapper.appendChild(captionEl); } @@ -566,12 +702,22 @@ var parts = ["(~" + cardType]; for (var k in attrs) { if (attrs[k] === null || attrs[k] === undefined || attrs[k] === false || attrs[k] === "") continue; + if (k === "caption") { + // Caption is an sx expression, not a quoted string + parts.push(":caption " + attrs[k]); + continue; + } + if (k === "_childrenSx") continue; // handled below if (attrs[k] === true) { parts.push(":" + k + " true"); } else { parts.push(':' + k + ' "' + escSx(String(attrs[k])) + '"'); } } + // Append children for ~kg-html, ~kg-md + if (CHILDREN_CARDS[cardType] && attrs._childrenSx) { + parts.push(attrs._childrenSx); + } parts.push(")"); return parts.join(" "); } @@ -602,6 +748,7 @@ case "kg-image": buildImageEditUI(attrs, editPanel, wrapper); break; case "kg-gallery": buildGalleryEditUI(attrs, editPanel, wrapper); break; case "kg-html": buildHtmlEditUI(attrs, editPanel, wrapper); break; + case "kg-md": buildMarkdownEditUI(attrs, editPanel, wrapper); break; case "kg-embed": buildEmbedEditUI(attrs, editPanel, wrapper); break; case "kg-bookmark": buildBookmarkEditUI(attrs, editPanel, wrapper); break; case "kg-callout": buildCalloutEditUI(attrs, editPanel, wrapper); break; @@ -717,11 +864,13 @@ // -- HTML card edit UI -- function buildHtmlEditUI(attrs, panel, wrapper) { + // Show HTML in textarea; convert HTML↔sx children for storage + var currentHtml = attrs._childrenSx ? 
sxChildrenToHtml(attrs._childrenSx) : ""; var textarea = el("textarea", { className: "sx-edit-html-textarea", placeholder: "Paste HTML here...", spellcheck: "false" - }, attrs.html || ""); + }, currentHtml); function autoResize() { textarea.style.height = "auto"; @@ -729,7 +878,31 @@ } textarea.addEventListener("input", function () { - attrs.html = textarea.value; + attrs._childrenSx = htmlToSx(textarea.value); + updateCardAttrs(wrapper, attrs); + autoResize(); + }); + setTimeout(autoResize, 0); + + panel.appendChild(textarea); + } + + // -- Markdown card edit UI (read-only rendered view, edit as HTML) -- + function buildMarkdownEditUI(attrs, panel, wrapper) { + var currentHtml = attrs._childrenSx ? sxChildrenToHtml(attrs._childrenSx) : ""; + var textarea = el("textarea", { + className: "sx-edit-html-textarea", + placeholder: "Markdown content (as HTML)...", + spellcheck: "false" + }, currentHtml); + + function autoResize() { + textarea.style.height = "auto"; + textarea.style.height = Math.max(120, textarea.scrollHeight) + "px"; + } + + textarea.addEventListener("input", function () { + attrs._childrenSx = htmlToSx(textarea.value); updateCardAttrs(wrapper, attrs); autoResize(); }); @@ -1613,7 +1786,7 @@ block.querySelector(".sx-card-preview").click(); return; } else if (type === "html") { - block = createCardBlock("kg-html", { html: "" }); + block = createCardBlock("kg-html", { _childrenSx: '""' }); insertBlockNode(editor, block, refBlock); block.querySelector(".sx-card-preview").click(); return; diff --git a/shared/sx/html_to_sx.py b/shared/sx/html_to_sx.py new file mode 100644 index 0000000..73a0c15 --- /dev/null +++ b/shared/sx/html_to_sx.py @@ -0,0 +1,122 @@ +""" +HTML → sx source converter. + +Converts an HTML string to an equivalent s-expression source string so that +raw HTML can be eliminated from the sx tree. + + html_to_sx('

Hello world

') + # → '(p :class "intro" "Hello " (em "world"))' + +Uses only stdlib ``html.parser`` — no extra dependencies. +""" +from __future__ import annotations + +from html.parser import HTMLParser + +from .html import VOID_ELEMENTS, BOOLEAN_ATTRS + + +def html_to_sx(html: str) -> str: + """Convert an HTML string to sx source.""" + if not html or not html.strip(): + return '""' + parser = _SxBuilder() + parser.feed(html) + nodes = parser.finish() + if not nodes: + return '""' + if len(nodes) == 1: + return _serialize(nodes[0]) + return "(<> " + " ".join(_serialize(n) for n in nodes) + ")" + + +# --------------------------------------------------------------------------- +# Internal tree builder +# --------------------------------------------------------------------------- + +class _TextNode: + __slots__ = ("text",) + def __init__(self, text: str): + self.text = text + +class _ElementNode: + __slots__ = ("tag", "attrs", "children") + def __init__(self, tag: str, attrs: list[tuple[str, str | None]]): + self.tag = tag + self.attrs = attrs + self.children: list[_TextNode | _ElementNode] = [] + + +class _SxBuilder(HTMLParser): + def __init__(self): + super().__init__(convert_charrefs=True) + self._roots: list[_TextNode | _ElementNode] = [] + self._stack: list[_ElementNode] = [] + + def _append(self, node: _TextNode | _ElementNode): + if self._stack: + self._stack[-1].children.append(node) + else: + self._roots.append(node) + + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]): + node = _ElementNode(tag, attrs) + self._append(node) + if tag not in VOID_ELEMENTS: + self._stack.append(node) + + def handle_endtag(self, tag: str): + # Pop back to the matching open tag (tolerant of mismatches) + for i in range(len(self._stack) - 1, -1, -1): + if self._stack[i].tag == tag: + self._stack[i + 1:] = [] + self._stack.pop(i) + return + + def handle_data(self, data: str): + if data: + self._append(_TextNode(data)) + + def handle_comment(self, data: str): + pass # 
skip HTML comments + + def finish(self) -> list[_TextNode | _ElementNode]: + # Strip whitespace-only text nodes at root level + return [n for n in self._roots + if not (isinstance(n, _TextNode) and not n.text.strip())] + + +# --------------------------------------------------------------------------- +# Serializer +# --------------------------------------------------------------------------- + +def _esc(s: str) -> str: + """Escape a string for sx double-quoted literals.""" + return s.replace("\\", "\\\\").replace('"', '\\"') + + +def _serialize(node: _TextNode | _ElementNode) -> str: + if isinstance(node, _TextNode): + return f'"{_esc(node.text)}"' + + parts = [node.tag] + for name, value in node.attrs: + if name in BOOLEAN_ATTRS: + if value is None or value == "" or value == name: + parts.append(f":{name} true") + else: + parts.append(f':{name} "{_esc(value)}"') + elif value is None: + # Attribute without value (non-boolean) — treat as boolean true + parts.append(f":{name} true") + else: + parts.append(f':{name} "{_esc(value)}"') + + if node.tag in VOID_ELEMENTS: + return "(" + " ".join(parts) + ")" + + if node.children: + child_parts = [_serialize(c) for c in node.children] + return "(" + " ".join(parts) + " " + " ".join(child_parts) + ")" + + return "(" + " ".join(parts) + ")" diff --git a/shared/sx/tests/test_html_to_sx.py b/shared/sx/tests/test_html_to_sx.py new file mode 100644 index 0000000..504f35c --- /dev/null +++ b/shared/sx/tests/test_html_to_sx.py @@ -0,0 +1,159 @@ +"""Unit tests for html_to_sx converter.""" +from __future__ import annotations + +import pytest + +from shared.sx.html_to_sx import html_to_sx + + +class TestBasicElements: + def test_simple_paragraph(self): + assert html_to_sx("

Hello

") == '(p "Hello")' + + def test_with_class(self): + assert html_to_sx('

Hi

') == '(p :class "intro" "Hi")' + + def test_multiple_attrs(self): + result = html_to_sx('click') + assert result == '(a :href "u" :class "link" "click")' + + def test_nested_inline(self): + result = html_to_sx("

Hello world

") + assert result == '(p "Hello " (em "world"))' + + def test_deeply_nested(self): + result = html_to_sx("

bold

") + assert result == '(div (p (strong "bold")))' + + +class TestVoidElements: + def test_br(self): + assert html_to_sx("
") == "(br)" + + def test_img(self): + result = html_to_sx('pic') + assert result == '(img :src "a.jpg" :alt "pic")' + + def test_hr(self): + assert html_to_sx("
") == "(hr)" + + def test_input(self): + result = html_to_sx('') + assert result == '(input :type "text" :value "hi")' + + +class TestBooleanAttrs: + def test_checked(self): + result = html_to_sx('') + assert result == '(input :type "checkbox" :checked true)' + + def test_disabled(self): + result = html_to_sx('') + assert result == '(button :disabled true "No")' + + def test_controls(self): + result = html_to_sx('') + assert result == '(video :controls true)' + + +class TestTopLevel: + def test_multiple_top_level(self): + result = html_to_sx("

A

B

") + assert result == '(<> (p "A") (p "B"))' + + def test_single_top_level(self): + result = html_to_sx("

Only

") + assert result == '(p "Only")' + + def test_text_only(self): + result = html_to_sx("just text") + assert result == '"just text"' + + def test_empty(self): + assert html_to_sx("") == '""' + + def test_whitespace_only(self): + assert html_to_sx(" \n ") == '""' + + +class TestWhitespace: + def test_root_whitespace_stripped(self): + result = html_to_sx("\n

A

\n

B

\n") + assert result == '(<> (p "A") (p "B"))' + + +class TestEntities: + def test_amp(self): + result = html_to_sx("

A & B

") + assert result == '(p "A & B")' + + def test_lt_gt(self): + result = html_to_sx("

<tag>

") + assert result == '(p "")' + + def test_nbsp(self): + result = html_to_sx("

hello world

") + assert result == '(p "hello\u00a0world")' + + +class TestEscaping: + def test_quotes_in_text(self): + result = html_to_sx('

He said "hello"

') + assert result == '(p "He said \\"hello\\"")' + + def test_backslash_in_text(self): + result = html_to_sx("

a\\b

") + assert result == '(p "a\\\\b")' + + def test_quotes_in_attr(self): + # Attribute values with quotes get escaped + result = html_to_sx('
x
') + assert result == '(div :title "a\\"b" "x")' + + +class TestComments: + def test_comment_stripped(self): + result = html_to_sx("

hi

") + assert result == '(p "hi")' + + +class TestMixedContent: + def test_caption_with_link(self): + result = html_to_sx('Photo by Author') + assert result == '(<> "Photo by " (a :href "https://x.com" "Author"))' + + def test_caption_plain_text(self): + result = html_to_sx("Figure 1") + assert result == '"Figure 1"' + + +class TestRoundtrip: + """html_to_sx → parse → render should produce equivalent HTML.""" + + def _roundtrip(self, html_in: str) -> str: + from shared.sx.parser import parse + from shared.sx.html import render + sx_src = html_to_sx(html_in) + expr = parse(sx_src) + return render(expr) + + def test_simple(self): + assert self._roundtrip("

Hello

") == "

Hello

" + + def test_nested(self): + assert self._roundtrip("

A B C

") == "

A B C

" + + def test_void(self): + assert self._roundtrip('') == '' + + def test_link(self): + html = 'click' + assert self._roundtrip(html) == html + + def test_entities_roundtrip(self): + # Entities get decoded by parser, then re-escaped by render + assert self._roundtrip("

A & B

") == "

A & B

" + + def test_multi_block(self): + html = "

A

B

" + assert self._roundtrip(html) == html