diff --git a/blog/bp/blog/ghost/lexical_to_sx.py b/blog/bp/blog/ghost/lexical_to_sx.py index 84cc673..d13c317 100644 --- a/blog/bp/blog/ghost/lexical_to_sx.py +++ b/blog/bp/blog/ghost/lexical_to_sx.py @@ -16,6 +16,8 @@ from typing import Callable import mistune +from shared.sx.html_to_sx import html_to_sx + # --------------------------------------------------------------------------- # Registry @@ -249,7 +251,7 @@ def _image(node: dict) -> str: if alt: parts.append(f':alt "{_esc(alt)}"') if caption: - parts.append(f':caption "{_esc(caption)}"') + parts.append(f":caption {html_to_sx(caption)}") if width: parts.append(f':width "{_esc(width)}"') if href: @@ -273,20 +275,21 @@ def _gallery(node: dict) -> str: if img.get("alt"): item_parts.append(f'"alt" "{_esc(img["alt"])}"') if img.get("caption"): - item_parts.append(f'"caption" "{_esc(img["caption"])}"') + item_parts.append(f'"caption" {html_to_sx(img["caption"])}') row_items.append("(dict " + " ".join(item_parts) + ")") rows.append("(list " + " ".join(row_items) + ")") images_sx = "(list " + " ".join(rows) + ")" caption = node.get("caption", "") - caption_attr = f' :caption "{_esc(caption)}"' if caption else "" + caption_attr = f" :caption {html_to_sx(caption)}" if caption else "" return f"(~kg-gallery :images {images_sx}{caption_attr})" @_converter("html") def _html_card(node: dict) -> str: raw = node.get("html", "") - return f'(~kg-html :html "{_esc(raw)}")' + inner = html_to_sx(raw) + return f"(~kg-html {inner})" @_converter("embed") @@ -295,7 +298,7 @@ def _embed(node: dict) -> str: caption = node.get("caption", "") parts = [f':html "{_esc(embed_html)}"'] if caption: - parts.append(f':caption "{_esc(caption)}"') + parts.append(f":caption {html_to_sx(caption)}") return "(~kg-embed " + " ".join(parts) + ")" @@ -325,7 +328,7 @@ def _bookmark(node: dict) -> str: parts.append(f':thumbnail "{_esc(thumbnail)}"') caption = node.get("caption", "") if caption: - parts.append(f':caption "{_esc(caption)}"') + 
parts.append(f":caption {html_to_sx(caption)}") return "(~kg-bookmark " + " ".join(parts) + ")" @@ -390,7 +393,7 @@ def _video(node: dict) -> str: parts = [f':src "{_esc(src)}"'] if caption: - parts.append(f':caption "{_esc(caption)}"') + parts.append(f":caption {html_to_sx(caption)}") if width: parts.append(f':width "{_esc(width)}"') if thumbnail: @@ -425,7 +428,7 @@ def _file(node: dict) -> str: if size_str: parts.append(f':filesize "{size_str}"') if caption: - parts.append(f':caption "{_esc(caption)}"') + parts.append(f":caption {html_to_sx(caption)}") return "(~kg-file " + " ".join(parts) + ")" @@ -438,4 +441,5 @@ def _paywall(_node: dict) -> str: def _markdown(node: dict) -> str: md_text = node.get("markdown", "") rendered = mistune.html(md_text) - return f'(~kg-html :html "{_esc(rendered)}")' + inner = html_to_sx(rendered) + return f"(~kg-md {inner})" diff --git a/blog/scripts/migrate_sx_html.py b/blog/scripts/migrate_sx_html.py new file mode 100644 index 0000000..68aeb1a --- /dev/null +++ b/blog/scripts/migrate_sx_html.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Re-convert sx_content from lexical JSON to eliminate ~kg-html wrappers and +raw caption strings. + +The updated lexical_to_sx converter now produces native sx expressions instead +of (1) wrapping HTML/markdown cards in (~kg-html :html "...") and (2) storing +captions as escaped HTML strings. This script re-runs the conversion on every +post with lexical content (with or without sx_content), overwriting old output. 
async def migrate(dry_run: bool = False) -> int:
    """Re-run ``lexical_to_sx`` over stored posts and persist changed output.

    Every ``Post`` whose ``lexical`` column is neither NULL nor the empty
    string is converted again and the result compared with the stored
    ``sx_content``. Posts whose output is identical are counted as unchanged.
    A failure on a single post is printed to stderr and counted; the loop
    carries on with the next post.

    Args:
        dry_run: When true, only print what would change. No post is
            modified and no commit is issued.

    Returns:
        The number of posts whose fresh conversion differed from the stored
        ``sx_content`` (written in a real run, only reported in a dry run).
    """
    from shared.db.session import get_session
    from models.ghost_content import Post
    from bp.blog.ghost.lexical_to_sx import lexical_to_sx

    changed = 0
    unchanged = 0
    failed = 0

    async with get_session() as sess:
        # All posts with lexical content (whether or not sx_content exists).
        query = select(Post).where(
            and_(
                Post.lexical.isnot(None),
                Post.lexical != "",
            )
        )
        posts = (await sess.execute(query)).scalars().all()

        print(f"Found {len(posts)} posts with lexical content")

        for post in posts:
            try:
                new_sx = lexical_to_sx(post.lexical)
                if post.sx_content == new_sx:
                    unchanged += 1
                    continue

                if dry_run:
                    # Show which legacy constructs the stored output still
                    # contains, so the preview explains why it would change.
                    old_sx = post.sx_content or ""
                    markers = []
                    if "~kg-html" in old_sx:
                        markers.append("~kg-html")
                    if "raw! caption" in old_sx:
                        markers.append("raw-caption")
                    tag = f" [{', '.join(markers)}]" if markers else ""
                    print(f" [DRY RUN] {post.slug}: {len(new_sx)} chars{tag}")
                else:
                    post.sx_content = new_sx
                    print(f" Converted: {post.slug} ({len(new_sx)} chars)")
                changed += 1
            except Exception as e:
                # Best-effort batch job: report the post and keep going.
                print(f" ERROR: {post.slug}: {e}", file=sys.stderr)
                failed += 1

        # Nothing to flush in a dry run or when no post was modified.
        if not dry_run and changed:
            await sess.commit()

    # A dry run writes nothing, so do not claim the posts were converted.
    verb = "would be converted" if dry_run else "converted"
    print(f"\nDone: {changed} {verb}, {unchanged} unchanged, {failed} errors")
    return changed


def main():
    """Parse the command line and run the migration to completion."""
    parser = argparse.ArgumentParser(
        description="Re-convert sx_content to eliminate ~kg-html and raw captions"
    )
    parser.add_argument("--dry-run", action="store_true",
                        help="Preview changes without writing to database")
    args = parser.parse_args()

    asyncio.run(migrate(dry_run=args.dry_run))


if __name__ == "__main__":
    main()
a/blog/sx/kg_cards.sx b/blog/sx/kg_cards.sx index b55a929..56eab5c 100644 --- a/blog/sx/kg_cards.sx +++ b/blog/sx/kg_cards.sx @@ -2,7 +2,7 @@ ;; Produces same HTML structure as lexical_renderer.py so cards.css works unchanged. ;; Used by both display pipeline and block editor. -;; @css kg-card kg-image-card kg-width-wide kg-width-full kg-gallery-card kg-gallery-container kg-gallery-row kg-gallery-image kg-embed-card kg-bookmark-card kg-bookmark-container kg-bookmark-content kg-bookmark-title kg-bookmark-description kg-bookmark-metadata kg-bookmark-icon kg-bookmark-author kg-bookmark-publisher kg-bookmark-thumbnail kg-callout-card kg-callout-emoji kg-callout-text kg-button-card kg-btn kg-btn-accent kg-toggle-card kg-toggle-heading kg-toggle-heading-text kg-toggle-card-icon kg-toggle-content kg-audio-card kg-audio-thumbnail kg-audio-player-container kg-audio-title kg-audio-player kg-audio-play-icon kg-audio-current-time kg-audio-time kg-audio-seek-slider kg-audio-playback-rate kg-audio-unmute-icon kg-audio-volume-slider kg-video-card kg-video-container kg-file-card kg-file-card-container kg-file-card-contents kg-file-card-title kg-file-card-filesize kg-file-card-icon kg-file-card-caption kg-align-center kg-align-left kg-callout-card-grey kg-callout-card-white kg-callout-card-blue kg-callout-card-green kg-callout-card-yellow kg-callout-card-red kg-callout-card-pink kg-callout-card-purple kg-callout-card-accent placeholder +;; @css kg-card kg-image-card kg-width-wide kg-width-full kg-gallery-card kg-gallery-container kg-gallery-row kg-gallery-image kg-embed-card kg-bookmark-card kg-bookmark-container kg-bookmark-content kg-bookmark-title kg-bookmark-description kg-bookmark-metadata kg-bookmark-icon kg-bookmark-author kg-bookmark-publisher kg-bookmark-thumbnail kg-callout-card kg-callout-emoji kg-callout-text kg-button-card kg-btn kg-btn-accent kg-toggle-card kg-toggle-heading kg-toggle-heading-text kg-toggle-card-icon kg-toggle-content kg-audio-card kg-audio-thumbnail 
kg-audio-player-container kg-audio-title kg-audio-player kg-audio-play-icon kg-audio-current-time kg-audio-time kg-audio-seek-slider kg-audio-playback-rate kg-audio-unmute-icon kg-audio-volume-slider kg-video-card kg-video-container kg-file-card kg-file-card-container kg-file-card-contents kg-file-card-title kg-file-card-filesize kg-file-card-icon kg-file-card-caption kg-align-center kg-align-left kg-callout-card-grey kg-callout-card-white kg-callout-card-blue kg-callout-card-green kg-callout-card-yellow kg-callout-card-red kg-callout-card-pink kg-callout-card-purple kg-callout-card-accent kg-html-card kg-md-card placeholder ;; --------------------------------------------------------------------------- ;; Image card @@ -33,10 +33,17 @@ (when caption (figcaption caption)))) ;; --------------------------------------------------------------------------- -;; HTML card (raw HTML injection) +;; HTML card — wraps user-pasted HTML so the editor can identify the block. +;; Content is native sx children (no longer an opaque HTML string). ;; --------------------------------------------------------------------------- -(defcomp ~kg-html (&key html) - (~rich-text :html html)) +(defcomp ~kg-html (&rest children) + (div :class "kg-card kg-html-card" children)) + +;; --------------------------------------------------------------------------- +;; Markdown card — rendered markdown content, editor can identify the block. +;; --------------------------------------------------------------------------- +(defcomp ~kg-md (&rest children) + (div :class "kg-card kg-md-card" children)) ;; --------------------------------------------------------------------------- ;; Embed card diff --git a/blog/tests/test_lexical_to_sx.py b/blog/tests/test_lexical_to_sx.py index 973b0e7..ce3f2c6 100644 --- a/blog/tests/test_lexical_to_sx.py +++ b/blog/tests/test_lexical_to_sx.py @@ -5,8 +5,9 @@ import sys import os import pytest -# The lexical_to_sx module is standalone (only depends on json). 
-# Import it directly to avoid pulling in the full blog app. +# Add project root so shared.sx.html_to_sx resolves, plus the ghost dir. +_project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) +sys.path.insert(0, _project_root) sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "bp", "blog", "ghost")) from lexical_to_sx import lexical_to_sx @@ -176,6 +177,13 @@ class TestCards: assert ':width "wide"' in result assert ':caption "Fig 1"' in result + def test_image_html_caption(self): + result = lexical_to_sx(_doc({ + "type": "image", "src": "p.jpg", "alt": "", + "caption": 'Photo by Author' + })) + assert ':caption (<> "Photo by " (a :href "https://x.com" "Author"))' in result + def test_bookmark(self): result = lexical_to_sx(_doc({ "type": "bookmark", "url": "https://example.com", @@ -214,7 +222,7 @@ class TestCards: result = lexical_to_sx(_doc({ "type": "html", "html": "
Hello world
') + # → '(p :class "intro" "Hello " (em "world"))' + +Uses only stdlib ``html.parser`` — no extra dependencies. +""" +from __future__ import annotations + +from html.parser import HTMLParser + +from .html import VOID_ELEMENTS, BOOLEAN_ATTRS + + +def html_to_sx(html: str) -> str: + """Convert an HTML string to sx source.""" + if not html or not html.strip(): + return '""' + parser = _SxBuilder() + parser.feed(html) + nodes = parser.finish() + if not nodes: + return '""' + if len(nodes) == 1: + return _serialize(nodes[0]) + return "(<> " + " ".join(_serialize(n) for n in nodes) + ")" + + +# --------------------------------------------------------------------------- +# Internal tree builder +# --------------------------------------------------------------------------- + +class _TextNode: + __slots__ = ("text",) + def __init__(self, text: str): + self.text = text + +class _ElementNode: + __slots__ = ("tag", "attrs", "children") + def __init__(self, tag: str, attrs: list[tuple[str, str | None]]): + self.tag = tag + self.attrs = attrs + self.children: list[_TextNode | _ElementNode] = [] + + +class _SxBuilder(HTMLParser): + def __init__(self): + super().__init__(convert_charrefs=True) + self._roots: list[_TextNode | _ElementNode] = [] + self._stack: list[_ElementNode] = [] + + def _append(self, node: _TextNode | _ElementNode): + if self._stack: + self._stack[-1].children.append(node) + else: + self._roots.append(node) + + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]): + node = _ElementNode(tag, attrs) + self._append(node) + if tag not in VOID_ELEMENTS: + self._stack.append(node) + + def handle_endtag(self, tag: str): + # Pop back to the matching open tag (tolerant of mismatches) + for i in range(len(self._stack) - 1, -1, -1): + if self._stack[i].tag == tag: + self._stack[i + 1:] = [] + self._stack.pop(i) + return + + def handle_data(self, data: str): + if data: + self._append(_TextNode(data)) + + def handle_comment(self, data: str): + pass # 
skip HTML comments + + def finish(self) -> list[_TextNode | _ElementNode]: + # Strip whitespace-only text nodes at root level + return [n for n in self._roots + if not (isinstance(n, _TextNode) and not n.text.strip())] + + +# --------------------------------------------------------------------------- +# Serializer +# --------------------------------------------------------------------------- + +def _esc(s: str) -> str: + """Escape a string for sx double-quoted literals.""" + return s.replace("\\", "\\\\").replace('"', '\\"') + + +def _serialize(node: _TextNode | _ElementNode) -> str: + if isinstance(node, _TextNode): + return f'"{_esc(node.text)}"' + + parts = [node.tag] + for name, value in node.attrs: + if name in BOOLEAN_ATTRS: + if value is None or value == "" or value == name: + parts.append(f":{name} true") + else: + parts.append(f':{name} "{_esc(value)}"') + elif value is None: + # Attribute without value (non-boolean) — treat as boolean true + parts.append(f":{name} true") + else: + parts.append(f':{name} "{_esc(value)}"') + + if node.tag in VOID_ELEMENTS: + return "(" + " ".join(parts) + ")" + + if node.children: + child_parts = [_serialize(c) for c in node.children] + return "(" + " ".join(parts) + " " + " ".join(child_parts) + ")" + + return "(" + " ".join(parts) + ")" diff --git a/shared/sx/tests/test_html_to_sx.py b/shared/sx/tests/test_html_to_sx.py new file mode 100644 index 0000000..504f35c --- /dev/null +++ b/shared/sx/tests/test_html_to_sx.py @@ -0,0 +1,159 @@ +"""Unit tests for html_to_sx converter.""" +from __future__ import annotations + +import pytest + +from shared.sx.html_to_sx import html_to_sx + + +class TestBasicElements: + def test_simple_paragraph(self): + assert html_to_sx("Hello
") == '(p "Hello")' + + def test_with_class(self): + assert html_to_sx('Hi
') == '(p :class "intro" "Hi")' + + def test_multiple_attrs(self): + result = html_to_sx('click') + assert result == '(a :href "u" :class "link" "click")' + + def test_nested_inline(self): + result = html_to_sx("Hello world
") + assert result == '(p "Hello " (em "world"))' + + def test_deeply_nested(self): + result = html_to_sx("bold
')
+ assert result == '(img :src "a.jpg" :alt "pic")'
+
+ def test_hr(self):
+ assert html_to_sx("A
B
") + assert result == '(<> (p "A") (p "B"))' + + def test_single_top_level(self): + result = html_to_sx("Only
") + assert result == '(p "Only")' + + def test_text_only(self): + result = html_to_sx("just text") + assert result == '"just text"' + + def test_empty(self): + assert html_to_sx("") == '""' + + def test_whitespace_only(self): + assert html_to_sx(" \n ") == '""' + + +class TestWhitespace: + def test_root_whitespace_stripped(self): + result = html_to_sx("\nA
\nB
\n") + assert result == '(<> (p "A") (p "B"))' + + +class TestEntities: + def test_amp(self): + result = html_to_sx("A & B
") + assert result == '(p "A & B")' + + def test_lt_gt(self): + result = html_to_sx("<tag>
") + assert result == '(p "hello world
") + assert result == '(p "hello\u00a0world")' + + +class TestEscaping: + def test_quotes_in_text(self): + result = html_to_sx('He said "hello"
') + assert result == '(p "He said \\"hello\\"")' + + def test_backslash_in_text(self): + result = html_to_sx("a\\b
") + assert result == '(p "a\\\\b")' + + def test_quotes_in_attr(self): + # Attribute values with quotes get escaped + result = html_to_sx('hi
") + assert result == '(p "hi")' + + +class TestMixedContent: + def test_caption_with_link(self): + result = html_to_sx('Photo by Author') + assert result == '(<> "Photo by " (a :href "https://x.com" "Author"))' + + def test_caption_plain_text(self): + result = html_to_sx("Figure 1") + assert result == '"Figure 1"' + + +class TestRoundtrip: + """html_to_sx → parse → render should produce equivalent HTML.""" + + def _roundtrip(self, html_in: str) -> str: + from shared.sx.parser import parse + from shared.sx.html import render + sx_src = html_to_sx(html_in) + expr = parse(sx_src) + return render(expr) + + def test_simple(self): + assert self._roundtrip("Hello
") == "Hello
" + + def test_nested(self): + assert self._roundtrip("A B C
") == "A B C
" + + def test_void(self): + assert self._roundtrip('
') == '
'
+
+ def test_link(self):
+ html = 'click'
+ assert self._roundtrip(html) == html
+
+ def test_entities_roundtrip(self):
+ # Entities get decoded by parser, then re-escaped by render
+ assert self._roundtrip("A & B
") == "A & B
" + + def test_multi_block(self): + html = "A
B
" + assert self._roundtrip(html) == html