Files
rose-ash/blog/bp/blog/ghost/lexical_to_sx.py
giles 8ceb9aee62 Eliminate raw HTML injection: convert ~kg-html/captions to native sx
Add shared/sx/html_to_sx.py (HTMLParser-based HTML→sx converter) and
update lexical_to_sx.py so HTML cards, markdown cards, and captions all
produce native sx expressions instead of opaque HTML strings.

- ~kg-html now wraps native sx children (editor can identify the block)
- New ~kg-md component for markdown card blocks
- Captions are sx expressions, not escaped HTML strings
- kg_cards.sx: replace (raw! caption) with direct caption rendering
- sx-editor.js: htmlToSx() via DOMParser, serializeInline for captions,
  _childrenSx for ~kg-html/~kg-md, new kg-md edit UI
- Migration script (blog/scripts/migrate_sx_html.py) to re-convert
  stored sx_content from lexical source

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-02 19:57:27 +00:00

446 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Lexical JSON → s-expression converter.
Mirrors lexical_renderer.py's registry/dispatch pattern but produces sx source
instead of HTML. Used for backfilling existing posts and on-the-fly conversion
when editing pre-migration posts in the SX editor.
Public API
----------
lexical_to_sx(doc) Lexical JSON (dict or string) → sx source string
"""
from __future__ import annotations
import json
from typing import Callable
import mistune
from shared.sx.html_to_sx import html_to_sx
# ---------------------------------------------------------------------------
# Registry
# ---------------------------------------------------------------------------
# Maps a Lexical node "type" string to the function that renders it as sx.
_CONVERTERS: dict[str, Callable[[dict], str]] = {}


def _converter(node_type: str):
    """Return a decorator that files the decorated function under *node_type*.

    The function is stored in ``_CONVERTERS`` and handed back unchanged, so
    it stays callable under its own name.  Registering the same type twice
    replaces the earlier entry.
    """

    def register(handler: Callable[[dict], str]) -> Callable[[dict], str]:
        _CONVERTERS[node_type] = handler
        return handler

    return register
# ---------------------------------------------------------------------------
# Public entry point
# ---------------------------------------------------------------------------
def lexical_to_sx(doc: dict | str) -> str:
"""Convert a Lexical JSON document to an sx source string."""
if isinstance(doc, str):
doc = json.loads(doc)
root = doc.get("root", doc)
children = root.get("children", [])
parts = [_convert_node(c) for c in children]
parts = [p for p in parts if p]
if not parts:
return '(<> (p ""))'
if len(parts) == 1:
return parts[0]
return "(<>\n " + "\n ".join(parts) + ")"
# ---------------------------------------------------------------------------
# Core dispatch
# ---------------------------------------------------------------------------
def _convert_node(node: dict) -> str:
    """Render one Lexical node through the converter registered for its type.

    Returns the empty string when the node's ``"type"`` has no registered
    converter, including when the key is absent.
    """
    handler = _CONVERTERS.get(node.get("type", ""))
    if not handler:
        return ""
    return handler(node)
def _convert_children(children: list[dict]) -> str:
    """Convert child nodes to one space-separated run of sx expressions.

    Args:
        children: Child nodes of an element node.  ``None`` (a JSON
            ``null`` in the source document) is accepted and treated the
            same as an empty list.

    Returns:
        The non-empty conversions joined by single spaces, or ``""`` when
        there are no children or none of them converts to anything.
    """
    if not children:
        return ""
    converted = (_convert_node(child) for child in children)
    return " ".join(sx for sx in converted if sx)
def _esc(s: str) -> str:
    """Make *s* safe to place between double quotes in sx source.

    Backslashes are doubled and double quotes gain a leading backslash;
    every other character passes through unchanged.
    """
    # One pass over the string: each character is mapped independently, so
    # the backslash added in front of a quote is never doubled again.
    return s.translate({ord("\\"): "\\\\", ord('"'): '\\"'})
# ---------------------------------------------------------------------------
# Text format bitmask
# ---------------------------------------------------------------------------
# Bit values of the "format" bitmask read from text nodes.
_FORMAT_BOLD = 1 << 0
_FORMAT_ITALIC = 1 << 1
_FORMAT_STRIKETHROUGH = 1 << 2
_FORMAT_UNDERLINE = 1 << 3
_FORMAT_CODE = 1 << 4
_FORMAT_SUBSCRIPT = 1 << 5
_FORMAT_SUPERSCRIPT = 1 << 6

# (bit, sx tag) pairs.  Wrappers are applied in this order, so an earlier
# entry ends up nested inside a later one.
_FORMAT_WRAPPERS: list[tuple[int, str]] = [
    (_FORMAT_BOLD, "strong"),
    (_FORMAT_ITALIC, "em"),
    (_FORMAT_STRIKETHROUGH, "s"),
    (_FORMAT_UNDERLINE, "u"),
    (_FORMAT_CODE, "code"),
    (_FORMAT_SUBSCRIPT, "sub"),
    (_FORMAT_SUPERSCRIPT, "sup"),
]


def _wrap_format(text_sx: str, fmt: int) -> str:
    """Wrap *text_sx* in one sx element per formatting bit set in *fmt*.

    Bits that are not listed in ``_FORMAT_WRAPPERS`` are ignored; with no
    known bit set the input is returned unchanged.
    """
    active_tags = [tag for bit, tag in _FORMAT_WRAPPERS if fmt & bit]
    wrapped = text_sx
    for tag in active_tags:
        wrapped = f"({tag} {wrapped})"
    return wrapped
# ---------------------------------------------------------------------------
# Tier 1 — text nodes
# ---------------------------------------------------------------------------
@_converter("text")
def _text(node: dict) -> str:
    """Render a text node as a quoted sx string, wrapped per its format bits.

    Returns ``""`` for a node whose ``"text"`` is missing or empty.
    """
    content = node.get("text", "")
    if not content:
        return ""
    literal = '"' + _esc(content) + '"'
    flags = node.get("format", 0)
    # Only a non-zero integer selects wrappers; any other "format" value
    # leaves the literal bare.
    if not (isinstance(flags, int) and flags):
        return literal
    return _wrap_format(literal, flags)
@_converter("linebreak")
def _linebreak(_node: dict) -> str:
    """Render a line break as an sx string holding a backslash-n escape."""
    # Raw string: the output is the four characters quote, backslash, n,
    # quote -- not a literal newline.
    return r'"\n"'
@_converter("tab")
def _tab(_node: dict) -> str:
    """Render a tab node as an sx string holding a backslash-t escape."""
    # Raw string: the output is the four characters quote, backslash, t,
    # quote -- not a literal tab.
    return r'"\t"'
@_converter("paragraph")
def _paragraph(node: dict) -> str:
    """Render a paragraph as ``(p ...)``; an empty one becomes ``(p "")``."""
    body = _convert_children(node.get("children", [])) or '""'
    return "(p " + body + ")"
@_converter("extended-text")
def _extended_text(node: dict) -> str:
    """Render an extended-text node, which comes in two shapes.

    A node with a ``"text"`` key is handled as an inline text node; one
    without is handled as a block-level paragraph.
    """
    render = _text if "text" in node else _paragraph
    return render(node)
@_converter("heading")
def _heading(node: dict) -> str:
    """Render a heading node as ``(h1 ...)`` through ``(h6 ...)``.

    The node's ``"tag"`` becomes the head of the sx expression, where it is
    written unquoted and unescaped.  To keep arbitrary document data out of
    that position only the six heading tags are accepted; a missing or
    unrecognised tag falls back to ``h2``.  A heading with no convertible
    children gets an empty string as its content.
    """
    tag = node.get("tag", "h2")
    if tag not in ("h1", "h2", "h3", "h4", "h5", "h6"):
        tag = "h2"
    inner = _convert_children(node.get("children", [])) or '""'
    return f"({tag} {inner})"
@_converter("extended-heading")
def _extended_heading(node: dict) -> str:
    """Render an extended-heading node.

    A node with a ``"text"`` key is handled as an inline text node; one
    without is handled as a heading.
    """
    render = _text if "text" in node else _heading
    return render(node)
@_converter("quote")
def _quote(node: dict) -> str:
    """Render a quote node as ``(blockquote ...)``.

    A quote with no convertible children becomes ``(blockquote "")``.
    """
    body = _convert_children(node.get("children", []))
    if not body:
        return '(blockquote "")'
    return "(blockquote " + body + ")"
@_converter("extended-quote")
def _extended_quote(node: dict) -> str:
    """Render an extended-quote node.

    A node with a ``"text"`` key is handled as an inline text node; one
    without is handled as a block quote.
    """
    render = _text if "text" in node else _quote
    return render(node)
@_converter("link")
def _link(node: dict) -> str:
    """Render a link node as ``(a :href "..." ...)``.

    When the link has no convertible children, the escaped URL itself is
    used as the link text.
    """
    target = _esc(node.get("url", ""))
    label = _convert_children(node.get("children", []))
    if not label:
        label = f'"{target}"'
    return f'(a :href "{target}" {label})'
@_converter("autolink")
def _autolink(node: dict) -> str:
    """Render an autolink node exactly as a plain link node is rendered."""
    rendered = _link(node)
    return rendered
@_converter("at-link")
def _at_link(node: dict) -> str:
    """Render an at-link node exactly as a plain link node is rendered."""
    rendered = _link(node)
    return rendered
@_converter("list")
def _list(node: dict) -> str:
    """Render a list node as ``(ol ...)`` or ``(ul ...)``.

    Only ``listType == "number"`` selects ``ol``; every other value, or no
    value at all, gives ``ul``.  A list with no convertible children is
    emitted as the bare tag.
    """
    if node.get("listType") == "number":
        tag = "ol"
    else:
        tag = "ul"
    items = _convert_children(node.get("children", []))
    if not items:
        return "(" + tag + ")"
    return "(" + tag + " " + items + ")"
@_converter("listitem")
def _listitem(node: dict) -> str:
    """Render a list item as ``(li ...)``; an empty one becomes ``(li "")``."""
    body = _convert_children(node.get("children", []))
    if not body:
        return '(li "")'
    return "(li " + body + ")"
@_converter("horizontalrule")
def _horizontalrule(_node: dict) -> str:
    """Render a horizontal rule; nothing is read from the node itself."""
    rule_sx = "(hr)"
    return rule_sx
@_converter("code")
def _code(node: dict) -> str:
    """Render a code node's children inside ``(code ...)``.

    Returns ``""`` when no child converts to anything, so the node is
    dropped from the output.
    """
    body = _convert_children(node.get("children", []))
    if not body:
        return ""
    return "(code " + body + ")"
@_converter("codeblock")
def _codeblock(node: dict) -> str:
    """Render a codeblock card as ``(pre (code "..."))``.

    When the node names a language, the inner ``code`` element also gets a
    ``:class "language-<name>"`` attribute.
    """
    language = node.get("language", "")
    source = node.get("code", "")
    code_head = "code"
    if language:
        code_head += f' :class "language-{_esc(language)}"'
    return f'(pre ({code_head} "{_esc(source)}"))'
@_converter("code-highlight")
def _code_highlight(node: dict) -> str:
    """Render a code-highlight node as a plain quoted sx string.

    Returns ``""`` when the node has no text.
    """
    fragment = node.get("text", "")
    if not fragment:
        return ""
    return '"' + _esc(fragment) + '"'
# ---------------------------------------------------------------------------
# Tier 2 — common cards
# ---------------------------------------------------------------------------
@_converter("image")
def _image(node: dict) -> str:
    """Render an image card as ``(~kg-image ...)``.

    ``:src`` is always emitted; ``:alt``, ``:caption``, ``:width`` and
    ``:href`` only when the node has a truthy value for them.  The caption
    is passed through ``html_to_sx`` and embedded as an sx expression, not
    as a quoted string.  ``:width`` takes ``"cardWidth"`` when set and
    otherwise falls back to ``"width"``.
    """
    src = node.get("src", "")
    alt = node.get("alt", "")
    caption = node.get("caption", "")
    width = node.get("cardWidth", "") or node.get("width", "")
    href = node.get("href", "")

    parts = [f':src "{_esc(src)}"']
    if alt:
        parts.append(f':alt "{_esc(alt)}"')
    if caption:
        parts.append(f":caption {html_to_sx(caption)}")
    if width:
        # JSON allows a number here (presumably a pixel width -- verify
        # against real documents); _esc only works on str, so stringify
        # instead of raising AttributeError.
        parts.append(f':width "{_esc(str(width))}"')
    if href:
        parts.append(f':href "{_esc(href)}"')
    return "(~kg-image " + " ".join(parts) + ")"
@_converter("gallery")
def _gallery(node: dict) -> str:
    """Render a gallery card as ``(~kg-gallery :images ...)``.

    Images are chunked into rows of at most three.  Each image becomes a
    ``(dict ...)`` with ``"src"`` and, when present, ``"alt"`` and
    ``"caption"``; each row is a ``(list ...)`` and the rows are wrapped in
    an outer ``(list ...)``.  Captions go through ``html_to_sx``.  Returns
    ``""`` when the node has no images.
    """
    images = node.get("images", [])
    if not images:
        return ""

    def image_dict(img: dict) -> str:
        # One gallery entry as an sx dict literal.
        fields = [f'"src" "{_esc(img.get("src", ""))}"']
        if img.get("alt"):
            fields.append(f'"alt" "{_esc(img["alt"])}"')
        if img.get("caption"):
            fields.append(f'"caption" {html_to_sx(img["caption"])}')
        return "(dict " + " ".join(fields) + ")"

    # Rows of 3, matching lexical_renderer.py.
    rows = []
    for start in range(0, len(images), 3):
        entries = [image_dict(img) for img in images[start:start + 3]]
        rows.append("(list " + " ".join(entries) + ")")

    gallery_sx = "(~kg-gallery :images (list " + " ".join(rows) + ")"
    caption = node.get("caption", "")
    if caption:
        gallery_sx += f" :caption {html_to_sx(caption)}"
    return gallery_sx + ")"
@_converter("html")
def _html_card(node: dict) -> str:
    """Render an HTML card as ``(~kg-html ...)``.

    The card's ``"html"`` string is converted by ``html_to_sx`` and the
    result is placed inside the component as its children.
    """
    markup = node.get("html", "")
    return f"(~kg-html {html_to_sx(markup)})"
@_converter("embed")
def _embed(node: dict) -> str:
    """Render an embed card as ``(~kg-embed :html "..." ...)``.

    The embed markup is kept as an escaped string in ``:html``; only the
    optional caption is converted to an sx expression via ``html_to_sx``.
    """
    markup = node.get("html", "")
    caption = node.get("caption", "")
    attrs = [f':html "{_esc(markup)}"']
    if caption:
        attrs.append(f":caption {html_to_sx(caption)}")
    return f"(~kg-embed {' '.join(attrs)})"
@_converter("bookmark")
def _bookmark(node: dict) -> str:
    """Render a bookmark card as ``(~kg-bookmark :url "..." ...)``.

    ``:url`` is always emitted.  ``:title``, ``:description``, ``:icon``,
    ``:author``, ``:publisher`` and ``:thumbnail`` are each taken from the
    node's ``"metadata"`` dict, falling back to the same key on the node
    itself, and are emitted in that order only when a truthy value is
    found.  The optional caption goes through ``html_to_sx`` and is
    embedded as an sx expression.
    """
    # "metadata" may be absent or JSON null; either way fall back to the
    # node's own top-level fields instead of failing on ``None``.
    meta = node.get("metadata") or {}
    url = node.get("url", "")

    parts = [f':url "{_esc(url)}"']
    for field in ("title", "description", "icon", "author", "publisher", "thumbnail"):
        value = meta.get(field, "") or node.get(field, "")
        if value:
            parts.append(f':{field} "{_esc(value)}"')

    caption = node.get("caption", "")
    if caption:
        parts.append(f":caption {html_to_sx(caption)}")
    return "(~kg-bookmark " + " ".join(parts) + ")"
@_converter("callout")
def _callout(node: dict) -> str:
    """Render a callout card as ``(~kg-callout :color "..." ...)``.

    ``:color`` comes from ``"backgroundColor"`` and defaults to ``grey``.
    ``:emoji`` and ``:content`` are emitted only when the node has an emoji
    or convertible children respectively.
    """
    background = node.get("backgroundColor", "grey")
    emoji = node.get("calloutEmoji", "")
    body = _convert_children(node.get("children", []))

    callout_sx = f'(~kg-callout :color "{_esc(background)}"'
    if emoji:
        callout_sx += f' :emoji "{_esc(emoji)}"'
    if body:
        callout_sx += f" :content {body}"
    return callout_sx + ")"
@_converter("button")
def _button(node: dict) -> str:
    """Render a button card as ``(~kg-button :url :text :alignment)``.

    All three attributes are always emitted; the alignment defaults to
    ``center`` when the node does not specify one.
    """
    url = _esc(node.get("buttonUrl", ""))
    label = _esc(node.get("buttonText", ""))
    alignment = _esc(node.get("alignment", "center"))
    return f'(~kg-button :url "{url}" :text "{label}" :alignment "{alignment}")'
@_converter("toggle")
def _toggle(node: dict) -> str:
    """Render a toggle card as ``(~kg-toggle :heading "..." ...)``.

    ``:content`` is appended only when the node has convertible children.
    """
    heading = node.get("heading", "")
    body = _convert_children(node.get("children", []))
    toggle_sx = f'(~kg-toggle :heading "{_esc(heading)}"'
    if body:
        toggle_sx += f" :content {body}"
    return toggle_sx + ")"
@_converter("audio")
def _audio(node: dict) -> str:
    """Render an audio card as ``(~kg-audio :src "..." ...)``.

    ``:src`` and ``:duration`` are always emitted; ``:title`` and
    ``:thumbnail`` only when present.  The duration is truncated to whole
    seconds with ``int`` and formatted as ``M:SS``.  A missing or JSON
    ``null`` duration is rendered as ``0:00``.
    """
    src = node.get("src", "")
    title = node.get("title", "")
    thumbnail = node.get("thumbnailSrc", "")

    # ``or 0`` covers an explicit null, which ``int`` would reject.
    total_seconds = int(node.get("duration") or 0)
    minutes, seconds = divmod(total_seconds, 60)
    duration_str = f"{minutes}:{seconds:02d}"

    parts = [f':src "{_esc(src)}"']
    if title:
        parts.append(f':title "{_esc(title)}"')
    parts.append(f':duration "{duration_str}"')
    if thumbnail:
        parts.append(f':thumbnail "{_esc(thumbnail)}"')
    return "(~kg-audio " + " ".join(parts) + ")"
@_converter("video")
def _video(node: dict) -> str:
    """Render a video card as ``(~kg-video :src "..." ...)``.

    ``:src`` is always emitted; ``:caption``, ``:width``, ``:thumbnail``
    and ``:loop true`` only when the node has a truthy value for them.  The
    caption goes through ``html_to_sx`` and is embedded as an sx
    expression.  For the thumbnail, ``"customThumbnailSrc"`` is preferred
    and ``"thumbnailSrc"`` is the fallback.
    """
    src = node.get("src", "")
    caption = node.get("caption", "")
    width = node.get("cardWidth", "")
    # The custom thumbnail has to be consulted first: checked second, it
    # would be ignored whenever a "thumbnailSrc" is also present.
    # NOTE(review): this matches Ghost's own video renderer as far as I can
    # tell -- confirm lexical_renderer.py uses the same precedence.
    thumbnail = node.get("customThumbnailSrc", "") or node.get("thumbnailSrc", "")
    loop = node.get("loop", False)

    parts = [f':src "{_esc(src)}"']
    if caption:
        parts.append(f":caption {html_to_sx(caption)}")
    if width:
        parts.append(f':width "{_esc(width)}"')
    if thumbnail:
        parts.append(f':thumbnail "{_esc(thumbnail)}"')
    if loop:
        parts.append(":loop true")
    return "(~kg-video " + " ".join(parts) + ")"
@_converter("file")
def _file(node: dict) -> str:
    """Render a file card as ``(~kg-file :src "..." ...)``.

    ``:src`` is always emitted; ``:filename``, ``:title``, ``:filesize``
    and ``:caption`` only when there is a value for them.  The title falls
    back to the file name.  ``"fileSize"`` is divided by 1024 and shown as
    whole ``KB`` below 1024 KB, otherwise as ``MB`` with one decimal.  The
    caption goes through ``html_to_sx`` and is embedded as an sx expression.
    """
    src = node.get("src", "")
    filename = node.get("fileName", "")
    title = node.get("title", "") or filename
    byte_count = node.get("fileSize", 0)
    caption = node.get("caption", "")

    # Human-readable size; stays empty when the size is missing or zero.
    size_label = ""
    if byte_count:
        kilobytes = byte_count / 1024
        if kilobytes >= 1024:
            size_label = f"{kilobytes / 1024:.1f} MB"
        else:
            size_label = f"{kilobytes:.0f} KB"

    attrs = [f':src "{_esc(src)}"']
    if filename:
        attrs.append(f':filename "{_esc(filename)}"')
    if title:
        attrs.append(f':title "{_esc(title)}"')
    if size_label:
        attrs.append(f':filesize "{size_label}"')
    if caption:
        attrs.append(f":caption {html_to_sx(caption)}")
    return f"(~kg-file {' '.join(attrs)})"
@_converter("paywall")
def _paywall(_node: dict) -> str:
    """Render the paywall marker; nothing is read from the node itself."""
    marker_sx = "(~kg-paywall)"
    return marker_sx
@_converter("markdown")
def _markdown(node: dict) -> str:
    """Render a markdown card as ``(~kg-md ...)``.

    The card's markdown source is rendered to HTML with ``mistune.html``
    and that HTML is then converted to sx by ``html_to_sx``.
    """
    source = node.get("markdown", "")
    as_html = mistune.html(source)
    return f"(~kg-md {html_to_sx(as_html)})"