host: SX-native HTML→SX converter (the radar migrator) + first-class HTML import

lib/host/htmlsx.sx — a pure-SX HTML → SX converter (char-level tokenizer + stack parser):
host/html->sx turns a post's HTML into an (article …) tree that host/blog--decompose! consumes
— img / p / figure+figcaption / iframe / headings / blockquote / lists, inline strong/em/a kept
nested (decompose flattens to text), entities decoded to UTF-8, comments+doctype skipped. This
replaces the one-off external Python converter used for the nt-live-encore import.

import-post! now accepts a raw "html" field (converted via html->sx, serialized to sx_content,
decomposed) alongside "sx_content" — so importing real Ghost HTML is first-class. Wired
htmlsx.sx into conformance.sh + serve.sh module lists (loads in conformance AND live).

New htmlsx suite 8/8 (text/entities/void/nested/figure/iframe/comments + an html→sx→decompose→
typed-cards round-trip); blog 197/197 (+ import-from-html test).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-07-01 15:32:06 +00:00
parent a99e64b661
commit 7e2275b90c
6 changed files with 205 additions and 6 deletions

View File

@@ -2211,9 +2211,16 @@
;; (put!/seed!/relate! are sets). Contract: plans/NOTE-blog-types-for-radar.md.
(define host/blog-import-post!
(fn (gp)
(let ((slug (get gp "slug")) (title (get gp "title")))
(let ((slug (get gp "slug")) (title (get gp "title"))
;; content may arrive as raw "html" (converted to an SX tree by the pure-SX
;; converter) OR as "sx_content" (SX source). Either way -> one tree.
(tree (if (get gp "html")
(host/html->sx (get gp "html"))
(parse-safe (or (get gp "sx_content") "")))))
(begin
(host/blog-put! slug title (or (get gp "sx_content") "") (or (get gp "status") "published"))
(host/blog-put! slug title
(if (get gp "html") (serialize tree) (or (get gp "sx_content") ""))
(or (get gp "status") "published"))
(host/blog-relate! slug "article" "is-a")
(host/blog--set-field-values! slug
{"subtitle" (or (get gp "custom_excerpt") (get gp "excerpt") "")
@@ -2226,10 +2233,9 @@
(host/blog-relate! tslug "tag" "is-a")
(host/blog-relate! slug tslug "tagged"))))
(or (get gp "tags") (list)))
;; cards-as-objects: decompose the Ghost body into card objects + a `contains`
;; body, so the post renders via the composition fold (its :body supersedes the
;; opaque sx_content). parse-safe degrades to nil on bad input -> decompose no-ops.
(host/blog--decompose! slug (parse-safe (or (get gp "sx_content") "")))
;; cards-as-objects: decompose the (html- or sx-derived) content tree into card
;; objects + a `contains` body, so the post renders via the composition fold.
(host/blog--decompose! slug tree)
slug))))
;; Import a batch; returns the imported slugs.
(define host/blog-import-all!

View File

@@ -94,6 +94,7 @@ MODULES=(
"lib/host/relations.sx"
"lib/host/compose.sx"
"lib/host/execute.sx"
"lib/host/htmlsx.sx"
"lib/host/blog.sx"
"lib/host/page.sx"
"lib/host/server.sx"
@@ -109,6 +110,7 @@ SUITES=(
"feed host-fd-tests-run! lib/host/tests/feed.sx"
"relations host-rl-tests-run! lib/host/tests/relations.sx"
"blog host-bl-tests-run! lib/host/tests/blog.sx"
"htmlsx host-ht-tests-run! lib/host/tests/htmlsx.sx"
"compose host-cp-tests-run! lib/host/tests/compose.sx"
"execute host-ex-tests-run! lib/host/tests/execute.sx"
"session host-se-tests-run! lib/host/tests/session.sx"

116
lib/host/htmlsx.sx Normal file
View File

@@ -0,0 +1,116 @@
;; lib/host/htmlsx.sx — a pure-SX HTML → SX converter (the "radar migrator" core). Turns a
;; post's HTML content into an SX (article …) tree that host/blog--decompose! consumes: img,
;; p, figure/figcaption, iframe, headings, blockquote, lists, inline strong/em/a (kept nested;
;; decompose flattens them to text at the block level). Char-level tokenizer + a stack parser.
;; NOTE: substr is (string, start, LENGTH); index-of returns -1 when absent.
;; ── string helpers ──────────────────────────────────────────────────
(define host/html--at (fn (s i) (if (< i (len s)) (substr s i 1) "")))
(define host/html--from (fn (s i) (substr s i (- (len s) i)))) ;; s[i:]
(define host/html--slice (fn (s a b) (substr s a (- b a)))) ;; s[a:b)
(define host/html--replace-all
(fn (s old new)
(let ((i (index-of s old)))
(if (< i 0) s
(str (host/html--slice s 0 i) new
(host/html--replace-all (host/html--from s (+ i (len old))) old new))))))
;; ── entity decode (the common named + a few numeric entities → UTF-8) ──
(define host/html--entities
(list (list "&nbsp;" " ") (list "&amp;" "&") (list "&lt;" "<") (list "&gt;" ">")
(list "&quot;" "\"") (list "&#39;" "'") (list "&apos;" "'") (list "&#x27;" "'")
(list "&#x2019;" "") (list "&#8217;" "") (list "&#x2018;" "")
(list "&hellip;" "…") (list "&#x2026;" "…") (list "&mdash;" "—") (list "&ndash;" "")
(list "&pound;" "£") (list "&#xA3;" "£") (list "&#163;" "£")))
(define host/html--decode
(fn (s) (reduce (fn (acc pair) (host/html--replace-all acc (first pair) (first (rest pair)))) s host/html--entities)))
;; ── tag classification + name/attr parsing ──────────────────────────
(define host/html--void?
(fn (n) (contains? (list "img" "br" "hr" "iframe" "input" "meta" "link" "source" "embed") n)))
;; the tag name from a tag's inner text ("img src=…" -> "img"): up to the first space or '/'.
(define host/html--tag-name
(fn (inner)
(let ((sp (index-of inner " ")))
(lower (trim (host/html--replace-all (if (< sp 0) inner (host/html--slice inner 0 sp)) "/" ""))))))
;; parse the attrs of a tag's inner text into a dict (quoted or unquoted values).
(define host/html--attrs-loop
(fn (rest acc)
(let ((r (trim rest)))
(if (or (= r "") (= r "/")) acc
(let ((eq (index-of r "=")))
(if (< eq 0) acc
(let ((name (lower (trim (host/html--slice r 0 eq))))
(after (trim (host/html--from r (+ eq 1)))))
(let ((q (host/html--at after 0)))
(if (or (= q "\"") (= q "'"))
(let ((close (index-of (host/html--from after 1) q)))
(if (< close 0) acc
(host/html--attrs-loop (host/html--from after (+ close 2))
(assoc acc name (host/html--decode (host/html--slice after 1 (+ 1 close)))))))
(let ((sp2 (index-of after " ")))
(host/html--attrs-loop (if (< sp2 0) "" (host/html--from after sp2))
(assoc acc name (if (< sp2 0) after (host/html--slice after 0 sp2))))))))))))))
(define host/html--parse-attrs
(fn (inner)
(let ((sp (index-of inner " ")))
(if (< sp 0) {} (host/html--attrs-loop (host/html--from inner (+ sp 1)) {})))))
;; ── tokenizer: HTML string → a list of {:t text|open|close|void …} tokens ──
(define host/html--tokens
(fn (s)
(let loop ((i 0) (acc (list)))
(if (>= i (len s)) acc
(if (= (host/html--at s i) "<")
(let ((rel (index-of (host/html--from s i) ">")))
(if (< rel 0) acc
(let ((gt (+ i rel)) (inner (host/html--slice s (+ i 1) (+ i rel))))
(cond
((starts-with? inner "!") (loop (+ gt 1) acc)) ;; comment / doctype
((starts-with? inner "/")
(loop (+ gt 1) (concat acc (list {:t "close" :name (host/html--tag-name (host/html--from inner 1))}))))
(else
(let ((name (host/html--tag-name inner)))
(loop (+ gt 1) (concat acc (list {:t (if (or (host/html--void? name) (ends-with? inner "/")) "void" "open")
:name name :attrs (host/html--parse-attrs inner)})))))))))
(let ((rel (index-of (host/html--from s i) "<")))
(let ((te (if (< rel 0) (len s) (+ i rel))))
(let ((txt (host/html--decode (host/html--slice s i te))))
(loop te (if (= (trim txt) "") acc (concat acc (list {:t "text" :text txt}))))))))))))
;; ── parser: tokens → a tree of {:name :attrs :kids} nodes (kids: node | string, in order).
;; A functional stack of open frames; a synthetic root frame collects the top-level nodes. ──
(define host/html--push-kid
(fn (stack kid)
(let ((top (first stack)))
(cons (assoc top :kids (concat (get top :kids) (list kid))) (rest stack)))))
(define host/html--parse
(fn (tokens)
(let loop ((ts tokens) (stack (list {:name "article" :attrs {} :kids (list)})))
(if (empty? ts) (get (first stack) :kids)
(let ((tok (first ts)))
(cond
((= (get tok :t) "text") (loop (rest ts) (host/html--push-kid stack (get tok :text))))
((= (get tok :t) "void") (loop (rest ts) (host/html--push-kid stack {:name (get tok :name) :attrs (get tok :attrs) :kids (list)})))
((= (get tok :t) "open") (loop (rest ts) (cons {:name (get tok :name) :attrs (get tok :attrs) :kids (list)} stack)))
((= (get tok :t) "close")
(if (> (len stack) 1)
(loop (rest ts) (host/html--push-kid (rest stack) (first stack)))
(loop (rest ts) stack)))
(else (loop (rest ts) stack))))))))
;; ── tree → SX. A node becomes (name :attr val … child …); text stays a string. Attr keys
;; become keywords via parse-safe (":src" -> the keyword :src) so decompose reads them. ──
(define host/html--attrs->sx
(fn (attrs)
(reduce (fn (acc k) (concat acc (list (parse-safe (str ":" k)) (get attrs k)))) (list) (keys attrs))))
(define host/html--node->sx
(fn (node)
(if (= (type-of node) "string") node
(cons (string->symbol (get node :name))
(concat (host/html--attrs->sx (get node :attrs))
(map host/html--node->sx (get node :kids)))))))
;; HTML content string → an (article …) SX tree, ready for host/blog--decompose!.
(define host/html->sx
(fn (html)
(cons (quote article) (map host/html--node->sx (host/html--parse (host/html--tokens html))))))

View File

@@ -88,6 +88,7 @@ MODULES=(
"lib/host/relations.sx"
"lib/host/compose.sx"
"lib/host/execute.sx"
"lib/host/htmlsx.sx"
"lib/host/blog.sx"
"lib/host/server.sx"
)

View File

@@ -835,6 +835,17 @@
(host/blog-is-a? "imp-fig__body__b1" "card-embed")
(get (host/blog-field-values-of "imp-fig__body__b1") "url")))
(list true "the cap" "p.jpg" true "https://youtube.com/embed/xyz"))
;; import accepts RAW HTML (converted by the pure-SX host/html->sx) — the first-class path,
;; replacing the one-off external converter used for the nt-live-encore import.
(host-bl-test "import-post! accepts raw \"html\" (html->sx) and decomposes it into typed cards"
(begin
(host/blog-import-post! {"slug" "html-imp" "title" "HI" "status" "published"
"html" "<h2>Hi</h2><p>Some <strong>bold</strong> text.</p><img src=\"p.jpg\" alt=\"a\">"})
(list (host/blog-is-a? "html-imp__body__b0" "card-heading")
(host/blog-is-a? "html-imp__body__b1" "card-text")
(get (host/blog-field-values-of "html-imp__body__b1") "text")
(host/blog-is-a? "html-imp__body__b2" "card-image")))
(list true true "Some bold text." true))
;; -- block editor: structural edits to the post :body composition (step 6). --
(host-bl-test "block-add! creates a card object + contains edge + appends a ref to the body"
(begin

63
lib/host/tests/htmlsx.sx Normal file
View File

@@ -0,0 +1,63 @@
;; lib/host/tests/htmlsx.sx — the pure-SX HTML → SX converter (host/html->sx). Covers text,
;; entities, void/nested tags, attributes, figure/iframe, and an end-to-end import round-trip.
(define host-ht-pass 0)
(define host-ht-fail 0)
(define host-ht-fails (list))
(define host-ht-test
(fn (name actual expected)
(if (= actual expected)
(set! host-ht-pass (+ host-ht-pass 1))
(begin
(set! host-ht-fail (+ host-ht-fail 1))
(append! host-ht-fails {:name name :actual actual :expected expected})))))
;; a paragraph with inline formatting — kept nested (decompose flattens to text later).
(host-ht-test "a <p> with inline <strong> parses to (p \"…\" (strong \"…\") \"…\")"
(str (host/html->sx "<p>Hello <strong>world</strong> now</p>"))
"(article (p \"Hello \" (strong \"world\") \" now\"))")
;; HTML entities decode to UTF-8 (not \\uXXXX).
(host-ht-test "entities decode (&amp; &pound; &#x2019;)"
(str (host/html->sx "<p>Tom &amp; Jerry cost &pound;5 &#x2019;n up</p>"))
"(article (p \"Tom & Jerry cost £5 n up\"))")
;; a void <img> keeps its attributes as keyword attrs.
(host-ht-test "a void <img> keeps :src/:alt attrs"
(str (host/html->sx "<img src=\"a.jpg\" alt=\"a photo\">"))
"(article (img :alt \"a photo\" :src \"a.jpg\"))")
;; a <figure> with an <img> + <figcaption> nests correctly.
(host-ht-test "a <figure> nests an <img> and a <figcaption>"
(str (host/html->sx "<figure><img src=\"y.jpg\" alt=\"y\"><figcaption>a caption</figcaption></figure>"))
"(article (figure (img :alt \"y\" :src \"y.jpg\") (figcaption \"a caption\")))")
;; an <iframe> is a void-ish embed (self-contained token).
(host-ht-test "an <iframe> becomes a leaf with its :src"
(str (host/html->sx "<iframe src=\"https://youtube.com/embed/x\"></iframe>"))
"(article (iframe :src \"https://youtube.com/embed/x\"))")
;; comments + doctype are skipped; whitespace-only text is dropped.
(host-ht-test "comments/doctype are skipped, blank text dropped"
(str (host/html->sx "<!-- hi --> <p>x</p>\n <p>y</p>"))
"(article (p \"x\") (p \"y\"))")
;; headings map through (decompose then turns h2 into card-heading).
(host-ht-test "headings + paragraphs come through in order"
(str (host/html->sx "<h2>Title</h2><p>body</p>"))
"(article (h2 \"Title\") (p \"body\"))")
;; ── END TO END: HTML → SX → decompose → typed card objects ──────────
(host/blog-use-store! (persist/open))
(host/blog-seed-types!)
(host-ht-test "html->sx feeds decompose: a real snippet becomes typed cards"
(begin
(host/blog-put! "htdoc" "HT" "(p)" "published")
(host/blog--decompose! "htdoc"
(host/html->sx "<h2>Heading</h2><p>Some <strong>bold</strong> text.</p><figure><img src=\"p.jpg\" alt=\"a\"><figcaption>cap</figcaption></figure><iframe src=\"https://youtube.com/embed/z\"></iframe>"))
(list (host/blog-is-a? "htdoc__body__b0" "card-heading")
(host/blog-is-a? "htdoc__body__b1" "card-text")
(get (host/blog-field-values-of "htdoc__body__b1") "text") ;; strong flattened to text
(host/blog-is-a? "htdoc__body__b2" "card-image")
(get (host/blog-field-values-of "htdoc__body__b2") "caption")
(host/blog-is-a? "htdoc__body__b3" "card-embed")))
(list true true "Some bold text." true "cap" true))
(define host-ht-tests-run!
(fn ()
{:total (+ host-ht-pass host-ht-fail)
:passed host-ht-pass :failed host-ht-fail :fails host-ht-fails}))