From 7e2275b90cd983c37edd597e680a2ff53586e64c Mon Sep 17 00:00:00 2001 From: giles Date: Wed, 1 Jul 2026 15:32:06 +0000 Subject: [PATCH] =?UTF-8?q?host:=20SX-native=20HTML=E2=86=92SX=20converter?= =?UTF-8?q?=20(the=20radar=20migrator)=20+=20first-class=20HTML=20import?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit lib/host/htmlsx.sx — a pure-SX HTML → SX converter (char-level tokenizer + stack parser): host/html->sx turns a post's HTML into an (article …) tree that host/blog--decompose! consumes — img / p / figure+figcaption / iframe / headings / blockquote / lists, inline strong/em/a kept nested (decompose flattens to text), entities decoded to UTF-8, comments+doctype skipped. This replaces the one-off external Python converter used for the nt-live-encore import. import-post! now accepts a raw "html" field (converted via html->sx, serialized to sx_content, decomposed) alongside "sx_content" — so importing real Ghost HTML is first-class. Wired htmlsx.sx into conformance.sh + serve.sh module lists (loads in conformance AND live). New htmlsx suite 8/8 (text/entities/void/nested/figure/iframe/comments + an html→sx→decompose→ typed-cards round-trip); blog 197/197 (+ import-from-html test). Co-Authored-By: Claude Opus 4.8 --- lib/host/blog.sx | 18 ++++-- lib/host/conformance.sh | 2 + lib/host/htmlsx.sx | 116 +++++++++++++++++++++++++++++++++++++++ lib/host/serve.sh | 1 + lib/host/tests/blog.sx | 11 ++++ lib/host/tests/htmlsx.sx | 63 +++++++++++++++++++++ 6 files changed, 205 insertions(+), 6 deletions(-) create mode 100644 lib/host/htmlsx.sx create mode 100644 lib/host/tests/htmlsx.sx diff --git a/lib/host/blog.sx b/lib/host/blog.sx index 8935f9e4..01ffed34 100644 --- a/lib/host/blog.sx +++ b/lib/host/blog.sx @@ -2211,9 +2211,16 @@ ;; (put!/seed!/relate! are sets). Contract: plans/NOTE-blog-types-for-radar.md. (define host/blog-import-post! (fn (gp) - (let ((slug (get gp "slug")) (title (get gp "title"))) + (let ((slug (get gp "slug")) (title (get gp "title")) + ;; content may arrive as raw "html" (converted to an SX tree by the pure-SX + ;; converter) OR as "sx_content" (SX source). Either way -> one tree. + (tree (if (get gp "html") + (host/html->sx (get gp "html")) + (parse-safe (or (get gp "sx_content") ""))))) (begin - (host/blog-put! slug title (or (get gp "sx_content") "") (or (get gp "status") "published")) + (host/blog-put! slug title + (if (get gp "html") (serialize tree) (or (get gp "sx_content") "")) + (or (get gp "status") "published")) (host/blog-relate! slug "article" "is-a") (host/blog--set-field-values! slug {"subtitle" (or (get gp "custom_excerpt") (get gp "excerpt") "") @@ -2226,10 +2233,9 @@ (host/blog-relate! tslug "tag" "is-a") (host/blog-relate! slug tslug "tagged")))) (or (get gp "tags") (list))) - ;; cards-as-objects: decompose the Ghost body into card objects + a `contains` - ;; body, so the post renders via the composition fold (its :body supersedes the - ;; opaque sx_content). parse-safe degrades to nil on bad input -> decompose no-ops. - (host/blog--decompose! slug (parse-safe (or (get gp "sx_content") ""))) + ;; cards-as-objects: decompose the (html- or sx-derived) content tree into card + ;; objects + a `contains` body, so the post renders via the composition fold. + (host/blog--decompose! slug tree) slug)))) ;; Import a batch; returns the imported slugs. (define host/blog-import-all! diff --git a/lib/host/conformance.sh b/lib/host/conformance.sh index 4b7d5775..2a88c839 100755 --- a/lib/host/conformance.sh +++ b/lib/host/conformance.sh @@ -94,6 +94,7 @@ MODULES=( "lib/host/relations.sx" "lib/host/compose.sx" "lib/host/execute.sx" + "lib/host/htmlsx.sx" "lib/host/blog.sx" "lib/host/page.sx" "lib/host/server.sx" @@ -109,6 +110,7 @@ SUITES=( "feed host-fd-tests-run! lib/host/tests/feed.sx" "relations host-rl-tests-run! lib/host/tests/relations.sx" "blog host-bl-tests-run! lib/host/tests/blog.sx" + "htmlsx host-ht-tests-run! lib/host/tests/htmlsx.sx" "compose host-cp-tests-run! lib/host/tests/compose.sx" "execute host-ex-tests-run! lib/host/tests/execute.sx" "session host-se-tests-run! lib/host/tests/session.sx" diff --git a/lib/host/htmlsx.sx b/lib/host/htmlsx.sx new file mode 100644 index 00000000..3c99f18d --- /dev/null +++ b/lib/host/htmlsx.sx @@ -0,0 +1,116 @@ +;; lib/host/htmlsx.sx — a pure-SX HTML → SX converter (the "radar migrator" core). Turns a +;; post's HTML content into an SX (article …) tree that host/blog--decompose! consumes: img, +;; p, figure/figcaption, iframe, headings, blockquote, lists, inline strong/em/a (kept nested; +;; decompose flattens them to text at the block level). Char-level tokenizer + a stack parser. +;; NOTE: substr is (string, start, LENGTH); index-of returns -1 when absent. + +;; ── string helpers ────────────────────────────────────────────────── +(define host/html--at (fn (s i) (if (< i (len s)) (substr s i 1) ""))) +(define host/html--from (fn (s i) (substr s i (- (len s) i)))) ;; s[i:] +(define host/html--slice (fn (s a b) (substr s a (- b a)))) ;; s[a:b) +(define host/html--replace-all + (fn (s old new) + (let ((i (index-of s old))) + (if (< i 0) s + (str (host/html--slice s 0 i) new + (host/html--replace-all (host/html--from s (+ i (len old))) old new)))))) + +;; ── entity decode (the common named + a few numeric entities → UTF-8) ── +(define host/html--entities + (list (list " " " ") (list "&" "&") (list "<" "<") (list ">" ">") + (list """ "\"") (list "'" "'") (list "'" "'") (list "'" "'") + (list "’" "’") (list "’" "’") (list "‘" "‘") + (list "…" "…") (list "…" "…") (list "—" "—") (list "–" "–") + (list "£" "£") (list "£" "£") (list "£" "£"))) +(define host/html--decode + (fn (s) (reduce (fn (acc pair) (host/html--replace-all acc (first pair) (first (rest pair)))) s host/html--entities))) + +;; ── tag classification + name/attr parsing ────────────────────────── +(define host/html--void? + (fn (n) (contains? (list "img" "br" "hr" "iframe" "input" "meta" "link" "source" "embed") n))) +;; the tag name from a tag's inner text ("img src=…" -> "img"): up to the first space or '/'. +(define host/html--tag-name + (fn (inner) + (let ((sp (index-of inner " "))) + (lower (trim (host/html--replace-all (if (< sp 0) inner (host/html--slice inner 0 sp)) "/" "")))))) +;; parse the attrs of a tag's inner text into a dict (quoted or unquoted values). +(define host/html--attrs-loop + (fn (rest acc) + (let ((r (trim rest))) + (if (or (= r "") (= r "/")) acc + (let ((eq (index-of r "="))) + (if (< eq 0) acc + (let ((name (lower (trim (host/html--slice r 0 eq)))) + (after (trim (host/html--from r (+ eq 1))))) + (let ((q (host/html--at after 0))) + (if (or (= q "\"") (= q "'")) + (let ((close (index-of (host/html--from after 1) q))) + (if (< close 0) acc + (host/html--attrs-loop (host/html--from after (+ close 2)) + (assoc acc name (host/html--decode (host/html--slice after 1 (+ 1 close))))))) + (let ((sp2 (index-of after " "))) + (host/html--attrs-loop (if (< sp2 0) "" (host/html--from after sp2)) + (assoc acc name (if (< sp2 0) after (host/html--slice after 0 sp2)))))))))))))) +(define host/html--parse-attrs + (fn (inner) + (let ((sp (index-of inner " "))) + (if (< sp 0) {} (host/html--attrs-loop (host/html--from inner (+ sp 1)) {}))))) + +;; ── tokenizer: HTML string → a list of {:t text|open|close|void …} tokens ── +(define host/html--tokens + (fn (s) + (let loop ((i 0) (acc (list))) + (if (>= i (len s)) acc + (if (= (host/html--at s i) "<") + (let ((rel (index-of (host/html--from s i) ">"))) + (if (< rel 0) acc + (let ((gt (+ i rel)) (inner (host/html--slice s (+ i 1) (+ i rel)))) + (cond + ((starts-with? inner "!") (loop (+ gt 1) acc)) ;; comment / doctype + ((starts-with? inner "/") + (loop (+ gt 1) (concat acc (list {:t "close" :name (host/html--tag-name (host/html--from inner 1))})))) + (else + (let ((name (host/html--tag-name inner))) + (loop (+ gt 1) (concat acc (list {:t (if (or (host/html--void? name) (ends-with? inner "/")) "void" "open") + :name name :attrs (host/html--parse-attrs inner)}))))))))) + (let ((rel (index-of (host/html--from s i) "<"))) + (let ((te (if (< rel 0) (len s) (+ i rel)))) + (let ((txt (host/html--decode (host/html--slice s i te)))) + (loop te (if (= (trim txt) "") acc (concat acc (list {:t "text" :text txt})))))))))))) + +;; ── parser: tokens → a tree of {:name :attrs :kids} nodes (kids: node | string, in order). +;; A functional stack of open frames; a synthetic root frame collects the top-level nodes. ── +(define host/html--push-kid + (fn (stack kid) + (let ((top (first stack))) + (cons (assoc top :kids (concat (get top :kids) (list kid))) (rest stack))))) +(define host/html--parse + (fn (tokens) + (let loop ((ts tokens) (stack (list {:name "article" :attrs {} :kids (list)}))) + (if (empty? ts) (get (first stack) :kids) + (let ((tok (first ts))) + (cond + ((= (get tok :t) "text") (loop (rest ts) (host/html--push-kid stack (get tok :text)))) + ((= (get tok :t) "void") (loop (rest ts) (host/html--push-kid stack {:name (get tok :name) :attrs (get tok :attrs) :kids (list)}))) + ((= (get tok :t) "open") (loop (rest ts) (cons {:name (get tok :name) :attrs (get tok :attrs) :kids (list)} stack))) + ((= (get tok :t) "close") + (if (> (len stack) 1) + (loop (rest ts) (host/html--push-kid (rest stack) (first stack))) + (loop (rest ts) stack))) + (else (loop (rest ts) stack)))))))) + +;; ── tree → SX. A node becomes (name :attr val … child …); text stays a string. Attr keys +;; become keywords via parse-safe (":src" -> the keyword :src) so decompose reads them. ── +(define host/html--attrs->sx + (fn (attrs) + (reduce (fn (acc k) (concat acc (list (parse-safe (str ":" k)) (get attrs k)))) (list) (keys attrs)))) +(define host/html--node->sx + (fn (node) + (if (= (type-of node) "string") node + (cons (string->symbol (get node :name)) + (concat (host/html--attrs->sx (get node :attrs)) + (map host/html--node->sx (get node :kids))))))) +;; HTML content string → an (article …) SX tree, ready for host/blog--decompose!. +(define host/html->sx + (fn (html) + (cons (quote article) (map host/html--node->sx (host/html--parse (host/html--tokens html)))))) diff --git a/lib/host/serve.sh b/lib/host/serve.sh index 5dd1d34c..6ff249c2 100755 --- a/lib/host/serve.sh +++ b/lib/host/serve.sh @@ -88,6 +88,7 @@ MODULES=( "lib/host/relations.sx" "lib/host/compose.sx" "lib/host/execute.sx" + "lib/host/htmlsx.sx" "lib/host/blog.sx" "lib/host/server.sx" ) diff --git a/lib/host/tests/blog.sx b/lib/host/tests/blog.sx index a9e6103f..99448724 100644 --- a/lib/host/tests/blog.sx +++ b/lib/host/tests/blog.sx @@ -835,6 +835,17 @@ (host/blog-is-a? "imp-fig__body__b1" "card-embed") (get (host/blog-field-values-of "imp-fig__body__b1") "url"))) (list true "the cap" "p.jpg" true "https://youtube.com/embed/xyz")) +;; import accepts RAW HTML (converted by the pure-SX host/html->sx) — the first-class path, +;; replacing the one-off external converter used for the nt-live-encore import. +(host-bl-test "import-post! accepts raw \"html\" (html->sx) and decomposes it into typed cards" + (begin + (host/blog-import-post! {"slug" "html-imp" "title" "HI" "status" "published" + "html" "

Hi

Some bold text.

\"a\""}) + (list (host/blog-is-a? "html-imp__body__b0" "card-heading") + (host/blog-is-a? "html-imp__body__b1" "card-text") + (get (host/blog-field-values-of "html-imp__body__b1") "text") + (host/blog-is-a? "html-imp__body__b2" "card-image"))) + (list true true "Some bold text." true)) ;; -- block editor: structural edits to the post :body composition (step 6). -- (host-bl-test "block-add! creates a card object + contains edge + appends a ref to the body" (begin diff --git a/lib/host/tests/htmlsx.sx b/lib/host/tests/htmlsx.sx new file mode 100644 index 00000000..6c89cbf4 --- /dev/null +++ b/lib/host/tests/htmlsx.sx @@ -0,0 +1,63 @@ +;; lib/host/tests/htmlsx.sx — the pure-SX HTML → SX converter (host/html->sx). Covers text, +;; entities, void/nested tags, attributes, figure/iframe, and an end-to-end import round-trip. + +(define host-ht-pass 0) +(define host-ht-fail 0) +(define host-ht-fails (list)) +(define host-ht-test + (fn (name actual expected) + (if (= actual expected) + (set! host-ht-pass (+ host-ht-pass 1)) + (begin + (set! host-ht-fail (+ host-ht-fail 1)) + (append! host-ht-fails {:name name :actual actual :expected expected}))))) + +;; a paragraph with inline formatting — kept nested (decompose flattens to text later). +(host-ht-test "a

with inline parses to (p \"…\" (strong \"…\") \"…\")" + (str (host/html->sx "

Hello world now

")) + "(article (p \"Hello \" (strong \"world\") \" now\"))") +;; HTML entities decode to UTF-8 (not \\uXXXX). +(host-ht-test "entities decode (& £ ’)" + (str (host/html->sx "

Tom & Jerry cost £5 ’n up

")) + "(article (p \"Tom & Jerry cost £5 ’n up\"))") +;; a void keeps its attributes as keyword attrs. +(host-ht-test "a void keeps :src/:alt attrs" + (str (host/html->sx "\"a")) + "(article (img :alt \"a photo\" :src \"a.jpg\"))") +;; a
with an +
nests correctly. +(host-ht-test "a
nests an and a
" + (str (host/html->sx "
\"y\"
a caption
")) + "(article (figure (img :alt \"y\" :src \"y.jpg\") (figcaption \"a caption\")))") +;; an ")) + "(article (iframe :src \"https://youtube.com/embed/x\"))") +;; comments + doctype are skipped; whitespace-only text is dropped. +(host-ht-test "comments/doctype are skipped, blank text dropped" + (str (host/html->sx "

x

\n

y

")) + "(article (p \"x\") (p \"y\"))") +;; headings map through (decompose then turns h2 into card-heading). +(host-ht-test "headings + paragraphs come through in order" + (str (host/html->sx "

Title

body

")) + "(article (h2 \"Title\") (p \"body\"))") + +;; ── END TO END: HTML → SX → decompose → typed card objects ────────── +(host/blog-use-store! (persist/open)) +(host/blog-seed-types!) +(host-ht-test "html->sx feeds decompose: a real snippet becomes typed cards" + (begin + (host/blog-put! "htdoc" "HT" "(p)" "published") + (host/blog--decompose! "htdoc" + (host/html->sx "

Heading

Some bold text.

\"a\"
cap
")) + (list (host/blog-is-a? "htdoc__body__b0" "card-heading") + (host/blog-is-a? "htdoc__body__b1" "card-text") + (get (host/blog-field-values-of "htdoc__body__b1") "text") ;; strong flattened to text + (host/blog-is-a? "htdoc__body__b2" "card-image") + (get (host/blog-field-values-of "htdoc__body__b2") "caption") + (host/blog-is-a? "htdoc__body__b3" "card-embed"))) + (list true true "Some bold text." true "cap" true)) + +(define host-ht-tests-run! + (fn () + {:total (+ host-ht-pass host-ht-fail) + :passed host-ht-pass :failed host-ht-fail :fails host-ht-fails}))