host: SX-native HTML→SX converter (the radar migrator) + first-class HTML import
lib/host/htmlsx.sx — a pure-SX HTML → SX converter (char-level tokenizer + stack parser): host/html->sx turns a post's HTML into an (article …) tree that host/blog--decompose! consumes — img / p / figure+figcaption / iframe / headings / blockquote / lists, inline strong/em/a kept nested (decompose flattens to text), entities decoded to UTF-8, comments+doctype skipped. This replaces the one-off external Python converter used for the nt-live-encore import. import-post! now accepts a raw "html" field (converted via html->sx, serialized to sx_content, decomposed) alongside "sx_content" — so importing real Ghost HTML is first-class. Wired htmlsx.sx into conformance.sh + serve.sh module lists (loads in conformance AND live). New htmlsx suite 8/8 (text/entities/void/nested/figure/iframe/comments + an html→sx→decompose→ typed-cards round-trip); blog 197/197 (+ import-from-html test). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -2211,9 +2211,16 @@
|
|||||||
;; (put!/seed!/relate! are sets). Contract: plans/NOTE-blog-types-for-radar.md.
|
;; (put!/seed!/relate! are sets). Contract: plans/NOTE-blog-types-for-radar.md.
|
||||||
(define host/blog-import-post!
|
(define host/blog-import-post!
|
||||||
(fn (gp)
|
(fn (gp)
|
||||||
(let ((slug (get gp "slug")) (title (get gp "title")))
|
(let ((slug (get gp "slug")) (title (get gp "title"))
|
||||||
|
;; content may arrive as raw "html" (converted to an SX tree by the pure-SX
|
||||||
|
;; converter) OR as "sx_content" (SX source). Either way -> one tree.
|
||||||
|
(tree (if (get gp "html")
|
||||||
|
(host/html->sx (get gp "html"))
|
||||||
|
(parse-safe (or (get gp "sx_content") "")))))
|
||||||
(begin
|
(begin
|
||||||
(host/blog-put! slug title (or (get gp "sx_content") "") (or (get gp "status") "published"))
|
(host/blog-put! slug title
|
||||||
|
(if (get gp "html") (serialize tree) (or (get gp "sx_content") ""))
|
||||||
|
(or (get gp "status") "published"))
|
||||||
(host/blog-relate! slug "article" "is-a")
|
(host/blog-relate! slug "article" "is-a")
|
||||||
(host/blog--set-field-values! slug
|
(host/blog--set-field-values! slug
|
||||||
{"subtitle" (or (get gp "custom_excerpt") (get gp "excerpt") "")
|
{"subtitle" (or (get gp "custom_excerpt") (get gp "excerpt") "")
|
||||||
@@ -2226,10 +2233,9 @@
|
|||||||
(host/blog-relate! tslug "tag" "is-a")
|
(host/blog-relate! tslug "tag" "is-a")
|
||||||
(host/blog-relate! slug tslug "tagged"))))
|
(host/blog-relate! slug tslug "tagged"))))
|
||||||
(or (get gp "tags") (list)))
|
(or (get gp "tags") (list)))
|
||||||
;; cards-as-objects: decompose the Ghost body into card objects + a `contains`
|
;; cards-as-objects: decompose the (html- or sx-derived) content tree into card
|
||||||
;; body, so the post renders via the composition fold (its :body supersedes the
|
;; objects + a `contains` body, so the post renders via the composition fold.
|
||||||
;; opaque sx_content). parse-safe degrades to nil on bad input -> decompose no-ops.
|
(host/blog--decompose! slug tree)
|
||||||
(host/blog--decompose! slug (parse-safe (or (get gp "sx_content") "")))
|
|
||||||
slug))))
|
slug))))
|
||||||
;; Import a batch; returns the imported slugs.
|
;; Import a batch; returns the imported slugs.
|
||||||
(define host/blog-import-all!
|
(define host/blog-import-all!
|
||||||
|
|||||||
@@ -94,6 +94,7 @@ MODULES=(
|
|||||||
"lib/host/relations.sx"
|
"lib/host/relations.sx"
|
||||||
"lib/host/compose.sx"
|
"lib/host/compose.sx"
|
||||||
"lib/host/execute.sx"
|
"lib/host/execute.sx"
|
||||||
|
"lib/host/htmlsx.sx"
|
||||||
"lib/host/blog.sx"
|
"lib/host/blog.sx"
|
||||||
"lib/host/page.sx"
|
"lib/host/page.sx"
|
||||||
"lib/host/server.sx"
|
"lib/host/server.sx"
|
||||||
@@ -109,6 +110,7 @@ SUITES=(
|
|||||||
"feed host-fd-tests-run! lib/host/tests/feed.sx"
|
"feed host-fd-tests-run! lib/host/tests/feed.sx"
|
||||||
"relations host-rl-tests-run! lib/host/tests/relations.sx"
|
"relations host-rl-tests-run! lib/host/tests/relations.sx"
|
||||||
"blog host-bl-tests-run! lib/host/tests/blog.sx"
|
"blog host-bl-tests-run! lib/host/tests/blog.sx"
|
||||||
|
"htmlsx host-ht-tests-run! lib/host/tests/htmlsx.sx"
|
||||||
"compose host-cp-tests-run! lib/host/tests/compose.sx"
|
"compose host-cp-tests-run! lib/host/tests/compose.sx"
|
||||||
"execute host-ex-tests-run! lib/host/tests/execute.sx"
|
"execute host-ex-tests-run! lib/host/tests/execute.sx"
|
||||||
"session host-se-tests-run! lib/host/tests/session.sx"
|
"session host-se-tests-run! lib/host/tests/session.sx"
|
||||||
|
|||||||
116
lib/host/htmlsx.sx
Normal file
116
lib/host/htmlsx.sx
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
;; lib/host/htmlsx.sx — a pure-SX HTML → SX converter (the "radar migrator" core). Turns a
|
||||||
|
;; post's HTML content into an SX (article …) tree that host/blog--decompose! consumes: img,
|
||||||
|
;; p, figure/figcaption, iframe, headings, blockquote, lists, inline strong/em/a (kept nested;
|
||||||
|
;; decompose flattens them to text at the block level). Char-level tokenizer + a stack parser.
|
||||||
|
;; NOTE: substr is (string, start, LENGTH); index-of returns -1 when absent.
|
||||||
|
|
||||||
|
;; ── string helpers ──────────────────────────────────────────────────
|
||||||
|
(define host/html--at (fn (s i) (if (< i (len s)) (substr s i 1) "")))
|
||||||
|
(define host/html--from (fn (s i) (substr s i (- (len s) i)))) ;; s[i:]
|
||||||
|
(define host/html--slice (fn (s a b) (substr s a (- b a)))) ;; s[a:b)
|
||||||
|
(define host/html--replace-all
|
||||||
|
(fn (s old new)
|
||||||
|
(let ((i (index-of s old)))
|
||||||
|
(if (< i 0) s
|
||||||
|
(str (host/html--slice s 0 i) new
|
||||||
|
(host/html--replace-all (host/html--from s (+ i (len old))) old new))))))
|
||||||
|
|
||||||
|
;; ── entity decode (the common named + a few numeric entities → UTF-8) ──
|
||||||
|
(define host/html--entities
|
||||||
|
(list (list " " " ") (list "&" "&") (list "<" "<") (list ">" ">")
|
||||||
|
(list """ "\"") (list "'" "'") (list "'" "'") (list "'" "'")
|
||||||
|
(list "’" "’") (list "’" "’") (list "‘" "‘")
|
||||||
|
(list "…" "…") (list "…" "…") (list "—" "—") (list "–" "–")
|
||||||
|
(list "£" "£") (list "£" "£") (list "£" "£")))
|
||||||
|
(define host/html--decode
|
||||||
|
(fn (s) (reduce (fn (acc pair) (host/html--replace-all acc (first pair) (first (rest pair)))) s host/html--entities)))
|
||||||
|
|
||||||
|
;; ── tag classification + name/attr parsing ──────────────────────────
|
||||||
|
(define host/html--void?
|
||||||
|
(fn (n) (contains? (list "img" "br" "hr" "iframe" "input" "meta" "link" "source" "embed") n)))
|
||||||
|
;; the tag name from a tag's inner text ("img src=…" -> "img"): up to the first space or '/'.
|
||||||
|
(define host/html--tag-name
|
||||||
|
(fn (inner)
|
||||||
|
(let ((sp (index-of inner " ")))
|
||||||
|
(lower (trim (host/html--replace-all (if (< sp 0) inner (host/html--slice inner 0 sp)) "/" ""))))))
|
||||||
|
;; parse the attrs of a tag's inner text into a dict (quoted or unquoted values).
|
||||||
|
(define host/html--attrs-loop
|
||||||
|
(fn (rest acc)
|
||||||
|
(let ((r (trim rest)))
|
||||||
|
(if (or (= r "") (= r "/")) acc
|
||||||
|
(let ((eq (index-of r "=")))
|
||||||
|
(if (< eq 0) acc
|
||||||
|
(let ((name (lower (trim (host/html--slice r 0 eq))))
|
||||||
|
(after (trim (host/html--from r (+ eq 1)))))
|
||||||
|
(let ((q (host/html--at after 0)))
|
||||||
|
(if (or (= q "\"") (= q "'"))
|
||||||
|
(let ((close (index-of (host/html--from after 1) q)))
|
||||||
|
(if (< close 0) acc
|
||||||
|
(host/html--attrs-loop (host/html--from after (+ close 2))
|
||||||
|
(assoc acc name (host/html--decode (host/html--slice after 1 (+ 1 close)))))))
|
||||||
|
(let ((sp2 (index-of after " ")))
|
||||||
|
(host/html--attrs-loop (if (< sp2 0) "" (host/html--from after sp2))
|
||||||
|
(assoc acc name (if (< sp2 0) after (host/html--slice after 0 sp2))))))))))))))
|
||||||
|
(define host/html--parse-attrs
|
||||||
|
(fn (inner)
|
||||||
|
(let ((sp (index-of inner " ")))
|
||||||
|
(if (< sp 0) {} (host/html--attrs-loop (host/html--from inner (+ sp 1)) {})))))
|
||||||
|
|
||||||
|
;; ── tokenizer: HTML string → a list of {:t text|open|close|void …} tokens ──
|
||||||
|
(define host/html--tokens
|
||||||
|
(fn (s)
|
||||||
|
(let loop ((i 0) (acc (list)))
|
||||||
|
(if (>= i (len s)) acc
|
||||||
|
(if (= (host/html--at s i) "<")
|
||||||
|
(let ((rel (index-of (host/html--from s i) ">")))
|
||||||
|
(if (< rel 0) acc
|
||||||
|
(let ((gt (+ i rel)) (inner (host/html--slice s (+ i 1) (+ i rel))))
|
||||||
|
(cond
|
||||||
|
((starts-with? inner "!") (loop (+ gt 1) acc)) ;; comment / doctype
|
||||||
|
((starts-with? inner "/")
|
||||||
|
(loop (+ gt 1) (concat acc (list {:t "close" :name (host/html--tag-name (host/html--from inner 1))}))))
|
||||||
|
(else
|
||||||
|
(let ((name (host/html--tag-name inner)))
|
||||||
|
(loop (+ gt 1) (concat acc (list {:t (if (or (host/html--void? name) (ends-with? inner "/")) "void" "open")
|
||||||
|
:name name :attrs (host/html--parse-attrs inner)})))))))))
|
||||||
|
(let ((rel (index-of (host/html--from s i) "<")))
|
||||||
|
(let ((te (if (< rel 0) (len s) (+ i rel))))
|
||||||
|
(let ((txt (host/html--decode (host/html--slice s i te))))
|
||||||
|
(loop te (if (= (trim txt) "") acc (concat acc (list {:t "text" :text txt}))))))))))))
|
||||||
|
|
||||||
|
;; ── parser: tokens → a tree of {:name :attrs :kids} nodes (kids: node | string, in order).
|
||||||
|
;; A functional stack of open frames; a synthetic root frame collects the top-level nodes. ──
|
||||||
|
(define host/html--push-kid
|
||||||
|
(fn (stack kid)
|
||||||
|
(let ((top (first stack)))
|
||||||
|
(cons (assoc top :kids (concat (get top :kids) (list kid))) (rest stack)))))
|
||||||
|
(define host/html--parse
|
||||||
|
(fn (tokens)
|
||||||
|
(let loop ((ts tokens) (stack (list {:name "article" :attrs {} :kids (list)})))
|
||||||
|
(if (empty? ts) (get (first stack) :kids)
|
||||||
|
(let ((tok (first ts)))
|
||||||
|
(cond
|
||||||
|
((= (get tok :t) "text") (loop (rest ts) (host/html--push-kid stack (get tok :text))))
|
||||||
|
((= (get tok :t) "void") (loop (rest ts) (host/html--push-kid stack {:name (get tok :name) :attrs (get tok :attrs) :kids (list)})))
|
||||||
|
((= (get tok :t) "open") (loop (rest ts) (cons {:name (get tok :name) :attrs (get tok :attrs) :kids (list)} stack)))
|
||||||
|
((= (get tok :t) "close")
|
||||||
|
(if (> (len stack) 1)
|
||||||
|
(loop (rest ts) (host/html--push-kid (rest stack) (first stack)))
|
||||||
|
(loop (rest ts) stack)))
|
||||||
|
(else (loop (rest ts) stack))))))))
|
||||||
|
|
||||||
|
;; ── tree → SX. A node becomes (name :attr val … child …); text stays a string. Attr keys
|
||||||
|
;; become keywords via parse-safe (":src" -> the keyword :src) so decompose reads them. ──
|
||||||
|
(define host/html--attrs->sx
|
||||||
|
(fn (attrs)
|
||||||
|
(reduce (fn (acc k) (concat acc (list (parse-safe (str ":" k)) (get attrs k)))) (list) (keys attrs))))
|
||||||
|
(define host/html--node->sx
|
||||||
|
(fn (node)
|
||||||
|
(if (= (type-of node) "string") node
|
||||||
|
(cons (string->symbol (get node :name))
|
||||||
|
(concat (host/html--attrs->sx (get node :attrs))
|
||||||
|
(map host/html--node->sx (get node :kids)))))))
|
||||||
|
;; HTML content string → an (article …) SX tree, ready for host/blog--decompose!.
|
||||||
|
(define host/html->sx
|
||||||
|
(fn (html)
|
||||||
|
(cons (quote article) (map host/html--node->sx (host/html--parse (host/html--tokens html))))))
|
||||||
@@ -88,6 +88,7 @@ MODULES=(
|
|||||||
"lib/host/relations.sx"
|
"lib/host/relations.sx"
|
||||||
"lib/host/compose.sx"
|
"lib/host/compose.sx"
|
||||||
"lib/host/execute.sx"
|
"lib/host/execute.sx"
|
||||||
|
"lib/host/htmlsx.sx"
|
||||||
"lib/host/blog.sx"
|
"lib/host/blog.sx"
|
||||||
"lib/host/server.sx"
|
"lib/host/server.sx"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -835,6 +835,17 @@
|
|||||||
(host/blog-is-a? "imp-fig__body__b1" "card-embed")
|
(host/blog-is-a? "imp-fig__body__b1" "card-embed")
|
||||||
(get (host/blog-field-values-of "imp-fig__body__b1") "url")))
|
(get (host/blog-field-values-of "imp-fig__body__b1") "url")))
|
||||||
(list true "the cap" "p.jpg" true "https://youtube.com/embed/xyz"))
|
(list true "the cap" "p.jpg" true "https://youtube.com/embed/xyz"))
|
||||||
|
;; import accepts RAW HTML (converted by the pure-SX host/html->sx) — the first-class path,
|
||||||
|
;; replacing the one-off external converter used for the nt-live-encore import.
|
||||||
|
(host-bl-test "import-post! accepts raw \"html\" (html->sx) and decomposes it into typed cards"
|
||||||
|
(begin
|
||||||
|
(host/blog-import-post! {"slug" "html-imp" "title" "HI" "status" "published"
|
||||||
|
"html" "<h2>Hi</h2><p>Some <strong>bold</strong> text.</p><img src=\"p.jpg\" alt=\"a\">"})
|
||||||
|
(list (host/blog-is-a? "html-imp__body__b0" "card-heading")
|
||||||
|
(host/blog-is-a? "html-imp__body__b1" "card-text")
|
||||||
|
(get (host/blog-field-values-of "html-imp__body__b1") "text")
|
||||||
|
(host/blog-is-a? "html-imp__body__b2" "card-image")))
|
||||||
|
(list true true "Some bold text." true))
|
||||||
;; -- block editor: structural edits to the post :body composition (step 6). --
|
;; -- block editor: structural edits to the post :body composition (step 6). --
|
||||||
(host-bl-test "block-add! creates a card object + contains edge + appends a ref to the body"
|
(host-bl-test "block-add! creates a card object + contains edge + appends a ref to the body"
|
||||||
(begin
|
(begin
|
||||||
|
|||||||
63
lib/host/tests/htmlsx.sx
Normal file
63
lib/host/tests/htmlsx.sx
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
;; lib/host/tests/htmlsx.sx — the pure-SX HTML → SX converter (host/html->sx). Covers text,
|
||||||
|
;; entities, void/nested tags, attributes, figure/iframe, and an end-to-end import round-trip.
|
||||||
|
|
||||||
|
(define host-ht-pass 0)
|
||||||
|
(define host-ht-fail 0)
|
||||||
|
(define host-ht-fails (list))
|
||||||
|
(define host-ht-test
|
||||||
|
(fn (name actual expected)
|
||||||
|
(if (= actual expected)
|
||||||
|
(set! host-ht-pass (+ host-ht-pass 1))
|
||||||
|
(begin
|
||||||
|
(set! host-ht-fail (+ host-ht-fail 1))
|
||||||
|
(append! host-ht-fails {:name name :actual actual :expected expected})))))
|
||||||
|
|
||||||
|
;; a paragraph with inline formatting — kept nested (decompose flattens to text later).
|
||||||
|
(host-ht-test "a <p> with inline <strong> parses to (p \"…\" (strong \"…\") \"…\")"
|
||||||
|
(str (host/html->sx "<p>Hello <strong>world</strong> now</p>"))
|
||||||
|
"(article (p \"Hello \" (strong \"world\") \" now\"))")
|
||||||
|
;; HTML entities decode to UTF-8 (not \\uXXXX).
|
||||||
|
(host-ht-test "entities decode (& £ ’)"
|
||||||
|
(str (host/html->sx "<p>Tom & Jerry cost £5 ’n up</p>"))
|
||||||
|
"(article (p \"Tom & Jerry cost £5 ’n up\"))")
|
||||||
|
;; a void <img> keeps its attributes as keyword attrs.
|
||||||
|
(host-ht-test "a void <img> keeps :src/:alt attrs"
|
||||||
|
(str (host/html->sx "<img src=\"a.jpg\" alt=\"a photo\">"))
|
||||||
|
"(article (img :alt \"a photo\" :src \"a.jpg\"))")
|
||||||
|
;; a <figure> with an <img> + <figcaption> nests correctly.
|
||||||
|
(host-ht-test "a <figure> nests an <img> and a <figcaption>"
|
||||||
|
(str (host/html->sx "<figure><img src=\"y.jpg\" alt=\"y\"><figcaption>a caption</figcaption></figure>"))
|
||||||
|
"(article (figure (img :alt \"y\" :src \"y.jpg\") (figcaption \"a caption\")))")
|
||||||
|
;; an <iframe> is a void-ish embed (self-contained token).
|
||||||
|
(host-ht-test "an <iframe> becomes a leaf with its :src"
|
||||||
|
(str (host/html->sx "<iframe src=\"https://youtube.com/embed/x\"></iframe>"))
|
||||||
|
"(article (iframe :src \"https://youtube.com/embed/x\"))")
|
||||||
|
;; comments + doctype are skipped; whitespace-only text is dropped.
|
||||||
|
(host-ht-test "comments/doctype are skipped, blank text dropped"
|
||||||
|
(str (host/html->sx "<!-- hi --> <p>x</p>\n <p>y</p>"))
|
||||||
|
"(article (p \"x\") (p \"y\"))")
|
||||||
|
;; headings map through (decompose then turns h2 into card-heading).
|
||||||
|
(host-ht-test "headings + paragraphs come through in order"
|
||||||
|
(str (host/html->sx "<h2>Title</h2><p>body</p>"))
|
||||||
|
"(article (h2 \"Title\") (p \"body\"))")
|
||||||
|
|
||||||
|
;; ── END TO END: HTML → SX → decompose → typed card objects ──────────
|
||||||
|
(host/blog-use-store! (persist/open))
|
||||||
|
(host/blog-seed-types!)
|
||||||
|
(host-ht-test "html->sx feeds decompose: a real snippet becomes typed cards"
|
||||||
|
(begin
|
||||||
|
(host/blog-put! "htdoc" "HT" "(p)" "published")
|
||||||
|
(host/blog--decompose! "htdoc"
|
||||||
|
(host/html->sx "<h2>Heading</h2><p>Some <strong>bold</strong> text.</p><figure><img src=\"p.jpg\" alt=\"a\"><figcaption>cap</figcaption></figure><iframe src=\"https://youtube.com/embed/z\"></iframe>"))
|
||||||
|
(list (host/blog-is-a? "htdoc__body__b0" "card-heading")
|
||||||
|
(host/blog-is-a? "htdoc__body__b1" "card-text")
|
||||||
|
(get (host/blog-field-values-of "htdoc__body__b1") "text") ;; strong flattened to text
|
||||||
|
(host/blog-is-a? "htdoc__body__b2" "card-image")
|
||||||
|
(get (host/blog-field-values-of "htdoc__body__b2") "caption")
|
||||||
|
(host/blog-is-a? "htdoc__body__b3" "card-embed")))
|
||||||
|
(list true true "Some bold text." true "cap" true))
|
||||||
|
|
||||||
|
(define host-ht-tests-run!
|
||||||
|
(fn ()
|
||||||
|
{:total (+ host-ht-pass host-ht-fail)
|
||||||
|
:passed host-ht-pass :failed host-ht-fail :fails host-ht-fails}))
|
||||||
Reference in New Issue
Block a user