lib/host/htmlsx.sx — a pure-SX HTML → SX converter (char-level tokenizer + stack parser): host/html->sx turns a post's HTML into an (article …) tree that host/blog--decompose! consumes — img / p / figure+figcaption / iframe / headings / blockquote / lists, inline strong/em/a kept nested (decompose flattens to text), entities decoded to UTF-8, comments+doctype skipped. This replaces the one-off external Python converter used for the nt-live-encore import. import-post! now accepts a raw "html" field (converted via html->sx, serialized to sx_content, decomposed) alongside "sx_content" — so importing real Ghost HTML is first-class. Wired htmlsx.sx into conformance.sh + serve.sh module lists (loads in conformance AND live). New htmlsx suite 8/8 (text/entities/void/nested/figure/iframe/comments + an html→sx→decompose→ typed-cards round-trip); blog 197/197 (+ import-from-html test). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
117 lines
6.7 KiB
Plaintext
117 lines
6.7 KiB
Plaintext
;; lib/host/htmlsx.sx — a pure-SX HTML → SX converter (the "radar migrator" core). Turns a
|
||
;; post's HTML content into an SX (article …) tree that host/blog--decompose! consumes: img,
|
||
;; p, figure/figcaption, iframe, headings, blockquote, lists, inline strong/em/a (kept nested;
|
||
;; decompose flattens them to text at the block level). Char-level tokenizer + a stack parser.
|
||
;; NOTE: substr is (string, start, LENGTH); index-of returns -1 when absent.
|
||
|
||
;; ── string helpers ──────────────────────────────────────────────────
|
||
(define host/html--at (fn (s i) (if (< i (len s)) (substr s i 1) "")))
|
||
(define host/html--from (fn (s i) (substr s i (- (len s) i)))) ;; s[i:]
|
||
(define host/html--slice (fn (s a b) (substr s a (- b a)))) ;; s[a:b)
|
||
(define host/html--replace-all
|
||
(fn (s old new)
|
||
(let ((i (index-of s old)))
|
||
(if (< i 0) s
|
||
(str (host/html--slice s 0 i) new
|
||
(host/html--replace-all (host/html--from s (+ i (len old))) old new))))))
|
||
|
||
;; ── entity decode (the common named + a few numeric entities → UTF-8) ──
|
||
(define host/html--entities
|
||
(list (list " " " ") (list "&" "&") (list "<" "<") (list ">" ">")
|
||
(list """ "\"") (list "'" "'") (list "'" "'") (list "'" "'")
|
||
(list "’" "’") (list "’" "’") (list "‘" "‘")
|
||
(list "…" "…") (list "…" "…") (list "—" "—") (list "–" "–")
|
||
(list "£" "£") (list "£" "£") (list "£" "£")))
|
||
(define host/html--decode
|
||
(fn (s) (reduce (fn (acc pair) (host/html--replace-all acc (first pair) (first (rest pair)))) s host/html--entities)))
|
||
|
||
;; ── tag classification + name/attr parsing ──────────────────────────
|
||
(define host/html--void?
|
||
(fn (n) (contains? (list "img" "br" "hr" "iframe" "input" "meta" "link" "source" "embed") n)))
|
||
;; the tag name from a tag's inner text ("img src=…" -> "img"): up to the first space or '/'.
|
||
(define host/html--tag-name
|
||
(fn (inner)
|
||
(let ((sp (index-of inner " ")))
|
||
(lower (trim (host/html--replace-all (if (< sp 0) inner (host/html--slice inner 0 sp)) "/" ""))))))
|
||
;; parse the attrs of a tag's inner text into a dict (quoted or unquoted values).
|
||
(define host/html--attrs-loop
|
||
(fn (rest acc)
|
||
(let ((r (trim rest)))
|
||
(if (or (= r "") (= r "/")) acc
|
||
(let ((eq (index-of r "=")))
|
||
(if (< eq 0) acc
|
||
(let ((name (lower (trim (host/html--slice r 0 eq))))
|
||
(after (trim (host/html--from r (+ eq 1)))))
|
||
(let ((q (host/html--at after 0)))
|
||
(if (or (= q "\"") (= q "'"))
|
||
(let ((close (index-of (host/html--from after 1) q)))
|
||
(if (< close 0) acc
|
||
(host/html--attrs-loop (host/html--from after (+ close 2))
|
||
(assoc acc name (host/html--decode (host/html--slice after 1 (+ 1 close)))))))
|
||
(let ((sp2 (index-of after " ")))
|
||
(host/html--attrs-loop (if (< sp2 0) "" (host/html--from after sp2))
|
||
(assoc acc name (if (< sp2 0) after (host/html--slice after 0 sp2))))))))))))))
|
||
(define host/html--parse-attrs
|
||
(fn (inner)
|
||
(let ((sp (index-of inner " ")))
|
||
(if (< sp 0) {} (host/html--attrs-loop (host/html--from inner (+ sp 1)) {})))))
|
||
|
||
;; ── tokenizer: HTML string → a list of {:t text|open|close|void …} tokens ──
|
||
(define host/html--tokens
|
||
(fn (s)
|
||
(let loop ((i 0) (acc (list)))
|
||
(if (>= i (len s)) acc
|
||
(if (= (host/html--at s i) "<")
|
||
(let ((rel (index-of (host/html--from s i) ">")))
|
||
(if (< rel 0) acc
|
||
(let ((gt (+ i rel)) (inner (host/html--slice s (+ i 1) (+ i rel))))
|
||
(cond
|
||
((starts-with? inner "!") (loop (+ gt 1) acc)) ;; comment / doctype
|
||
((starts-with? inner "/")
|
||
(loop (+ gt 1) (concat acc (list {:t "close" :name (host/html--tag-name (host/html--from inner 1))}))))
|
||
(else
|
||
(let ((name (host/html--tag-name inner)))
|
||
(loop (+ gt 1) (concat acc (list {:t (if (or (host/html--void? name) (ends-with? inner "/")) "void" "open")
|
||
:name name :attrs (host/html--parse-attrs inner)})))))))))
|
||
(let ((rel (index-of (host/html--from s i) "<")))
|
||
(let ((te (if (< rel 0) (len s) (+ i rel))))
|
||
(let ((txt (host/html--decode (host/html--slice s i te))))
|
||
(loop te (if (= (trim txt) "") acc (concat acc (list {:t "text" :text txt}))))))))))))
|
||
|
||
;; ── parser: tokens → a tree of {:name :attrs :kids} nodes (kids: node | string, in order).
|
||
;; A functional stack of open frames; a synthetic root frame collects the top-level nodes. ──
|
||
(define host/html--push-kid
|
||
(fn (stack kid)
|
||
(let ((top (first stack)))
|
||
(cons (assoc top :kids (concat (get top :kids) (list kid))) (rest stack)))))
|
||
(define host/html--parse
|
||
(fn (tokens)
|
||
(let loop ((ts tokens) (stack (list {:name "article" :attrs {} :kids (list)})))
|
||
(if (empty? ts) (get (first stack) :kids)
|
||
(let ((tok (first ts)))
|
||
(cond
|
||
((= (get tok :t) "text") (loop (rest ts) (host/html--push-kid stack (get tok :text))))
|
||
((= (get tok :t) "void") (loop (rest ts) (host/html--push-kid stack {:name (get tok :name) :attrs (get tok :attrs) :kids (list)})))
|
||
((= (get tok :t) "open") (loop (rest ts) (cons {:name (get tok :name) :attrs (get tok :attrs) :kids (list)} stack)))
|
||
((= (get tok :t) "close")
|
||
(if (> (len stack) 1)
|
||
(loop (rest ts) (host/html--push-kid (rest stack) (first stack)))
|
||
(loop (rest ts) stack)))
|
||
(else (loop (rest ts) stack))))))))
|
||
|
||
;; ── tree → SX. A node becomes (name :attr val … child …); text stays a string. Attr keys
|
||
;; become keywords via parse-safe (":src" -> the keyword :src) so decompose reads them. ──
|
||
(define host/html--attrs->sx
|
||
(fn (attrs)
|
||
(reduce (fn (acc k) (concat acc (list (parse-safe (str ":" k)) (get attrs k)))) (list) (keys attrs))))
|
||
(define host/html--node->sx
|
||
(fn (node)
|
||
(if (= (type-of node) "string") node
|
||
(cons (string->symbol (get node :name))
|
||
(concat (host/html--attrs->sx (get node :attrs))
|
||
(map host/html--node->sx (get node :kids)))))))
|
||
;; HTML content string → an (article …) SX tree, ready for host/blog--decompose!.
|
||
(define host/html->sx
|
||
(fn (html)
|
||
(cons (quote article) (map host/html--node->sx (host/html--parse (host/html--tokens html))))))
|