Files
rose-ash/lib/host/htmlsx.sx
giles 7e2275b90c host: SX-native HTML→SX converter (the radar migrator) + first-class HTML import
lib/host/htmlsx.sx — a pure-SX HTML → SX converter (char-level tokenizer + stack parser):
host/html->sx turns a post's HTML into an (article …) tree that host/blog--decompose! consumes
— img / p / figure+figcaption / iframe / headings / blockquote / lists, inline strong/em/a kept
nested (decompose flattens to text), entities decoded to UTF-8, comments+doctype skipped. This
replaces the one-off external Python converter used for the nt-live-encore import.

import-post! now accepts a raw "html" field (converted via html->sx, serialized to sx_content,
decomposed) alongside "sx_content" — so importing real Ghost HTML is first-class. Wired
htmlsx.sx into conformance.sh + serve.sh module lists (loads in conformance AND live).

New htmlsx suite 8/8 (text/entities/void/nested/figure/iframe/comments + an html→sx→decompose→
typed-cards round-trip); blog 197/197 (+ import-from-html test).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-07-01 15:32:06 +00:00

117 lines
6.7 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
;; lib/host/htmlsx.sx — a pure-SX HTML → SX converter (the "radar migrator" core). Turns a
;; post's HTML content into an SX (article …) tree that host/blog--decompose! consumes: img,
;; p, figure/figcaption, iframe, headings, blockquote, lists, inline strong/em/a (kept nested;
;; decompose flattens them to text at the block level). Char-level tokenizer + a stack parser.
;; NOTE: substr is (string, start, LENGTH); index-of returns -1 when absent.
;; ── string helpers ──────────────────────────────────────────────────
(define host/html--at (fn (s i) (if (< i (len s)) (substr s i 1) "")))
(define host/html--from (fn (s i) (substr s i (- (len s) i)))) ;; s[i:]
(define host/html--slice (fn (s a b) (substr s a (- b a)))) ;; s[a:b)
(define host/html--replace-all
(fn (s old new)
(let ((i (index-of s old)))
(if (< i 0) s
(str (host/html--slice s 0 i) new
(host/html--replace-all (host/html--from s (+ i (len old))) old new))))))
;; ── entity decode (the common named + a few numeric entities → UTF-8) ──
(define host/html--entities
(list (list "&nbsp;" " ") (list "&amp;" "&") (list "&lt;" "<") (list "&gt;" ">")
(list "&quot;" "\"") (list "&#39;" "'") (list "&apos;" "'") (list "&#x27;" "'")
(list "&#x2019;" "") (list "&#8217;" "") (list "&#x2018;" "")
(list "&hellip;" "…") (list "&#x2026;" "…") (list "&mdash;" "—") (list "&ndash;" "")
(list "&pound;" "£") (list "&#xA3;" "£") (list "&#163;" "£")))
(define host/html--decode
(fn (s) (reduce (fn (acc pair) (host/html--replace-all acc (first pair) (first (rest pair)))) s host/html--entities)))
;; ── tag classification + name/attr parsing ──────────────────────────
(define host/html--void?
(fn (n) (contains? (list "img" "br" "hr" "iframe" "input" "meta" "link" "source" "embed") n)))
;; the tag name from a tag's inner text ("img src=…" -> "img"): up to the first space or '/'.
(define host/html--tag-name
(fn (inner)
(let ((sp (index-of inner " ")))
(lower (trim (host/html--replace-all (if (< sp 0) inner (host/html--slice inner 0 sp)) "/" ""))))))
;; parse the attrs of a tag's inner text into a dict (quoted or unquoted values).
(define host/html--attrs-loop
(fn (rest acc)
(let ((r (trim rest)))
(if (or (= r "") (= r "/")) acc
(let ((eq (index-of r "=")))
(if (< eq 0) acc
(let ((name (lower (trim (host/html--slice r 0 eq))))
(after (trim (host/html--from r (+ eq 1)))))
(let ((q (host/html--at after 0)))
(if (or (= q "\"") (= q "'"))
(let ((close (index-of (host/html--from after 1) q)))
(if (< close 0) acc
(host/html--attrs-loop (host/html--from after (+ close 2))
(assoc acc name (host/html--decode (host/html--slice after 1 (+ 1 close)))))))
(let ((sp2 (index-of after " ")))
(host/html--attrs-loop (if (< sp2 0) "" (host/html--from after sp2))
(assoc acc name (if (< sp2 0) after (host/html--slice after 0 sp2))))))))))))))
(define host/html--parse-attrs
(fn (inner)
(let ((sp (index-of inner " ")))
(if (< sp 0) {} (host/html--attrs-loop (host/html--from inner (+ sp 1)) {})))))
;; ── tokenizer: HTML string → a list of {:t text|open|close|void …} tokens ──
(define host/html--tokens
(fn (s)
(let loop ((i 0) (acc (list)))
(if (>= i (len s)) acc
(if (= (host/html--at s i) "<")
(let ((rel (index-of (host/html--from s i) ">")))
(if (< rel 0) acc
(let ((gt (+ i rel)) (inner (host/html--slice s (+ i 1) (+ i rel))))
(cond
((starts-with? inner "!") (loop (+ gt 1) acc)) ;; comment / doctype
((starts-with? inner "/")
(loop (+ gt 1) (concat acc (list {:t "close" :name (host/html--tag-name (host/html--from inner 1))}))))
(else
(let ((name (host/html--tag-name inner)))
(loop (+ gt 1) (concat acc (list {:t (if (or (host/html--void? name) (ends-with? inner "/")) "void" "open")
:name name :attrs (host/html--parse-attrs inner)})))))))))
(let ((rel (index-of (host/html--from s i) "<")))
(let ((te (if (< rel 0) (len s) (+ i rel))))
(let ((txt (host/html--decode (host/html--slice s i te))))
(loop te (if (= (trim txt) "") acc (concat acc (list {:t "text" :text txt}))))))))))))
;; ── parser: tokens → a tree of {:name :attrs :kids} nodes (kids: node | string, in order).
;; A functional stack of open frames; a synthetic root frame collects the top-level nodes. ──
(define host/html--push-kid
(fn (stack kid)
(let ((top (first stack)))
(cons (assoc top :kids (concat (get top :kids) (list kid))) (rest stack)))))
(define host/html--parse
(fn (tokens)
(let loop ((ts tokens) (stack (list {:name "article" :attrs {} :kids (list)})))
(if (empty? ts) (get (first stack) :kids)
(let ((tok (first ts)))
(cond
((= (get tok :t) "text") (loop (rest ts) (host/html--push-kid stack (get tok :text))))
((= (get tok :t) "void") (loop (rest ts) (host/html--push-kid stack {:name (get tok :name) :attrs (get tok :attrs) :kids (list)})))
((= (get tok :t) "open") (loop (rest ts) (cons {:name (get tok :name) :attrs (get tok :attrs) :kids (list)} stack)))
((= (get tok :t) "close")
(if (> (len stack) 1)
(loop (rest ts) (host/html--push-kid (rest stack) (first stack)))
(loop (rest ts) stack)))
(else (loop (rest ts) stack))))))))
;; ── tree → SX. A node becomes (name :attr val … child …); text stays a string. Attr keys
;; become keywords via parse-safe (":src" -> the keyword :src) so decompose reads them. ──
(define host/html--attrs->sx
(fn (attrs)
(reduce (fn (acc k) (concat acc (list (parse-safe (str ":" k)) (get attrs k)))) (list) (keys attrs))))
(define host/html--node->sx
(fn (node)
(if (= (type-of node) "string") node
(cons (string->symbol (get node :name))
(concat (host/html--attrs->sx (get node :attrs))
(map host/html--node->sx (get node :kids)))))))
;; HTML content string → an (article …) SX tree, ready for host/blog--decompose!.
(define host/html->sx
(fn (html)
(cons (quote article) (map host/html--node->sx (host/html--parse (host/html--tokens html))))))