;; lib/host/htmlsx.sx — a pure-SX HTML → SX converter (the "radar migrator" core). Turns a
;; post's HTML content into an SX (article …) tree that host/blog--decompose! consumes: img,
;; p, figure/figcaption, iframe, headings, blockquote, lists, inline strong/em/a (kept nested;
;; decompose flattens them to text at the block level). Char-level tokenizer + a stack parser.
;; NOTE: substr is (string, start, LENGTH); index-of returns -1 when absent.
;; ── string helpers ──────────────────────────────────────────────────
(define host/html--at (fn (s i) (if (< i (len s)) (substr s i 1) "")))
(define host/html--from (fn (s i) (substr s i (- (len s) i)))) ;; s[i:]
(define host/html--slice (fn (s a b) (substr s a (- b a)))) ;; s[a:b)
(define host/html--replace-all
(fn (s old new)
(let ((i (index-of s old)))
(if (< i 0) s
(str (host/html--slice s 0 i) new
(host/html--replace-all (host/html--from s (+ i (len old))) old new))))))
;; ── entity decode (the common named + a few numeric entities → UTF-8) ──
(define host/html--entities
(list (list " " " ") (list "&" "&") (list "<" "<") (list ">" ">")
(list """ "\"") (list "'" "'") (list "'" "'") (list "'" "'")
(list "’" "’") (list "’" "’") (list "‘" "‘")
(list "…" "…") (list "…" "…") (list "—" "—") (list "–" "–")
(list "£" "£") (list "£" "£") (list "£" "£")))
(define host/html--decode
(fn (s) (reduce (fn (acc pair) (host/html--replace-all acc (first pair) (first (rest pair)))) s host/html--entities)))
;; ── tag classification + name/attr parsing ──────────────────────────
(define host/html--void?
(fn (n) (contains? (list "img" "br" "hr" "iframe" "input" "meta" "link" "source" "embed") n)))
;; the tag name from a tag's inner text ("img src=…" -> "img"): up to the first space or '/'.
(define host/html--tag-name
(fn (inner)
(let ((sp (index-of inner " ")))
(lower (trim (host/html--replace-all (if (< sp 0) inner (host/html--slice inner 0 sp)) "/" ""))))))
;; parse the attrs of a tag's inner text into a dict (quoted or unquoted values).
(define host/html--attrs-loop
(fn (rest acc)
(let ((r (trim rest)))
(if (or (= r "") (= r "/")) acc
(let ((eq (index-of r "=")))
(if (< eq 0) acc
(let ((name (lower (trim (host/html--slice r 0 eq))))
(after (trim (host/html--from r (+ eq 1)))))
(let ((q (host/html--at after 0)))
(if (or (= q "\"") (= q "'"))
(let ((close (index-of (host/html--from after 1) q)))
(if (< close 0) acc
(host/html--attrs-loop (host/html--from after (+ close 2))
(assoc acc name (host/html--decode (host/html--slice after 1 (+ 1 close)))))))
(let ((sp2 (index-of after " ")))
(host/html--attrs-loop (if (< sp2 0) "" (host/html--from after sp2))
(assoc acc name (if (< sp2 0) after (host/html--slice after 0 sp2))))))))))))))
(define host/html--parse-attrs
(fn (inner)
(let ((sp (index-of inner " ")))
(if (< sp 0) {} (host/html--attrs-loop (host/html--from inner (+ sp 1)) {})))))
;; ── tokenizer: HTML string → a list of {:t text|open|close|void …} tokens ──
(define host/html--tokens
(fn (s)
(let loop ((i 0) (acc (list)))
(if (>= i (len s)) acc
(if (= (host/html--at s i) "<")
(let ((rel (index-of (host/html--from s i) ">")))
(if (< rel 0) acc
(let ((gt (+ i rel)) (inner (host/html--slice s (+ i 1) (+ i rel))))
(cond
((starts-with? inner "!") (loop (+ gt 1) acc)) ;; comment / doctype
((starts-with? inner "/")
(loop (+ gt 1) (concat acc (list {:t "close" :name (host/html--tag-name (host/html--from inner 1))}))))
(else
(let ((name (host/html--tag-name inner)))
(loop (+ gt 1) (concat acc (list {:t (if (or (host/html--void? name) (ends-with? inner "/")) "void" "open")
:name name :attrs (host/html--parse-attrs inner)})))))))))
(let ((rel (index-of (host/html--from s i) "<")))
(let ((te (if (< rel 0) (len s) (+ i rel))))
(let ((txt (host/html--decode (host/html--slice s i te))))
(loop te (if (= (trim txt) "") acc (concat acc (list {:t "text" :text txt}))))))))))))
;; ── parser: tokens → a tree of {:name :attrs :kids} nodes (kids: node | string, in order).
;; A functional stack of open frames; a synthetic root frame collects the top-level nodes. ──
(define host/html--push-kid
(fn (stack kid)
(let ((top (first stack)))
(cons (assoc top :kids (concat (get top :kids) (list kid))) (rest stack)))))
(define host/html--parse
(fn (tokens)
(let loop ((ts tokens) (stack (list {:name "article" :attrs {} :kids (list)})))
(if (empty? ts) (get (first stack) :kids)
(let ((tok (first ts)))
(cond
((= (get tok :t) "text") (loop (rest ts) (host/html--push-kid stack (get tok :text))))
((= (get tok :t) "void") (loop (rest ts) (host/html--push-kid stack {:name (get tok :name) :attrs (get tok :attrs) :kids (list)})))
((= (get tok :t) "open") (loop (rest ts) (cons {:name (get tok :name) :attrs (get tok :attrs) :kids (list)} stack)))
((= (get tok :t) "close")
(if (> (len stack) 1)
(loop (rest ts) (host/html--push-kid (rest stack) (first stack)))
(loop (rest ts) stack)))
(else (loop (rest ts) stack))))))))
;; ── tree → SX. A node becomes (name :attr val … child …); text stays a string. Attr keys
;; become keywords via parse-safe (":src" -> the keyword :src) so decompose reads them. ──
(define host/html--attrs->sx
(fn (attrs)
(reduce (fn (acc k) (concat acc (list (parse-safe (str ":" k)) (get attrs k)))) (list) (keys attrs))))
(define host/html--node->sx
(fn (node)
(if (= (type-of node) "string") node
(cons (string->symbol (get node :name))
(concat (host/html--attrs->sx (get node :attrs))
(map host/html--node->sx (get node :kids)))))))
;; HTML content string → an (article …) SX tree, ready for host/blog--decompose!.
(define host/html->sx
(fn (html)
(cons (quote article) (map host/html--node->sx (host/html--parse (host/html--tokens html))))))