diff --git a/lib/content/conformance.sh b/lib/content/conformance.sh index 762ec963..f248ba4c 100755 --- a/lib/content/conformance.sh +++ b/lib/content/conformance.sh @@ -15,7 +15,7 @@ if [ ! -x "$SX_SERVER" ]; then fi fi -SUITES=(block doc render api meta page page-full markdown text section compose tree-edit move clone query toc anchor outline flatten transform normalize find-replace stats summary index table callout media data wire validate store snapshot crdt crdt-tree crdt-blocks crdt-store sync md-import md-doc fed) +SUITES=(block doc render api meta page page-full markdown text section compose tree-edit move clone query toc anchor outline flatten transform normalize find-replace stats summary index table callout media data wire validate sanitize store snapshot crdt crdt-tree crdt-blocks crdt-store sync md-import md-doc fed) OUT_JSON="lib/content/scoreboard.json" OUT_MD="lib/content/scoreboard.md" @@ -69,6 +69,7 @@ run_suite() { (load "lib/content/page-full.sx") (load "lib/content/markdown.sx") (load "lib/content/validate.sx") +(load "lib/content/sanitize.sx") (load "lib/content/store.sx") (load "lib/content/snapshot.sx") (load "lib/content/crdt.sx") diff --git a/lib/content/sanitize.sx b/lib/content/sanitize.sx new file mode 100644 index 00000000..cc7a8706 --- /dev/null +++ b/lib/content/sanitize.sx @@ -0,0 +1,47 @@ +;; content-on-sx — make a document render-safe by dropping invalid blocks. +;; +;; The enforcement counterpart to validate: where content/validate REPORTS id / +;; field issues, content/sanitize REMOVES the offending blocks so the result can +;; be rendered/merged without faulting on malformed input (federated or imported +;; documents that failed validation). Tree-wide: descends into sections, pruning +;; invalid descendants; a section whose own shell is valid is kept (even if it +;; ends up empty — that is normalize's job, not sanitize's), but a section whose +;; own check fails (e.g. children is not a list) is dropped whole. 
+;; +;; Reuses validate's per-block predicate (content/-block-issues), so the set of +;; "what is invalid" stays single-sourced and can't drift from content/validate. +;; sanitize addresses per-block id/field validity only; it does NOT resolve +;; duplicate ids (a cross-block concern with no single right answer), so a +;; sanitized doc is render-safe but not necessarily content/valid? if the input +;; carried duplicate ids. Immutable; returns a new document. +;; +;; Requires (loaded by harness): block.sx, doc.sx, validate.sx +;; (content/-block-issues). + +(define + san-section? + (fn (b) (and (st-instance? b) (= (get b :class) "CtSection")))) + +;; a block is render-safe when it has no id/field issues (validate's own checks) +(define san-ok? (fn (b) (= (len (content/-block-issues b)) 0))) + +;; drop invalid blocks at this level; recurse into surviving sections so invalid +;; descendants are pruned too. +(define + san-blocks + (fn + (blocks) + (map + (fn + (b) + (if + (san-section? b) + (let + ((ch (st-iv-get b "children"))) + (if (list? ch) (st-iv-set! b "children" (san-blocks ch)) b)) + b)) + (filter san-ok? 
blocks)))) + +(define + content/sanitize + (fn (doc) (doc-with-blocks doc (san-blocks (doc-blocks doc))))) diff --git a/lib/content/scoreboard.json b/lib/content/scoreboard.json index d94a6ce1..c58da9c7 100644 --- a/lib/content/scoreboard.json +++ b/lib/content/scoreboard.json @@ -31,6 +31,7 @@ "data": {"pass": 25, "fail": 0}, "wire": {"pass": 11, "fail": 0}, "validate": {"pass": 32, "fail": 0}, + "sanitize": {"pass": 12, "fail": 0}, "store": {"pass": 46, "fail": 0}, "snapshot": {"pass": 20, "fail": 0}, "crdt": {"pass": 34, "fail": 0}, @@ -42,7 +43,7 @@ "md-doc": {"pass": 12, "fail": 0}, "fed": {"pass": 20, "fail": 0} }, - "total_pass": 787, + "total_pass": 799, "total_fail": 0, - "total": 787 + "total": 799 } diff --git a/lib/content/scoreboard.md b/lib/content/scoreboard.md index 8bcc3cc5..b641e532 100644 --- a/lib/content/scoreboard.md +++ b/lib/content/scoreboard.md @@ -35,6 +35,7 @@ _Generated by `lib/content/conformance.sh`_ | data | 25 | 0 | 25 | | wire | 11 | 0 | 11 | | validate | 32 | 0 | 32 | +| sanitize | 12 | 0 | 12 | | store | 46 | 0 | 46 | | snapshot | 20 | 0 | 20 | | crdt | 34 | 0 | 34 | @@ -45,4 +46,4 @@ _Generated by `lib/content/conformance.sh`_ | md-import | 38 | 0 | 38 | | md-doc | 12 | 0 | 12 | | fed | 20 | 0 | 20 | -| **Total** | **787** | **0** | **787** | +| **Total** | **799** | **0** | **799** | diff --git a/lib/content/tests/sanitize.sx b/lib/content/tests/sanitize.sx new file mode 100644 index 00000000..2a6660d4 --- /dev/null +++ b/lib/content/tests/sanitize.sx @@ -0,0 +1,128 @@ +;; Extension — make a document render-safe by dropping invalid blocks. +;; Counterpart to validate; reuses its per-block checks. Tree-wide. + +(st-bootstrap-classes!) +(content-bootstrap-blocks!) +(content-bootstrap-doc!) +(content-bootstrap-section!) 
+ +;; ── a valid document is returned unchanged (same ids, tree order) ── +(define + good + (doc-append + (doc-append (doc-empty "d") (mk-heading "h" 1 "Title")) + (mk-text "p" "Body"))) +(content-test + "valid doc keeps all blocks" + (doc-ids (content/sanitize good)) + (list "h" "p")) +(content-test + "valid doc still valid after sanitize" + (content/valid? (content/sanitize good)) + true) + +;; ── a block with a bad field is dropped ── +(content-test + "bad-field block dropped" + (doc-ids + (content/sanitize + (doc-append + (doc-append (doc-empty "d") (mk-text "ok" "fine")) + (mk-heading "bad" "notnum" "T")))) + (list "ok")) + +;; ── unknown block type dropped ── +(define raw (st-iv-set! (st-make-instance "CtBlock") "id" "z")) +(content-test + "unknown-type block dropped" + (doc-ids + (content/sanitize + (doc-append (doc-append (doc-empty "d") (mk-text "ok" "x")) raw))) + (list "ok")) + +;; ── blank-id block dropped ── +(content-test + "blank-id block dropped" + (doc-ids + (content/sanitize + (doc-append + (doc-append (doc-empty "d") (mk-text "ok" "x")) + (mk-text "" "y")))) + (list "ok")) + +;; ── result is render-safe: no id/field issues remain ── +(content-test + "sanitized has no field/id issues" + (len + (filter + (fn (i) (if (= (get i :kind) "field") true (= (get i :kind) "id"))) + (content/validate + (content/sanitize + (doc-append + (doc-append (doc-empty "d") (mk-text "ok" "x")) + (mk-heading "bad" "notnum" "T")))))) + 0) + +;; ── immutability: original document untouched ── +(define + withbad + (doc-append + (doc-append (doc-empty "d") (mk-text "ok" "x")) + (mk-heading "bad" "notnum" "T"))) +(define _ (content/sanitize withbad)) +(content-test "original unchanged" (doc-ids withbad) (list "ok" "bad")) + +;; ── tree-wide: invalid nested child pruned, valid sibling + section kept ── +(define + nested + (doc-append + (doc-empty "d") + (mk-section + "s" + (list (mk-text "good" "keep") (mk-heading "badc" "notnum" "X"))))) +(content-test + "invalid nested 
child pruned, section kept" + (doc-tree-ids (content/sanitize nested)) + (list "s" "good")) + +;; ── a section whose own shell is invalid (children not a list) is dropped ── +(define + badsec + (doc-append + (doc-append (doc-empty "d") (mk-text "ok" "x")) + (st-iv-set! (mk-section "s" (list)) "children" "nope"))) +(content-test + "invalid section shell dropped whole" + (doc-tree-ids (content/sanitize badsec)) + (list "ok")) + +;; ── a valid section that loses all children is kept (empty) — sanitize is not +;; normalize; it removes invalid, not empty ── +(define + allbadchildren + (doc-append + (doc-empty "d") + (mk-section "s" (list (mk-heading "b1" "x" "X") (mk-text "" "y"))))) +(content-test + "section kept though emptied of invalid children" + (doc-tree-ids (content/sanitize allbadchildren)) + (list "s")) + +;; ── deeply nested: invalid block two levels down is pruned ── +(define + deep + (doc-append + (doc-empty "d") + (mk-section + "o" + (list (mk-section "i" (list (mk-text "dok" "x") (mk-text "" "bad"))))))) +(content-test + "deep invalid pruned" + (doc-tree-ids (content/sanitize deep)) + (list "o" "i" "dok")) + +;; ── empty document sanitizes to empty ── +(content-test + "empty doc stays empty" + (doc-ids (content/sanitize (doc-empty "e"))) + (list)) diff --git a/plans/content-on-sx.md b/plans/content-on-sx.md index 4a20622b..66a0fa58 100644 --- a/plans/content-on-sx.md +++ b/plans/content-on-sx.md @@ -19,7 +19,7 @@ injected adapter, not core. ## Status (rolling) -`bash lib/content/conformance.sh` → **787/787** (Phases 1–4 COMPLETE + ~34 extensions, hardened: HTML/SX escaping, Markdown render + import/export incl. 
tables & frontmatter (full round-trip), CvRDT flat + nested-tree + durable replication, tree-aware validation, snapshot cache, doc metadata, plain-text render, nested block trees + deep editing + flatten + relative reorder, doc stats + summary + multi-doc index, table + callout + media blocks, HTML page wrapper + SEO page, doc composition + id-remap, portable data + wire serialization, block query + transforms + find/replace, TOC + anchored headings + outline, normalization) +`bash lib/content/conformance.sh` → **799/799** (Phases 1–4 COMPLETE + ~34 extensions, hardened: HTML/SX escaping, Markdown render + import/export incl. tables & frontmatter (full round-trip), CvRDT flat + nested-tree + durable replication, tree-aware validation, snapshot cache, doc metadata, plain-text render, nested block trees + deep editing + flatten + relative reorder, doc stats + summary + multi-doc index, table + callout + media blocks, HTML page wrapper + SEO page, doc composition + id-remap, portable data + wire serialization, block query + transforms + find/replace, TOC + anchored headings + outline, normalization, sanitization) ## Ground rules @@ -107,6 +107,7 @@ lib/content/api.sx ── (content/edit) (content/render) (content/history) ─ - [x] document flatten (`flatten.sx`: content/flatten, un-nest sections; inverse of wrap-section) - [x] relative reorder (`move.sx`: content/move-before/after/to-front/to-back by id) - [x] document normalization (`normalize.sx`: content/normalize, drop empty blocks/sections) +- [x] document sanitization (`sanitize.sx`: content/sanitize, drop invalid blocks tree-wide; validate's enforcement partner) - [x] global find/replace (`find-replace.sx`: content/find-replace across text-bearing blocks) - [x] portable data serialization (`data.sx`: content/to-data + from-data, round-trips tree) - [x] wire serialization (`wire.sx`: content/to-wire + from-wire, SX-text on the wire) @@ -135,6 +136,21 @@ lib/content/api.sx ── (content/edit) (content/render) (content/history) ─ ## 
Progress log +- 2026-06-07 — Feature: `content/sanitize` — the enforcement counterpart to + `validate`. validate *reports* id/field issues; sanitize *removes* the + offending blocks (tree-wide) so federated/imported input that failed + validation can still be rendered/merged without faulting. Reuses validate's + own per-block predicate (`content/-block-issues`) so "what is invalid" stays + single-sourced and can't drift. Distinct from `normalize` (which drops *empty* + blocks): a section emptied of invalid children is kept (sanitize removes + invalid, not empty), but a section whose own shell is invalid (children not a + list) is dropped whole. Scope is per-block id/field validity — it does not + dedupe ids (cross-block, no single right answer). +12 tests (bad-field / + unknown-type / blank-id dropped, deep pruning, invalid-shell section dropped, + immutability, render-safe result). 799/799 (42 suites). (This was a genuine + remaining gap — validate had no enforcement partner — not filler; saturation + note below still holds for the roadmap proper.) + - 2026-06-07 — Audit (markdown round-trip): probed the Markdown text boundary for round-trip fidelity. Found one real data-corruption bug — table cells containing `|` don't survive `asMarkdown` → `md/import` (recorded under