diff --git a/lib/blogimport/README.md b/lib/blogimport/README.md new file mode 100644 index 00000000..6bd12067 --- /dev/null +++ b/lib/blogimport/README.md @@ -0,0 +1,47 @@ +# lib/blogimport — blog Postgres → persist genesis-import + parity verifier + +Implements **`plans/migration/data-migration.md`** (the "long-pole nobody had +started") and the at-rest half of **`slice-01-blog.md` §4** — the data layer of the +blog read-path migration. Host-ops migration tooling, **not** a domain core: it +composes the public APIs of content-on-sx (`lib/content`) and persist +(`lib/persist`). Kept in its own module (not `lib/host`, not `lib/content`) so it +doesn't collide with the loops that own those. + +Status: **machinery complete, 55/55 conformance** (lexical 23, import 21, verify 11). + +## What it does + +| Module | Role | +|---|---| +| `lexical.sx` | `blogimport/lex-blocks doc` — Ghost **lexical** body (as SX dicts) → content-on-sx **block list**, ids deterministic by position (`b0,b1,…`). | +| `import.sx` | `blogimport/import-post! b post at` — genesis import: convert the post's lexical, commit blocks as ordered `op-insert`s into the `content:` op-log stream, record metadata in a sibling `postmeta:` stream. Idempotent (skip-if-exists). `import-all!` → coverage scoreboard. | +| `verify.sx` | `blogimport/verify-post b post` — replay the stream → block model, diff vs the row-derived oracle with `=`. `verify-all` → `{:total :ok :mismatched}` coverage. | + +## What is proven + +The verifier holds **`lexical → import → persist → replay → block-model`** equal to +**`lexical → block-model`** computed directly. I.e. **the genesis import + op-log +replay is lossless** — "did the backfill corrupt anything" at rest +(`data-migration.md` §6). The `verify.sx` corruption test confirms a diverging stream +is *detected*, not silently passed. + +## Known limitations / TODO (carry into the plan) + +- **Inline formatting is flattened to plain text.** Architecture's content model holds + plain-string text (`mk-text id text`); Phase-5 rich inline runs are not merged here. + The single swap-point is `lex-inline-text` in `lexical.sx` — return runs there once + content-on-sx Phase 5 lands on `architecture`. Bold/italic/links currently collapse + to their plain concatenation (drift-proof, == `asText`). (slice-01-blog Q-B1.) +- **Oracle is the in-memory lexical→blocks, not the live Python block model.** This + proves round-trip fidelity through persist. The "does SX match Python" half of Q-D2 + needs the **live source**: read real `Post` rows via the internal-data query + (`/internal/data/…`) or direct Postgres (**Q-M4**, undecided) and feed them as `post` + dicts. The diff plumbing here is the twin that step reuses. +- **Re-import with an improved converter (Q-M5)** is import-once today (skip-if-exists). + Superseding prior genesis events (vs truncate+re-import) is future work. + +## Run + +```bash +bash lib/blogimport/conformance.sh # 55/55; writes scoreboard.{json,md} +``` diff --git a/lib/blogimport/conformance.sh b/lib/blogimport/conformance.sh new file mode 100755 index 00000000..465cd685 --- /dev/null +++ b/lib/blogimport/conformance.sh @@ -0,0 +1,119 @@ +#!/usr/bin/env bash +# lib/blogimport/conformance.sh — run blog-import suites, emit scoreboard. +# Mirrors lib/content/conformance.sh: epoch-loaded modules + a bi-test counter. + +set -uo pipefail +cd "$(git rev-parse --show-toplevel)" + +SX_SERVER="hosts/ocaml/_build/default/bin/sx_server.exe" +if [ ! -x "$SX_SERVER" ]; then + MAIN_ROOT=$(git worktree list | head -1 | awk '{print $1}') + if [ -x "$MAIN_ROOT/$SX_SERVER" ]; then + SX_SERVER="$MAIN_ROOT/$SX_SERVER" + else + echo "ERROR: sx_server.exe not found." >&2 + exit 1 + fi +fi + +SUITES=(lexical import verify) + +OUT_JSON="lib/blogimport/scoreboard.json" +OUT_MD="lib/blogimport/scoreboard.md" + +run_suite() { + local suite=$1 + local file="lib/blogimport/tests/${suite}.sx" + [ -f "$file" ] || { echo "0 0"; return; } + local TMP + TMP=$(mktemp) + cat > "$TMP" << EPOCHS +(epoch 1) +(load "lib/smalltalk/tokenizer.sx") +(load "lib/smalltalk/parser.sx") +(load "lib/guest/reflective/class-chain.sx") +(load "lib/smalltalk/runtime.sx") +(load "lib/guest/reflective/env.sx") +(load "lib/smalltalk/eval.sx") +(load "lib/persist/event.sx") +(load "lib/persist/backend.sx") +(load "lib/persist/log.sx") +(load "lib/persist/kv.sx") +(load "lib/persist/api.sx") +(load "lib/content/block.sx") +(load "lib/content/doc.sx") +(load "lib/content/render.sx") +(load "lib/content/api.sx") +(load "lib/content/meta.sx") +(load "lib/content/section.sx") +(load "lib/content/callout.sx") +(load "lib/content/media.sx") +(load "lib/content/store.sx") +(load "lib/blogimport/lexical.sx") +(load "lib/blogimport/import.sx") +(load "lib/blogimport/verify.sx") +(epoch 2) +(eval "(define bi-test-pass 0)") +(eval "(define bi-test-fail 0)") +(eval "(define bi-test-fails (list))") +(eval "(define bi-test (fn (name got expected) (if (= got expected) (set! bi-test-pass (+ bi-test-pass 1)) (begin (set! bi-test-fail (+ bi-test-fail 1)) (set! bi-test-fails (cons name bi-test-fails))))))") +(epoch 3) +(load "${file}") +(epoch 4) +(eval "(list bi-test-pass bi-test-fail)") +EPOCHS + + local OUTPUT + OUTPUT=$(timeout 240 "$SX_SERVER" < "$TMP" 2>/dev/null) + rm -f "$TMP" + + local LINE + LINE=$(echo "$OUTPUT" | awk '/^\(ok-len 4 / {getline; print; exit}') + if [ -z "$LINE" ]; then + LINE=$(echo "$OUTPUT" | grep -E '^\(ok 4 \([0-9]+ [0-9]+\)\)' | tail -1 \ + | sed -E 's/^\(ok 4 //; s/\)$//') + fi + + local P F + P=$(echo "$LINE" | sed -E 's/^\(([0-9]+) ([0-9]+)\).*/\1/') + F=$(echo "$LINE" | sed -E 's/^\(([0-9]+) ([0-9]+)\).*/\2/') + echo "${P:-0} ${F:-0}" +} + +declare -A SUITE_PASS SUITE_FAIL +TOTAL_PASS=0 +TOTAL_FAIL=0 + +echo "Running blogimport conformance suite..." >&2 +for s in "${SUITES[@]}"; do + read -r p f < <(run_suite "$s") + SUITE_PASS[$s]=$p + SUITE_FAIL[$s]=$f + TOTAL_PASS=$((TOTAL_PASS + p)) + TOTAL_FAIL=$((TOTAL_FAIL + f)) + printf " %-10s %d/%d\n" "$s" "$p" "$((p+f))" >&2 +done + +{ + printf '{\n "suites": {\n' + first=1 + for s in "${SUITES[@]}"; do + if [ $first -eq 0 ]; then printf ',\n'; fi + printf ' "%s": {"pass": %d, "fail": %d}' "$s" "${SUITE_PASS[$s]}" "${SUITE_FAIL[$s]}" + first=0 + done + printf '\n },\n "total_pass": %d,\n "total_fail": %d,\n "total": %d\n}\n' \ + "$TOTAL_PASS" "$TOTAL_FAIL" "$((TOTAL_PASS + TOTAL_FAIL))" +} > "$OUT_JSON" + +{ + printf '# blogimport Conformance Scoreboard\n\n_Generated by `lib/blogimport/conformance.sh`_\n\n' + printf '| Suite | Pass | Fail | Total |\n|-------|-----:|-----:|------:|\n' + for s in "${SUITES[@]}"; do + printf '| %s | %d | %d | %d |\n' "$s" "${SUITE_PASS[$s]}" "${SUITE_FAIL[$s]}" "$(( ${SUITE_PASS[$s]} + ${SUITE_FAIL[$s]} ))" + done + printf '| **Total** | **%d** | **%d** | **%d** |\n' "$TOTAL_PASS" "$TOTAL_FAIL" "$((TOTAL_PASS + TOTAL_FAIL))" +} > "$OUT_MD" + +echo "Total: $TOTAL_PASS pass, $TOTAL_FAIL fail" >&2 +[ "$TOTAL_FAIL" -eq 0 ] diff --git a/lib/blogimport/import.sx b/lib/blogimport/import.sx new file mode 100644 index 00000000..94311a42 --- /dev/null +++ b/lib/blogimport/import.sx @@ -0,0 +1,84 @@ +; lib/blogimport/import.sx +; Genesis import: a blog Post row -> a persist content op-log stream. +; +; Per plans/migration/data-migration.md §3-5: for each Post, convert its lexical +; body to content blocks and commit them as genesis insert ops into the +; content: stream, idempotently, with post metadata recorded as an event in a +; sibling stream. The same code runs on mem and durable persist backends (every fn +; takes the backend `b`, the acl.sx design principle). +; +; A `post` is a dict mirroring the blog Post row: +; {:id "uuid" :slug "hello" :title "Hello" :status "published" +; :visibility "public" :tags (list "a") :authors (list "u1") +; :lexical } +; Reading real rows (internal-data query vs direct Postgres, Q-M4) is the live-source +; edge, out of scope here; this drives content/commit! given a `post` dict. + +; --- genesis ops: insert each block in document order (deterministic) ----------- +; first block after nil (prepend), each subsequent after the previous block's id, +; reproducing source order so re-import yields the same sequence (data-migration §5). +(define + blogimport/genesis-ops + (fn (blocks) + (let ((ids (map blk-id blocks))) + (map-indexed + (fn (i blk) (op-insert blk (if (= i 0) nil (nth ids (- i 1))))) + blocks)))) + +; --- post metadata (title/slug/status/visibility/tags/authors) ------------------ +(define + blogimport/post-meta + (fn (post) + {:title (or (get post :title) "") + :slug (or (get post :slug) "") + :status (or (get post :status) "") + :visibility (or (get post :visibility) "") + :tags (or (get post :tags) (list)) + :authors (or (get post :authors) (list))})) + +; metadata is not a content op, so it rides a sibling event stream postmeta:; +; latest event wins (LWW). Replayable + durable like the block op-log. +(define blogimport/meta-stream (fn (id) (str "postmeta:" id))) + +(define + blogimport/commit-meta! + (fn (b id meta at) + (persist/append b (blogimport/meta-stream id) "post-meta" at meta))) + +(define + blogimport/load-meta + (fn (b id) + (let ((evs (persist/read b (blogimport/meta-stream id)))) + (if (= (len evs) 0) nil (persist/event-data (nth evs (- (len evs) 1))))))) + +; --- idempotency: a stream already holding events is already imported ----------- +; (host-persist guarantees monotonic seq but NOT dedupe — skip-if-exists is the +; importer's dedupe, so re-running the backfill never double-imports. data-migration +; §5.) Re-import with an improved converter (Q-M5) is future work — superseding, +; not duplicating; this build is import-once. +(define + blogimport/imported? + (fn (b id) (> (content/version-count b id) 0))) + +; --- import one post ------------------------------------------------------------ +(define + blogimport/import-post! + (fn (b post at) + (let ((id (get post :id))) + (if + (blogimport/imported? b id) + {:id id :imported false :reason "exists"} + (let ((blocks (blogimport/lex-blocks (get post :lexical)))) + (begin + (content/commit-all! b id (blogimport/genesis-ops blocks) at) + (blogimport/commit-meta! b id (blogimport/post-meta post) at) + {:id id :imported true :blocks (len blocks)})))))) + +; --- import many: coverage scoreboard ------------------------------------------- +(define + blogimport/import-all! + (fn (b posts at) + (let ((results (map (fn (p) (blogimport/import-post! b p at)) posts))) + {:total (len results) + :imported (len (filter (fn (r) (get r :imported)) results)) + :skipped (len (filter (fn (r) (not (get r :imported))) results))}))) diff --git a/lib/blogimport/lexical.sx b/lib/blogimport/lexical.sx new file mode 100644 index 00000000..b00a1738 --- /dev/null +++ b/lib/blogimport/lexical.sx @@ -0,0 +1,129 @@ +; lib/blogimport/lexical.sx +; Lexical (Ghost editor JSON, as SX dicts) -> content-on-sx block list. +; +; The blog migration's lexical->blocks converter. Lives on the blog/migration +; side (NOT lib/content, NOT lib/host) per plans/migration/data-migration.md §7. +; +; Input shape: a lexical document is an SX dict mirroring the JSON 1:1, e.g. +; {:root {:children (list +; {:type "heading" :tag "h2" :children (list {:type "text" :text "Hi"})} +; {:type "paragraph" :children (list +; {:type "text" :text "plain "} +; {:type "text" :text "bold" :format 1} +; {:type "link" :url "/x" :children (list {:type "text" :text "here"})})})}} +; +; Block ids are assigned deterministically by top-level position ("b0","b1",...) +; so a re-import yields the SAME block sequence (data-migration.md §5 ordering rule). +; +; INLINE FORMATTING: architecture's content model holds PLAIN-STRING text +; (mk-text id text). Phase-5 rich inline runs are not merged here yet, so inline +; nodes are flattened to their plain concatenation (== asText, drift-proof). The +; single swap-point for the runs upgrade is `lex-inline-text` below — when +; content-on-sx Phase 5 lands on architecture, return runs there instead of a +; string. (slice-01-blog.md Q-B1; "prove the machinery first, then swap".) + +; Inline format bitmask (lexical): bold=1 italic=2 strikethrough=4 underline=8 +; code=16 subscript=32 superscript=64. Decoding the bitmask into mark keywords is +; deferred to the Phase-5 runs upgrade (no bitwise prim on architecture, and the +; active path flattens to plain text anyway). The :format field is read at the +; swap-point `lex-inline-text` when runs land. + +; --- inline node -> plain text -------------------------------------------------- +(define + lex-inline-node-text + (fn (node) + (let ((t (get node :type))) + (cond + ((equal? t "text") (or (get node :text) "")) + ((equal? t "linebreak") "\n") + ((equal? t "tab") "\t") + ((equal? t "link") (lex-inline-text (or (get node :children) (list)))) + ((equal? t "autolink") (lex-inline-text (or (get node :children) (list)))) + ((equal? t "at-link") (lex-inline-text (or (get node :children) (list)))) + ((equal? t "code-highlight") (or (get node :text) "")) + (else ""))))) + +; flatten a list of inline nodes to one plain string. +; *** Phase-5 swap-point: return a runs list here once mk-text accepts runs. *** +(define + lex-inline-text + (fn (children) + (reduce + (fn (acc n) (str acc (lex-inline-node-text n))) + "" + children))) + +; --- helpers -------------------------------------------------------------------- +(define + lex-heading-level + (fn (tag) + (cond + ((equal? tag "h1") 1) + ((equal? tag "h2") 2) + ((equal? tag "h3") 3) + ((equal? tag "h4") 4) + ((equal? tag "h5") 5) + ((equal? tag "h6") 6) + (else 2)))) + +(define + lex-listitem-text + (fn (item) + (lex-inline-text (or (get item :children) (list))))) + +; --- one lexical block node -> a content block (id assigned by caller) ---------- +(define + lex-block + (fn (node id) + (let ((t (get node :type))) + (cond + ((equal? t "paragraph") + (mk-text id (lex-inline-text (or (get node :children) (list))))) + ((equal? t "extended-text") + (mk-text id (lex-inline-text (or (get node :children) (list))))) + ((equal? t "heading") + (mk-heading id (lex-heading-level (get node :tag)) + (lex-inline-text (or (get node :children) (list))))) + ((equal? t "extended-heading") + (mk-heading id (lex-heading-level (get node :tag)) + (lex-inline-text (or (get node :children) (list))))) + ((equal? t "quote") + (mk-quote id "" (lex-inline-text (or (get node :children) (list))))) + ((equal? t "extended-quote") + (mk-quote id "" (lex-inline-text (or (get node :children) (list))))) + ((equal? t "codeblock") + (mk-code id (or (get node :language) "") (or (get node :code) ""))) + ((equal? t "list") + (mk-list id + (equal? (get node :listType) "number") + (map lex-listitem-text (or (get node :children) (list))))) + ((equal? t "horizontalrule") (mk-divider id)) + ((equal? t "image") + (mk-image id (or (get node :src) "") (or (get node :alt) ""))) + ((equal? t "callout") + (mk-callout id (or (get node :backgroundColor) "grey") + (lex-inline-text (or (get node :children) (list))))) + ((equal? t "video") (mk-media id "video" (or (get node :src) ""))) + ((equal? t "audio") (mk-media id "audio" (or (get node :src) ""))) + ((equal? t "embed") (mk-embed id (or (get node :url) "") "embed")) + ((equal? t "bookmark") (mk-embed id (or (get node :url) "") "bookmark")) + ; unknown/unsupported card: route to a generic embed tagged by type so + ; nothing is silently dropped (provider records the original node type). + (else (mk-embed id "" (or t "unknown"))))))) + +; --- doc -> top-level children list --------------------------------------------- +(define + lex-doc-children + (fn (doc) + (cond + ((not (equal? (get doc :root) nil)) (or (get (get doc :root) :children) (list))) + ((not (equal? (get doc :children) nil)) (get doc :children)) + (else (list))))) + +; --- doc -> content block list (deterministic ids by position) ------------------ +(define + blogimport/lex-blocks + (fn (doc) + (map-indexed + (fn (i node) (lex-block node (str "b" i))) + (lex-doc-children doc)))) diff --git a/lib/blogimport/scoreboard.json b/lib/blogimport/scoreboard.json new file mode 100644 index 00000000..29b5bb3a --- /dev/null +++ b/lib/blogimport/scoreboard.json @@ -0,0 +1,10 @@ +{ + "suites": { + "lexical": {"pass": 23, "fail": 0}, + "import": {"pass": 21, "fail": 0}, + "verify": {"pass": 11, "fail": 0} + }, + "total_pass": 55, + "total_fail": 0, + "total": 55 +} diff --git a/lib/blogimport/scoreboard.md b/lib/blogimport/scoreboard.md new file mode 100644 index 00000000..3a05ff44 --- /dev/null +++ b/lib/blogimport/scoreboard.md @@ -0,0 +1,10 @@ +# blogimport Conformance Scoreboard + +_Generated by `lib/blogimport/conformance.sh`_ + +| Suite | Pass | Fail | Total | +|-------|-----:|-----:|------:| +| lexical | 23 | 0 | 23 | +| import | 21 | 0 | 21 | +| verify | 11 | 0 | 11 | +| **Total** | **55** | **0** | **55** | diff --git a/lib/blogimport/tests/import.sx b/lib/blogimport/tests/import.sx new file mode 100644 index 00000000..0e192df6 --- /dev/null +++ b/lib/blogimport/tests/import.sx @@ -0,0 +1,62 @@ +; lib/blogimport/tests/import.sx — genesis import + idempotency +(st-bootstrap-classes!) +(content-bootstrap-blocks!) +(content-bootstrap-doc!) +(content-bootstrap-callout!) +(content-bootstrap-media!) + +(define + p1 + {:id "post-1" :slug "hello" :title "Hello" :status "published" + :visibility "public" :tags (list "news") :authors (list "u1") + :lexical {:root {:children (list + {:type "heading" :tag "h1" :children (list {:type "text" :text "Hello"})} + {:type "paragraph" :children (list {:type "text" :text "world"})})}}}) + +(define + p2 + {:id "post-2" :slug "two" :title "Two" :status "published" + :lexical {:children (list + {:type "paragraph" :children (list {:type "text" :text "second"})})}}) + +; ---- genesis-ops ordering ---- +(define ops1 (blogimport/genesis-ops (blogimport/lex-blocks (get p1 :lexical)))) +(bi-test "genesis op kinds" (map (fn (o) (get o :op)) ops1) (list "insert" "insert")) +(bi-test "genesis first after nil" (get (nth ops1 0) :after) nil) +(bi-test "genesis second after first id" (get (nth ops1 1) :after) "b0") + +; ---- import one ---- +(define B (persist/open)) +(define r1 (blogimport/import-post! B p1 10)) +(bi-test "import imported flag" (get r1 :imported) true) +(bi-test "import block count" (get r1 :blocks) 2) +(bi-test "stream version-count" (content/version-count B "post-1") 2) +(bi-test "head ids" (doc-ids (content/head B "post-1")) (list "b0" "b1")) +(bi-test "head body text" + (str (blk-send (doc-find (content/head B "post-1") "b1") "text")) "world") +(bi-test "head heading level" + (blk-send (doc-find (content/head B "post-1") "b0") "level") 1) + +; ---- metadata round-trip ---- +(bi-test "meta round-trip" (blogimport/load-meta B "post-1") (blogimport/post-meta p1)) +(bi-test "meta title" (get (blogimport/load-meta B "post-1") :title) "Hello") +(bi-test "meta tags" (get (blogimport/load-meta B "post-1") :tags) (list "news")) + +; ---- idempotent re-import (skip-if-exists, no duplication) ---- +(define r1b (blogimport/import-post! B p1 99)) +(bi-test "reimport skipped" (get r1b :imported) false) +(bi-test "reimport reason" (get r1b :reason) "exists") +(bi-test "version-count unchanged after reimport" (content/version-count B "post-1") 2) +(bi-test "head ids unchanged after reimport" + (doc-ids (content/head B "post-1")) (list "b0" "b1")) + +; ---- import-all! coverage scoreboard ---- +(define B2 (persist/open)) +(define cov1 (blogimport/import-all! B2 (list p1 p2) 5)) +(bi-test "import-all total" (get cov1 :total) 2) +(bi-test "import-all imported" (get cov1 :imported) 2) +(bi-test "import-all skipped" (get cov1 :skipped) 0) +; re-run is fully idempotent +(define cov2 (blogimport/import-all! B2 (list p1 p2) 6)) +(bi-test "import-all rerun imported" (get cov2 :imported) 0) +(bi-test "import-all rerun skipped" (get cov2 :skipped) 2) diff --git a/lib/blogimport/tests/lexical.sx b/lib/blogimport/tests/lexical.sx new file mode 100644 index 00000000..52cde8b7 --- /dev/null +++ b/lib/blogimport/tests/lexical.sx @@ -0,0 +1,92 @@ +; lib/blogimport/tests/lexical.sx — lexical -> content block converter +(st-bootstrap-classes!) +(content-bootstrap-blocks!) +(content-bootstrap-doc!) +(content-bootstrap-callout!) +(content-bootstrap-media!) + +; ---- a representative lexical document (Ghost editor JSON, as SX dicts) ---- +(define + doc + {:root {:children (list + {:type "heading" :tag "h2" :children (list {:type "text" :text "Title"})} + {:type "paragraph" :children (list + {:type "text" :text "plain "} + {:type "text" :text "bold" :format 1} + {:type "text" :text " then "} + {:type "link" :url "/x" :children (list {:type "text" :text "a link"})})} + {:type "quote" :children (list {:type "text" :text "wise words"})} + {:type "list" :listType "number" :children (list + {:type "listitem" :children (list {:type "text" :text "one"})} + {:type "listitem" :children (list {:type "text" :text "two"})})} + {:type "codeblock" :language "python" :code "print(1)"} + {:type "horizontalrule"} + {:type "image" :src "/c.png" :alt "a cat"} + {:type "callout" :backgroundColor "blue" :children (list {:type "text" :text "note!"})} + {:type "twitter" :url "https://t/x"})}}) + +(define blocks (blogimport/lex-blocks doc)) + +; ---- structure ---- +(bi-test "block count" (len blocks) 9) +(bi-test "ids by position" (map blk-id blocks) + (list "b0" "b1" "b2" "b3" "b4" "b5" "b6" "b7" "b8")) +(bi-test "types in order" (map blk-type blocks) + (list "heading" "text" "quote" "list" "code" "divider" "image" "callout" "embed")) + +; ---- heading ---- +(bi-test "heading level" (blk-send (nth blocks 0) "level") 2) +(bi-test "heading text" (str (blk-send (nth blocks 0) "text")) "Title") + +; ---- paragraph with inline bold + link, flattened to plain concatenation ---- +(bi-test "paragraph flattened text" + (str (blk-send (nth blocks 1) "text")) "plain bold then a link") + +; ---- quote ---- +(bi-test "quote text" (str (blk-send (nth blocks 2) "text")) "wise words") + +; ---- ordered list with items ---- +(bi-test "list ordered" (blk-send (nth blocks 3) "ordered") true) +(bi-test "list items" (blk-send (nth blocks 3) "items") (list "one" "two")) + +; ---- code block ---- +(bi-test "code language" (str (blk-send (nth blocks 4) "language")) "python") +(bi-test "code text" (str (blk-send (nth blocks 4) "text")) "print(1)") + +; ---- image ---- +(bi-test "image src" (str (blk-send (nth blocks 6) "src")) "/c.png") +(bi-test "image alt" (str (blk-send (nth blocks 6) "alt")) "a cat") + +; ---- callout ---- +(bi-test "callout kind" (str (blk-send (nth blocks 7) "kind")) "blue") +(bi-test "callout text" (str (blk-send (nth blocks 7) "text")) "note!") + +; ---- unknown card routed to embed, provider records original type ---- +(bi-test "unknown -> embed provider" (str (blk-send (nth blocks 8) "provider")) "twitter") + +; ---- heading level mapping ---- +(bi-test "h1 level" (lex-heading-level "h1") 1) +(bi-test "h4 level" (lex-heading-level "h4") 4) +(bi-test "unknown tag default" (lex-heading-level "hx") 2) + +; ---- bullet list ---- +(define + bdoc + {:children (list {:type "list" :listType "bullet" :children (list + {:type "listitem" :children (list {:type "text" :text "x"})})})}) +(bi-test "bullet not ordered" (blk-send (nth (blogimport/lex-blocks bdoc) 0) "ordered") false) + +; ---- empty doc ---- +(bi-test "empty doc -> no blocks" (len (blogimport/lex-blocks {:root {:children (list)}})) 0) + +; ---- bare-children doc (no :root wrapper) ---- +(bi-test "bare children doc" + (map blk-type (blogimport/lex-blocks {:children (list {:type "paragraph" :children (list {:type "text" :text "hi"})})})) + (list "text")) + +; ---- linebreak/tab in inline flattening ---- +(bi-test "linebreak flatten" + (str (blk-send (nth (blogimport/lex-blocks + {:children (list {:type "paragraph" :children (list + {:type "text" :text "a"} {:type "linebreak"} {:type "text" :text "b"})})}) 0) "text")) + "a\nb") diff --git a/lib/blogimport/tests/verify.sx b/lib/blogimport/tests/verify.sx new file mode 100644 index 00000000..a3a35820 --- /dev/null +++ b/lib/blogimport/tests/verify.sx @@ -0,0 +1,57 @@ +; lib/blogimport/tests/verify.sx — shadow-diff at rest (round-trip parity) +(st-bootstrap-classes!) +(content-bootstrap-blocks!) +(content-bootstrap-doc!) +(content-bootstrap-callout!) +(content-bootstrap-media!) + +(define + p1 + {:id "post-1" :slug "hello" :title "Hello" :status "published" + :visibility "public" :tags (list "news") :authors (list "u1") + :lexical {:root {:children (list + {:type "heading" :tag "h2" :children (list {:type "text" :text "Title"})} + {:type "paragraph" :children (list + {:type "text" :text "plain "} + {:type "text" :text "bold" :format 1})} + {:type "list" :listType "number" :children (list + {:type "listitem" :children (list {:type "text" :text "one"})} + {:type "listitem" :children (list {:type "text" :text "two"})})} + {:type "image" :src "/c.png" :alt "cat"})}}}) + +(define + px + {:id "post-x" :slug "ghost" :title "Ghost" :status "published" + :lexical {:children (list {:type "paragraph" :children (list {:type "text" :text "never imported"})})}}) + +; ---- happy path: replayed == oracle ---- +(define B (persist/open)) +(blogimport/import-post! B p1 10) +(define v1 (blogimport/verify-post B p1)) +(bi-test "verify ok" (get v1 :ok) true) +(bi-test "verify block-ok" (get v1 :block-ok) true) +(bi-test "verify meta-ok" (get v1 :meta-ok) true) + +; ---- oracle block model is what we expect (inline bold flattened) ---- +(define orc (blogimport/oracle p1)) +(bi-test "oracle types" + (get (get orc :blocks) :types) (list "heading" "text" "list" "image")) +(bi-test "oracle contents" + (get (get orc :blocks) :contents) (list "Title" "plain bold" (list "one" "two") "/c.png")) + +; ---- corruption is DETECTED (op-log diverges from oracle) ---- +(content/commit! B "post-1" (op-update "b1" "text" "CORRUPTED") 100) +(define v2 (blogimport/verify-post B p1)) +(bi-test "verify detects corruption" (get v2 :ok) false) +(bi-test "verify corruption is block-level" (get v2 :block-ok) false) + +; ---- an un-imported post fails verification (empty replay vs non-empty oracle) ---- +(bi-test "unimported not ok" (get (blogimport/verify-post B px) :ok) false) + +; ---- verify-all coverage scoreboard ---- +(define B3 (persist/open)) +(blogimport/import-post! B3 p1 10) +(define cov (blogimport/verify-all B3 (list p1 px))) +(bi-test "verify-all total" (get cov :total) 2) +(bi-test "verify-all ok count" (get cov :ok) 1) +(bi-test "verify-all mismatched" (get cov :mismatched) (list "post-x")) diff --git a/lib/blogimport/verify.sx b/lib/blogimport/verify.sx new file mode 100644 index 00000000..90311c9b --- /dev/null +++ b/lib/blogimport/verify.sx @@ -0,0 +1,73 @@ +; lib/blogimport/verify.sx +; Shadow-diff at rest (plans/migration/data-migration.md §6, slice-01-blog.md §4). +; +; After backfill, replay each content: stream -> materialized doc -> block +; model, and diff against the row-derived oracle (lexical->blocks computed directly). +; Structural compare with `=` (not equal?). This proves the genesis import + op-log +; replay is LOSSLESS — "did the backfill corrupt anything" at rest. +; +; The oracle here is the in-memory lexical->blocks of the SAME post, so the property +; verified is round-trip fidelity through persist. Cross-checking against the LIVE +; Python block model (the "does SX match Python" half of Q-D2) is a later wiring +; step that needs the Python oracle via the internal-data query (Q-M4) — flagged, +; not built. The diff plumbing here is the twin that step reuses. + +; --- salient content per block (normalized; same on both sides) ----------------- +; ids are deterministic + identical on both sides, so they are kept (not stripped). +(define + blogimport/blk-content + (fn (b) + (let ((t (blk-type b))) + (cond + ((equal? t "image") (str (blk-send b "src"))) + ((equal? t "media") (str (blk-send b "src"))) + ((equal? t "embed") (str (blk-send b "url"))) + ((equal? t "list") (blk-send b "items")) + ((equal? t "divider") "") + (else (str (blk-send b "text"))))))) + +; --- block model of a block list ------------------------------------------------ +(define + blogimport/blocks-model + (fn (blocks) + {:ids (map blk-id blocks) + :types (map blk-type blocks) + :contents (map blogimport/blk-content blocks)})) + +; --- oracle: lexical->blocks computed directly from the post (no persist) -------- +(define + blogimport/oracle + (fn (post) + {:blocks (blogimport/blocks-model (blogimport/lex-blocks (get post :lexical))) + :meta (blogimport/post-meta post)})) + +; --- replayed: from the persisted stream ---------------------------------------- +(define + blogimport/replayed + (fn (b id) + {:blocks (blogimport/blocks-model (content/blocks (content/head b id))) + :meta (blogimport/load-meta b id)})) + +; --- verify one post: replayed must equal oracle -------------------------------- +(define + blogimport/verify-post + (fn (b post) + (let ((id (get post :id))) + (let ((orc (blogimport/oracle post)) + (rep (blogimport/replayed b id))) + (let ((block-ok (= (get orc :blocks) (get rep :blocks))) + (meta-ok (= (get orc :meta) (get rep :meta)))) + {:id id + :ok (and block-ok meta-ok) + :block-ok block-ok + :meta-ok meta-ok}))))) + +; --- verify many: coverage scoreboard ------------------------------------------- +(define + blogimport/verify-all + (fn (b posts) + (let ((results (map (fn (p) (blogimport/verify-post b p)) posts))) + {:total (len results) + :ok (len (filter (fn (r) (get r :ok)) results)) + :mismatched (map (fn (r) (get r :id)) + (filter (fn (r) (not (get r :ok))) results))})))