From 28fed7c7994ff9d4d3f53fb2494669bb272d7e0f Mon Sep 17 00:00:00 2001 From: giles Date: Sun, 7 Jun 2026 12:32:14 +0000 Subject: [PATCH] =?UTF-8?q?artdag:=20fault-tolerant=20execution=20?= =?UTF-8?q?=E2=80=94=20confined=20failure,=20cache=20never=20poisoned=20+?= =?UTF-8?q?=2014=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fault.sx run-safe: a node op may return (artdag/fail reason); failure is confined to that node + downstream dependents while independent branches compute, and failed results are never cached, so retry after a fix recomputes only the failed closure and hits the good nodes. fault 14/14, total 158/158. Co-Authored-By: Claude Opus 4.8 (1M context) --- lib/artdag/conformance.sh | 3 +- lib/artdag/fault.sx | 56 +++++++++++++++ lib/artdag/scoreboard.json | 7 +- lib/artdag/scoreboard.md | 3 +- lib/artdag/tests/fault.sx | 144 +++++++++++++++++++++++++++++++++++++ plans/artdag-on-sx.md | 9 ++- 6 files changed, 216 insertions(+), 6 deletions(-) create mode 100644 lib/artdag/fault.sx create mode 100644 lib/artdag/tests/fault.sx diff --git a/lib/artdag/conformance.sh b/lib/artdag/conformance.sh index c2d19bba..7efa9077 100755 --- a/lib/artdag/conformance.sh +++ b/lib/artdag/conformance.sh @@ -13,7 +13,7 @@ if [ ! -x "$SX_SERVER" ]; then exit 1 fi -SUITES=(dag analyze plan execute optimize fed cost serialize stats) +SUITES=(dag analyze plan execute optimize fed cost serialize stats fault) OUT_JSON="lib/artdag/scoreboard.json" OUT_MD="lib/artdag/scoreboard.md" @@ -50,6 +50,7 @@ run_suite() { (load "lib/artdag/cost.sx") (load "lib/artdag/serialize.sx") (load "lib/artdag/stats.sx") +(load "lib/artdag/fault.sx") (epoch 2) (eval "(define artdag-test-pass 0)") (eval "(define artdag-test-fail 0)") diff --git a/lib/artdag/fault.sx b/lib/artdag/fault.sx new file mode 100644 index 00000000..fb579199 --- /dev/null +++ b/lib/artdag/fault.sx @@ -0,0 +1,56 @@ +; lib/artdag/fault.sx — fault-tolerant execution. A node op may fail by returning +; (artdag/fail reason); the failure is confined to that node and its transitive +; dependents (which cannot run without it), while independent branches still +; compute. Failed results are NEVER cached, so a later run with the fault fixed +; recomputes only the failed closure. Depends on execute.sx and plan.sx. + +(define artdag/fail (fn (reason) {:artdag-fail true :reason reason})) +(define artdag/failed? (fn (v) (and (dict? v) (has-key? v :artdag-fail)))) + +(define + artdag/-exec-safe-node + (fn + (dag runner cache acc id) + (let + ((node (artdag/dag-get dag id))) + (let + ((ins (artdag/node-inputs node))) + (if + (some (fn (in) (artdag/member? in (get acc :failed))) ins) + (assoc acc :failed (concat (get acc :failed) (list id))) + (if + (persist/kv-has? cache id) + (assoc + acc + :results (assoc (get acc :results) id (persist/kv-get cache id)) + :hits (concat (get acc :hits) (list id))) + (let + ((inputs (map (fn (in) (artdag/-input-result (get acc :results) cache in)) ins))) + (let + ((result (runner (artdag/node-op node) (artdag/node-params node) inputs))) + (if + (artdag/failed? result) + (assoc acc :failed (concat (get acc :failed) (list id))) + (begin + (persist/kv-put cache id result) + (assoc + acc + :results (assoc (get acc :results) id result) + :recomputed (concat (get acc :recomputed) (list id))))))))))))) + +(define + artdag/run-safe + (fn + (dag runner cache) + (reduce + (fn (acc id) (artdag/-exec-safe-node dag runner cache acc id)) + {:recomputed (list) :results {} :hits (list) :failed (list)} + (artdag/plan-flatten (artdag/plan dag 0))))) + +(define + artdag/failed-nodes + (fn (exec) (artdag/sort-strings (get exec :failed)))) +(define artdag/failure-count (fn (exec) (len (get exec :failed)))) +(define + artdag/all-ok? + (fn (exec) (= (len (get exec :failed)) 0))) diff --git a/lib/artdag/scoreboard.json b/lib/artdag/scoreboard.json index 2b4698d7..52d1d93b 100644 --- a/lib/artdag/scoreboard.json +++ b/lib/artdag/scoreboard.json @@ -8,9 +8,10 @@ "fed": {"pass": 15, "fail": 0}, "cost": {"pass": 13, "fail": 0}, "serialize": {"pass": 13, "fail": 0}, - "stats": {"pass": 12, "fail": 0} + "stats": {"pass": 12, "fail": 0}, + "fault": {"pass": 14, "fail": 0} }, - "total_pass": 144, + "total_pass": 158, "total_fail": 0, - "total": 144 + "total": 158 } diff --git a/lib/artdag/scoreboard.md b/lib/artdag/scoreboard.md index 5e61f292..ea91478b 100644 --- a/lib/artdag/scoreboard.md +++ b/lib/artdag/scoreboard.md @@ -13,4 +13,5 @@ _Generated by `lib/artdag/conformance.sh`_ | cost | 13 | 0 | 13 | | serialize | 13 | 0 | 13 | | stats | 12 | 0 | 12 | -| **Total** | **144** | **0** | **144** | +| fault | 14 | 0 | 14 | +| **Total** | **158** | **0** | **158** | diff --git a/lib/artdag/tests/fault.sx b/lib/artdag/tests/fault.sx new file mode 100644 index 00000000..a7bcbc0a --- /dev/null +++ b/lib/artdag/tests/fault.sx @@ -0,0 +1,144 @@ +; fault-tolerant execution: failure confined to its closure, cache never poisoned. + +(define ft-BAD (artdag/op-table-runner {:boom (fn (p i) (artdag/fail "kaboom")) :in (fn (p i) (get p :v)) :add (fn (p i) (+ (nth i 0) (nth i 1))) :inc (fn (p i) (+ 1 (first i)))})) + +(define ft-GOOD (artdag/op-table-runner {:boom (fn (p i) 99) :in (fn (p i) (get p :v)) :add (fn (p i) (+ (nth i 0) (nth i 1))) :inc (fn (p i) (+ 1 (first i)))})) + +; p,q leaves; b=inc(p) (independent); c=boom(q); d=add(b,c) +(define + ft-D + (artdag/build + (list + (list "p" "in" (list) {:v 10}) + (list "q" "in" (list) {:v 20}) + (list "b" "inc" (list "p") {}) + (list "c" "boom" (list "q") {}) + (list "d" "add" (list "b" "c") {} true)))) + +; ---- markers ---- + +(artdag-test + "fail constructor is detected" + (artdag/failed? (artdag/fail "x")) + true) + +(artdag-test + "plain values are not failures" + (artdag/failed? 42) + false) + +; ---- failure confinement ---- + +(artdag-test + "failure count covers node and its dependents" + (let + ((cache (persist/open))) + (artdag/failure-count (artdag/run-safe ft-D ft-BAD cache))) + 2) + +(artdag-test + "failed set is exactly c and d" + (let + ((cache (persist/open))) + (artdag/failed-nodes (artdag/run-safe ft-D ft-BAD cache))) + (artdag/sort-strings + (list (artdag/dag-id ft-D "c") (artdag/dag-id ft-D "d")))) + +(artdag-test + "independent branch still computes" + (let + ((cache (persist/open))) + (artdag/recompute-count (artdag/run-safe ft-D ft-BAD cache))) + 3) + +(artdag-test + "independent node result is available" + (let + ((cache (persist/open))) + (artdag/result-of + (artdag/run-safe ft-D ft-BAD cache) + (artdag/dag-id ft-D "b"))) + 11) + +(artdag-test + "all-ok? is false when something failed" + (let + ((cache (persist/open))) + (artdag/all-ok? (artdag/run-safe ft-D ft-BAD cache))) + false) + +(artdag-test + "all-ok? is true on a clean run" + (let + ((cache (persist/open))) + (artdag/all-ok? (artdag/run-safe ft-D ft-GOOD cache))) + true) + +; ---- cache integrity ---- + +(artdag-test + "good node is cached" + (let + ((cache (persist/open))) + (begin + (artdag/run-safe ft-D ft-BAD cache) + (persist/kv-has? cache (artdag/dag-id ft-D "b")))) + true) + +(artdag-test + "failed node is never cached" + (let + ((cache (persist/open))) + (begin + (artdag/run-safe ft-D ft-BAD cache) + (persist/kv-has? cache (artdag/dag-id ft-D "c")))) + false) + +; ---- retry after fix ---- + +(artdag-test + "retry recomputes only the failed closure" + (let + ((cache (persist/open))) + (begin + (artdag/run-safe ft-D ft-BAD cache) + (artdag/recompute-count (artdag/run-safe ft-D ft-GOOD cache)))) + 2) + +(artdag-test + "retry reuses the good nodes from cache" + (let + ((cache (persist/open))) + (begin + (artdag/run-safe ft-D ft-BAD cache) + (artdag/hit-count (artdag/run-safe ft-D ft-GOOD cache)))) + 3) + +(artdag-test + "retry produces the correct result" + (let + ((cache (persist/open))) + (begin + (artdag/run-safe ft-D ft-BAD cache) + (artdag/result-of + (artdag/run-safe ft-D ft-GOOD cache) + (artdag/dag-id ft-D "d")))) + 110) + +; ---- transitive cascade ---- + +(artdag-test + "failure cascades through a deep chain" + (let + ((cache (persist/open))) + (artdag/failure-count + (artdag/run-safe + (artdag/build + (list + (list "a" "in" (list) {:v 1}) + (list "b" "boom" (list "a") {}) + (list "c" "inc" (list "b") {}) + (list "d" "inc" (list "c") {}))) + ft-BAD + cache))) + 3) diff --git a/plans/artdag-on-sx.md b/plans/artdag-on-sx.md index 1a105d98..f5f7cbab 100644 --- a/plans/artdag-on-sx.md +++ b/plans/artdag-on-sx.md @@ -30,7 +30,7 @@ edges. ## Status (rolling) -`bash lib/artdag/conformance.sh` → **144/144** (9 suites: dag, analyze, plan, execute, optimize, fed, cost, serialize, stats) +`bash lib/artdag/conformance.sh` → **158/158** (10 suites: dag, analyze, plan, execute, optimize, fed, cost, serialize, stats, fault) Base roadmap (Phases 1–6) COMPLETE. Now extending. @@ -138,6 +138,13 @@ lib/artdag/optimize.sx lib/artdag/federation.sx ## Progress log +- **Ext: fault-tolerant execution** (fault suite 14/14, total 158/158). + `lib/artdag/fault.sx`: a node op may fail via `(artdag/fail reason)`; `run-safe` + confines the failure to that node + its transitive dependents (independent branches + still compute) and NEVER caches a failed result, so a later run with the fault fixed + recomputes only the failed closure and cache-hits the good nodes. `failed?`/`fail` + markers, `failed-nodes`/`failure-count`/`all-ok?`. + - **Ext: execution stats / cache analytics** (stats suite 12/12, total 144/144). `lib/artdag/stats.sx` over an exec record: `hit-ratio`, `work-recomputed`/`work-saved` (cost-weighted via the cost model), `savings-ratio`, and `exec-summary`. Cold run =