artdag: fault-tolerant execution — confined failure, cache never poisoned + 14 tests
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 1m4s
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 1m4s
fault.sx run-safe: a node op may return (artdag/fail reason); failure is confined to that node + downstream dependents while independent branches compute, and failed results are never cached, so retry after a fix recomputes only the failed closure and hits the good nodes. fault 14/14, total 158/158. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -13,7 +13,7 @@ if [ ! -x "$SX_SERVER" ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
SUITES=(dag analyze plan execute optimize fed cost serialize stats)
|
||||
SUITES=(dag analyze plan execute optimize fed cost serialize stats fault)
|
||||
|
||||
OUT_JSON="lib/artdag/scoreboard.json"
|
||||
OUT_MD="lib/artdag/scoreboard.md"
|
||||
@@ -50,6 +50,7 @@ run_suite() {
|
||||
(load "lib/artdag/cost.sx")
|
||||
(load "lib/artdag/serialize.sx")
|
||||
(load "lib/artdag/stats.sx")
|
||||
(load "lib/artdag/fault.sx")
|
||||
(epoch 2)
|
||||
(eval "(define artdag-test-pass 0)")
|
||||
(eval "(define artdag-test-fail 0)")
|
||||
|
||||
56
lib/artdag/fault.sx
Normal file
56
lib/artdag/fault.sx
Normal file
@@ -0,0 +1,56 @@
|
||||
; lib/artdag/fault.sx — fault-tolerant execution. A node op may fail by returning
|
||||
; (artdag/fail reason); the failure is confined to that node and its transitive
|
||||
; dependents (which cannot run without it), while independent branches still
|
||||
; compute. Failed results are NEVER cached, so a later run with the fault fixed
|
||||
; recomputes only the failed closure. Depends on execute.sx and plan.sx.
|
||||
|
||||
; Build the failure marker a node op returns instead of a value.
;   reason — caller-supplied description of the failure, stored under :reason.
; Returns a dict carrying :artdag-fail true, which artdag/failed? recognises.
(define
  artdag/fail
  (fn
    (reason)
    {:artdag-fail true :reason reason}))
|
||||
; Predicate: is `v` a failure marker?
; Only dicts can be markers; a dict counts as one when it has the
; :artdag-fail key (the key's value is not inspected).
(define
  artdag/failed?
  (fn
    (v)
    (if
      (dict? v)
      (has-key? v :artdag-fail)
      false)))
|
||||
|
||||
; Append `id` to the accumulator's :failed list and return the updated accumulator.
(define
  artdag/-safe-mark-failed
  (fn (acc id) (assoc acc :failed (concat (get acc :failed) (list id)))))

; Run node `id` through `runner` and fold the outcome into `acc`.
;   ins — the node's input ids; each is resolved via artdag/-input-result
;         against the results gathered so far and the cache.
; A failure marker result is recorded under :failed and is NOT written to the
; cache; any other result is cached under `id` and recorded under :results
; and :recomputed.
(define
  artdag/-safe-compute
  (fn
    (runner cache acc id node ins)
    (let
      ((inputs (map (fn (in) (artdag/-input-result (get acc :results) cache in)) ins)))
      (let
        ((result (runner (artdag/node-op node) (artdag/node-params node) inputs)))
        (if
          (artdag/failed? result)
          (artdag/-safe-mark-failed acc id)
          (begin
            (persist/kv-put cache id result)
            (assoc
              acc
              :results (assoc (get acc :results) id result)
              :recomputed (concat (get acc :recomputed) (list id)))))))))

; Execute one node of the plan with failure confinement (reduce step of
; artdag/run-safe).
;   dag    — the DAG the node is looked up in
;   runner — called as (runner op params inputs); may return (artdag/fail reason)
;   cache  — persist kv store keyed by node id
;   acc    — dict with :results, :hits, :recomputed and :failed
;   id     — id of the node to execute
; Returns the updated accumulator. Decision order:
;   1. any input already in :failed  -> this node is failed too (never run)
;   2. cache holds a usable value    -> cache hit
;   3. otherwise                     -> compute via the runner
; A cached value that is itself a failure marker is treated as a miss and
; recomputed: this function never stores one, but another writer sharing the
; store could have, and serving it as a hit would report the failure as a
; success and feed the marker to dependents as an input.
(define
  artdag/-exec-safe-node
  (fn
    (dag runner cache acc id)
    (let
      ((node (artdag/dag-get dag id)))
      (let
        ((ins (artdag/node-inputs node)))
        (if
          (some (fn (in) (artdag/member? in (get acc :failed))) ins)
          (artdag/-safe-mark-failed acc id)
          (if
            (persist/kv-has? cache id)
            (let
              ((cached (persist/kv-get cache id)))
              (if
                (artdag/failed? cached)
                (artdag/-safe-compute runner cache acc id node ins)
                (assoc
                  acc
                  :results (assoc (get acc :results) id cached)
                  :hits (concat (get acc :hits) (list id)))))
            (artdag/-safe-compute runner cache acc id node ins)))))))
|
||||
|
||||
; Fault-tolerant run of `dag`.
;   dag    — the DAG to execute
;   runner — node op runner, passed through to artdag/-exec-safe-node
;   cache  — persist kv store, passed through to artdag/-exec-safe-node
; Folds artdag/-exec-safe-node over the flattened plan, starting from an
; empty exec record, and returns the final record with keys
; :recomputed, :results, :hits and :failed.
; NOTE(review): the meaning of the 0 passed to artdag/plan is not visible
; here — confirm against plan.sx.
(define
  artdag/run-safe
  (fn
    (dag runner cache)
    (let
      ((seed {:recomputed (list) :results {} :hits (list) :failed (list)})
       (order (artdag/plan-flatten (artdag/plan dag 0))))
      (reduce
        (fn (state node-id) (artdag/-exec-safe-node dag runner cache state node-id))
        seed
        order))))
|
||||
|
||||
; Ids recorded under :failed in an exec record, passed through
; artdag/sort-strings.
(define
  artdag/failed-nodes
  (fn
    (exec)
    (let
      ((failed-ids (get exec :failed)))
      (artdag/sort-strings failed-ids))))
|
||||
; Number of ids recorded under :failed in an exec record.
(define
  artdag/failure-count
  (fn
    (exec)
    (let
      ((failed-ids (get exec :failed)))
      (len failed-ids))))
|
||||
; True when the exec record's :failed list is empty.
(define
  artdag/all-ok?
  (fn
    (exec)
    (let
      ((failed-total (len (get exec :failed))))
      (= failed-total 0))))
|
||||
@@ -8,9 +8,10 @@
|
||||
"fed": {"pass": 15, "fail": 0},
|
||||
"cost": {"pass": 13, "fail": 0},
|
||||
"serialize": {"pass": 13, "fail": 0},
|
||||
"stats": {"pass": 12, "fail": 0}
|
||||
"stats": {"pass": 12, "fail": 0},
|
||||
"fault": {"pass": 14, "fail": 0}
|
||||
},
|
||||
"total_pass": 144,
|
||||
"total_pass": 158,
|
||||
"total_fail": 0,
|
||||
"total": 144
|
||||
"total": 158
|
||||
}
|
||||
|
||||
@@ -13,4 +13,5 @@ _Generated by `lib/artdag/conformance.sh`_
|
||||
| cost | 13 | 0 | 13 |
|
||||
| serialize | 13 | 0 | 13 |
|
||||
| stats | 12 | 0 | 12 |
|
||||
| **Total** | **144** | **0** | **144** |
|
||||
| fault | 14 | 0 | 14 |
|
||||
| **Total** | **158** | **0** | **158** |
|
||||
|
||||
144
lib/artdag/tests/fault.sx
Normal file
144
lib/artdag/tests/fault.sx
Normal file
@@ -0,0 +1,144 @@
|
||||
; fault-tolerant execution: failure confined to its closure, cache never poisoned.
|
||||
|
||||
; Runner with a broken :boom op: it always returns a failure marker.
; The other ops: :in yields the node's :v param, :add sums its first two
; inputs, :inc adds 1 to its first input.
(define
  ft-BAD
  (artdag/op-table-runner
    {:boom (fn (p i) (artdag/fail "kaboom"))
     :in (fn (p i) (get p :v))
     :add (fn (p i) (+ (nth i 0) (nth i 1)))
     :inc (fn (p i) (+ 1 (first i)))}))
|
||||
|
||||
; Runner with the fault "fixed": :boom returns the plain value 99.
; The :in, :add and :inc ops are the same as in ft-BAD.
(define
  ft-GOOD
  (artdag/op-table-runner
    {:boom (fn (p i) 99)
     :in (fn (p i) (get p :v))
     :add (fn (p i) (+ (nth i 0) (nth i 1)))
     :inc (fn (p i) (+ 1 (first i)))}))
|
||||
|
||||
; Fixture DAG shared by the tests below.
;   p, q — leaf inputs (10 and 20)
;   b    — inc(p): independent of the faulty branch
;   c    — boom(q): the node that fails under ft-BAD
;   d    — add(b, c): depends on the faulty node
; NOTE(review): the trailing `true` on d is presumably an output flag —
; confirm against artdag/build.
(define
  ft-D
  (artdag/build
    (list
      (list "p" "in" (list) {:v 10})
      (list "q" "in" (list) {:v 20})
      (list "b" "inc" (list "p") {})
      (list "c" "boom" (list "q") {})
      (list "d" "add" (list "b" "c") {} true))))
|
||||
|
||||
; ---- markers ----
|
||||
|
||||
; A value built by artdag/fail is recognised as a failure marker.
(artdag-test
  "fail constructor is detected"
  (let
    ((marker (artdag/fail "x")))
    (artdag/failed? marker))
  true)

; A non-dict value is not a failure marker.
(artdag-test
  "plain values are not failures"
  (let
    ((plain 42))
    (artdag/failed? plain))
  false)
|
||||
|
||||
; ---- failure confinement ----
|
||||
|
||||
; Under ft-BAD, c fails and d depends on c: two failed nodes.
(artdag-test
  "failure count covers node and its dependents"
  (let
    ((store (persist/open)))
    (let
      ((exec (artdag/run-safe ft-D ft-BAD store)))
      (artdag/failure-count exec)))
  2)

; The failed ids are those of c and d, compared in sorted order.
(artdag-test
  "failed set is exactly c and d"
  (let
    ((store (persist/open)))
    (let
      ((exec (artdag/run-safe ft-D ft-BAD store)))
      (artdag/failed-nodes exec)))
  (artdag/sort-strings
    (list (artdag/dag-id ft-D "c") (artdag/dag-id ft-D "d"))))

; p, q and b do not depend on the fault and are computed: three recomputes.
(artdag-test
  "independent branch still computes"
  (let
    ((store (persist/open)))
    (let
      ((exec (artdag/run-safe ft-D ft-BAD store)))
      (artdag/recompute-count exec)))
  3)

; b = inc(p) = 11 is present in the results despite the failure elsewhere.
(artdag-test
  "independent node result is available"
  (let
    ((store (persist/open)))
    (let
      ((exec (artdag/run-safe ft-D ft-BAD store)))
      (artdag/result-of exec (artdag/dag-id ft-D "b"))))
  11)

; A run with failures is not all-ok.
(artdag-test
  "all-ok? is false when something failed"
  (let
    ((store (persist/open)))
    (let
      ((exec (artdag/run-safe ft-D ft-BAD store)))
      (artdag/all-ok? exec)))
  false)

; With ft-GOOD nothing fails, so the run is all-ok.
(artdag-test
  "all-ok? is true on a clean run"
  (let
    ((store (persist/open)))
    (let
      ((exec (artdag/run-safe ft-D ft-GOOD store)))
      (artdag/all-ok? exec)))
  true)
|
||||
|
||||
; ---- cache integrity ----
|
||||
|
||||
; After a faulty run, the successfully computed node b is in the cache.
(artdag-test
  "good node is cached"
  (let
    ((store (persist/open)))
    (begin
      (artdag/run-safe ft-D ft-BAD store)
      (let
        ((b-id (artdag/dag-id ft-D "b")))
        (persist/kv-has? store b-id))))
  true)

; After a faulty run, the failed node c has no cache entry.
(artdag-test
  "failed node is never cached"
  (let
    ((store (persist/open)))
    (begin
      (artdag/run-safe ft-D ft-BAD store)
      (let
        ((c-id (artdag/dag-id ft-D "c")))
        (persist/kv-has? store c-id))))
  false)
|
||||
|
||||
; ---- retry after fix ----
|
||||
|
||||
; Faulty run first, then a fixed run on the same store: only c and d are
; recomputed.
(artdag-test
  "retry recomputes only the failed closure"
  (let
    ((store (persist/open)))
    (begin
      (artdag/run-safe ft-D ft-BAD store)
      (let
        ((retry (artdag/run-safe ft-D ft-GOOD store)))
        (artdag/recompute-count retry))))
  2)

; On the retry, p, q and b are served from the cache: three hits.
(artdag-test
  "retry reuses the good nodes from cache"
  (let
    ((store (persist/open)))
    (begin
      (artdag/run-safe ft-D ft-BAD store)
      (let
        ((retry (artdag/run-safe ft-D ft-GOOD store)))
        (artdag/hit-count retry))))
  3)

; On the retry, d = add(b, c) = 11 + 99 = 110.
(artdag-test
  "retry produces the correct result"
  (let
    ((store (persist/open)))
    (begin
      (artdag/run-safe ft-D ft-BAD store)
      (let
        ((retry (artdag/run-safe ft-D ft-GOOD store)))
        (artdag/result-of retry (artdag/dag-id ft-D "d")))))
  110)
|
||||
|
||||
; ---- transitive cascade ----
|
||||
|
||||
; Chain a -> b -> c -> d where b is the boom node: b fails and the failure
; propagates to c and d, giving three failed nodes (a is unaffected).
(artdag-test
  "failure cascades through a deep chain"
  (let
    ((store (persist/open))
     (chain
       (artdag/build
         (list
           (list "a" "in" (list) {:v 1})
           (list "b" "boom" (list "a") {})
           (list "c" "inc" (list "b") {})
           (list "d" "inc" (list "c") {})))))
    (artdag/failure-count (artdag/run-safe chain ft-BAD store)))
  3)
|
||||
@@ -30,7 +30,7 @@ edges.
|
||||
|
||||
## Status (rolling)
|
||||
|
||||
`bash lib/artdag/conformance.sh` → **144/144** (9 suites: dag, analyze, plan, execute, optimize, fed, cost, serialize, stats)
|
||||
`bash lib/artdag/conformance.sh` → **158/158** (10 suites: dag, analyze, plan, execute, optimize, fed, cost, serialize, stats, fault)
|
||||
|
||||
Base roadmap (Phases 1–6) COMPLETE. Now extending.
|
||||
|
||||
@@ -138,6 +138,13 @@ lib/artdag/optimize.sx lib/artdag/federation.sx
|
||||
|
||||
## Progress log
|
||||
|
||||
- **Ext: fault-tolerant execution** (fault suite 14/14, total 158/158).
|
||||
`lib/artdag/fault.sx`: a node op may fail via `(artdag/fail reason)`; `run-safe`
|
||||
confines the failure to that node + its transitive dependents (independent branches
|
||||
still compute) and NEVER caches a failed result, so a later run with the fault fixed
|
||||
recomputes only the failed closure and cache-hits the good nodes. `failed?`/`fail`
|
||||
markers, `failed-nodes`/`failure-count`/`all-ok?`.
|
||||
|
||||
- **Ext: execution stats / cache analytics** (stats suite 12/12, total 144/144).
|
||||
`lib/artdag/stats.sx` over an exec record: `hit-ratio`, `work-recomputed`/`work-saved`
|
||||
(cost-weighted via the cost model), `savings-ratio`, and `exec-summary`. Cold run =
|
||||
|
||||
Reference in New Issue
Block a user