Files
rose-ash/lib/artdag/fault.sx
giles 28fed7c799
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 1m4s
artdag: fault-tolerant execution — confined failure, cache never poisoned + 14 tests
fault.sx run-safe: a node op may return (artdag/fail reason); failure is confined
to that node + downstream dependents while independent branches compute, and failed
results are never cached, so retry after a fix recomputes only the failed closure
and hits the good nodes. fault 14/14, total 158/158.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-07 12:32:14 +00:00

57 lines
2.2 KiB
Plaintext

; lib/artdag/fault.sx — fault-tolerant execution. A node op may fail by returning
; (artdag/fail reason); the failure is confined to that node and its transitive
; dependents (which cannot run without it), while independent branches still
; compute. Failed results are NEVER cached, so a later run with the fault fixed
; recomputes only the failed closure. Depends on execute.sx and plan.sx.
(define artdag/fail (fn (reason) {:artdag-fail true :reason reason}))
(define artdag/failed? (fn (v) (and (dict? v) (has-key? v :artdag-fail))))
(define
artdag/-exec-safe-node
(fn
(dag runner cache acc id)
(let
((node (artdag/dag-get dag id)))
(let
((ins (artdag/node-inputs node)))
(if
(some (fn (in) (artdag/member? in (get acc :failed))) ins)
(assoc acc :failed (concat (get acc :failed) (list id)))
(if
(persist/kv-has? cache id)
(assoc
acc
:results (assoc (get acc :results) id (persist/kv-get cache id))
:hits (concat (get acc :hits) (list id)))
(let
((inputs (map (fn (in) (artdag/-input-result (get acc :results) cache in)) ins)))
(let
((result (runner (artdag/node-op node) (artdag/node-params node) inputs)))
(if
(artdag/failed? result)
(assoc acc :failed (concat (get acc :failed) (list id)))
(begin
(persist/kv-put cache id result)
(assoc
acc
:results (assoc (get acc :results) id result)
:recomputed (concat (get acc :recomputed) (list id)))))))))))))
(define
artdag/run-safe
(fn
(dag runner cache)
(reduce
(fn (acc id) (artdag/-exec-safe-node dag runner cache acc id))
{:recomputed (list) :results {} :hits (list) :failed (list)}
(artdag/plan-flatten (artdag/plan dag 0)))))
(define
artdag/failed-nodes
(fn (exec) (artdag/sort-strings (get exec :failed))))
(define artdag/failure-count (fn (exec) (len (get exec :failed))))
(define
artdag/all-ok?
(fn (exec) (= (len (get exec :failed)) 0)))