From 3cc760082cc1133279fed5addc8a462ad13944c8 Mon Sep 17 00:00:00 2001 From: giles Date: Fri, 8 May 2026 08:42:10 +0000 Subject: [PATCH] datalog: hash-set membership for facts (Phase 5b perf) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit db gains a parallel :facts-keys {rel-key → {tuple-key → true}} index alongside :facts. dl-tuple-key derives a stable string via (str lit) — (p 30) and (p 30.0) collide correctly because SX prints them identically. dl-add-fact! membership is now O(1) instead of O(n) list scan; insert sequences for relations sized N drop from O(N²) to O(N). Wall clock on chain-7 saturation halves (~12s → ~6s); chain-15 roughly halves (~50s → ~25s) under shared CPU. Larger chains still slow due to body-join overhead in dl-find-bindings — Blocker entry refreshed with proposed follow-ups. dl-retract! keeps both indices consistent: kept-keys is rebuilt during the EDB filter, IDB wipes clear both lists and key dicts. --- lib/datalog/api.sx | 16 ++++++++++++---- lib/datalog/db.sx | 22 +++++++++++++++++----- plans/datalog-on-sx.md | 24 ++++++++++++++++++------ 3 files changed, 47 insertions(+), 15 deletions(-) diff --git a/lib/datalog/api.sx b/lib/datalog/api.sx index 37ae9d6c..01abec95 100644 --- a/lib/datalog/api.sx +++ b/lib/datalog/api.sx @@ -111,21 +111,29 @@ (has-key? (get db :facts) rel-key) (let ((existing (get (get db :facts) rel-key)) - (kept (list))) + (kept (list)) + (kept-keys {})) (do (for-each (fn (t) (when (not (dl-tuple-equal? t lit)) - (append! kept t))) + (do + (append! kept t) + (dict-set! kept-keys (dl-tuple-key t) true)))) existing) - (dict-set! (get db :facts) rel-key kept)))) + (dict-set! (get db :facts) rel-key kept) + (dict-set! (get db :facts-keys) rel-key kept-keys)))) ;; Wipe all relations that have a rule (these are IDB) so the ;; saturator regenerates them from the surviving EDB. (let ((rule-heads (dl-rule-head-rels db))) (for-each - (fn (k) (dict-set! 
(get db :facts) k (list))) + (fn + (k) + (do + (dict-set! (get db :facts) k (list)) + (dict-set! (get db :facts-keys) k {}))) rule-heads)) (dl-saturate! db) db)))) diff --git a/lib/datalog/db.sx b/lib/datalog/db.sx index 620f565c..6a3f69a0 100644 --- a/lib/datalog/db.sx +++ b/lib/datalog/db.sx @@ -12,7 +12,7 @@ ;; lib/datalog/builtins.sx) swaps in the real `dl-rule-check-safety`, ;; which is order-aware and understands built-in predicates. -(define dl-make-db (fn () {:facts {} :rules (list)})) +(define dl-make-db (fn () {:facts {} :facts-keys {} :rules (list)})) (define dl-rel-name @@ -97,13 +97,19 @@ (fn (db rel-key) (let - ((facts (get db :facts))) + ((facts (get db :facts)) + (fk (get db :facts-keys))) (do (when (not (has-key? facts rel-key)) (dict-set! facts rel-key (list))) + (when + (not (has-key? fk rel-key)) + (dict-set! fk rel-key {})) (get facts rel-key))))) +(define dl-tuple-key (fn (lit) (str lit))) + (define dl-rel-tuples (fn @@ -125,10 +131,16 @@ (let ((rel-key (dl-rel-name lit))) (let - ((tuples (dl-ensure-rel! db rel-key))) + ((tuples (dl-ensure-rel! db rel-key)) + (key-dict (get (get db :facts-keys) rel-key)) + (tk (dl-tuple-key lit))) (cond - ((dl-tuple-member? lit tuples) false) - (else (do (append! tuples lit) true))))))))) + ((has-key? key-dict tk) false) + (else + (do + (dict-set! key-dict tk true) + (append! tuples lit) + true))))))))) ;; The full safety check lives in builtins.sx (it has to know which ;; predicates are built-ins). dl-add-rule! calls it via forward diff --git a/plans/datalog-on-sx.md b/plans/datalog-on-sx.md index 21c8123f..b9990c17 100644 --- a/plans/datalog-on-sx.md +++ b/plans/datalog-on-sx.md @@ -246,17 +246,29 @@ large graphs. ## Blockers -- **Hash-set membership for relations.** `dl-tuple-member?` uses a linear - list scan; insert is O(n) and saturating chain-N pushes O(n²) → O(n³) - total. Under bundled conformance (CPU shared with other loop agents) - even chain-15 hits multi-minute wall-clock. 
Tests scoped to chain-5 - for now. Fix: maintain a `{tuple-key → true}` dict per relation - alongside the list; key tuples by their serialized form. +- **Saturation perf on long chains.** Resolved one bottleneck (hash-set + membership in `dl-add-fact!`) but `dl-saturate!` still spends + significant time per iteration on rule body joins — chain-15 takes + ~25s real / 3s user under contention even after the membership fix. + Two follow-ups to consider: (a) avoid `(rest lits)` in + `dl-find-bindings`/`dl-fbs-aux` (uses indexed iteration like the + membership fix), (b) memoize the per-rule body shape so `(len lits)` + and accessor calls don't re-walk the list each step. ## Progress log _Newest first._ +- 2026-05-08 — Phase 5b perf: hash-set membership in `dl-add-fact!`. + db gains a parallel `:facts-keys {rel-key → {tuple-key → true}}` + index alongside `:facts`. `dl-tuple-key` derives a stable string + key via `(str lit)` — `(p 30)` and `(p 30.0)` collide correctly + because SX prints them identically. Insertion is O(1) instead of + O(n). chain-7 saturation drops from ~12s to ~6s; chain-15 from + ~50s to ~25s under shared CPU. Larger chains are still slow due + to body-join overhead in dl-find-bindings (Blocker updated). + `dl-retract!` updated to keep both indices consistent. 143/143. + - 2026-05-08 — Phase 9 done. New `lib/datalog/api.sx` exposes a parser-free embedding: `dl-program-data facts rules` accepts SX data lists, with rules in either dict form or list form using