datalog: aggregation count/sum/min/max (Phase 8, 134/134)
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 50s

New lib/datalog/aggregates.sx: (count R V Goal), (sum R V Goal),
(min R V Goal), (max R V Goal). dl-eval-aggregate runs
dl-find-bindings on the goal under the outer subst, collects
distinct values of V, applies the operator, binds R. Empty input:
count/sum return 0; min/max produce no binding (rule fails).

Group-by emerges naturally from outer-subst substitution into the
goal — `popular(P) :- post(P), count(N, U, liked(U, P)), >=(N, 3).`
counts per-post.

Stratifier extended: dl-aggregate-dep-edge contributes a
negation-like edge so the aggregate's goal relation is fully
derived before the aggregate fires (non-monotonicity respected).
Safety relaxed for aggregates: goal-internal vars are existentials,
only the result var becomes bound.
This commit is contained in:
2026-05-08 08:28:45 +00:00
parent caec05eb27
commit 6d04cf7bf2
9 changed files with 447 additions and 27 deletions

139
lib/datalog/aggregates.sx Normal file
View File

@@ -0,0 +1,139 @@
;; lib/datalog/aggregates.sx — count / sum / min / max aggregation.
;;
;; Surface form (always 3-arg in the parsed AST):
;;
;; (count Result Var GoalLit)
;; (sum Result Var GoalLit)
;; (min Result Var GoalLit)
;; (max Result Var GoalLit)
;;
;; Parsed naturally because arg-position compounds are already allowed
;; (Phase 4 needs them for arithmetic). At evaluation time the aggregator
;; runs `dl-find-bindings` on `GoalLit` under the current subst, collects
;; the distinct values of `Var`, and binds `Result`.
;;
;; Aggregation is non-monotonic — `count(C, X, p(X))` shrinks as p loses
;; tuples. The stratifier (lib/datalog/strata.sx) treats every aggregate's
;; goal relation as a negation-like edge so the inner relation is fully
;; derived before the aggregate fires.
;;
;; Empty input: count → 0, sum → 0, min/max → no binding (rule fails).
(define dl-aggregate-rels (list "count" "sum" "min" "max"))
(define
dl-aggregate?
(fn
(lit)
(and
(list? lit)
(>= (len lit) 4)
(let ((rel (dl-rel-name lit)))
(cond
((nil? rel) false)
(else (dl-member-string? rel dl-aggregate-rels)))))))
;; Apply aggregation operator to a list of (already-distinct) numeric or
;; symbolic values. Returns the aggregated value, or :empty if min/max
;; has no input.
(define
dl-do-aggregate
(fn
(op vals)
(cond
((= op "count") (len vals))
((= op "sum") (dl-sum-vals vals 0))
((= op "min")
(cond
((= (len vals) 0) :empty)
(else (dl-min-vals vals 1 (first vals)))))
((= op "max")
(cond
((= (len vals) 0) :empty)
(else (dl-max-vals vals 1 (first vals)))))
(else (error (str "datalog: unknown aggregate " op))))))
(define
dl-sum-vals
(fn
(vals acc)
(cond
((= (len vals) 0) acc)
(else (dl-sum-vals (rest vals) (+ acc (first vals)))))))
(define
dl-min-vals
(fn
(vals i cur)
(cond
((>= i (len vals)) cur)
(else
(let ((v (nth vals i)))
(dl-min-vals vals (+ i 1) (if (< v cur) v cur)))))))
(define
dl-max-vals
(fn
(vals i cur)
(cond
((>= i (len vals)) cur)
(else
(let ((v (nth vals i)))
(dl-max-vals vals (+ i 1) (if (> v cur) v cur)))))))
;; Membership check by deep equality (so 30 == 30.0 etc).
(define
dl-val-member?
(fn
(v xs)
(cond
((= (len xs) 0) false)
((dl-tuple-equal? v (first xs)) true)
(else (dl-val-member? v (rest xs))))))
;; Evaluate an aggregate body lit under `subst`. Returns the list of
;; extended substitutions (0 or 1 element).
(define
dl-eval-aggregate
(fn
(lit db subst)
(let
((op (dl-rel-name lit))
(result-var (nth lit 1))
(agg-var (nth lit 2))
(goal (nth lit 3)))
(let
((substs (dl-find-bindings (list goal) db subst))
(vals (list)))
(do
(for-each
(fn
(s)
(let ((v (dl-apply-subst agg-var s)))
(when
(not (dl-val-member? v vals))
(append! vals v))))
substs)
(let ((agg-val (dl-do-aggregate op vals)))
(cond
((= agg-val :empty) (list))
(else
(let ((s2 (dl-unify result-var agg-val subst)))
(if (nil? s2) (list) (list s2)))))))))))
;; Stratification edges from aggregates: like negation, the goal's
;; relation must be in a strictly lower stratum so that the aggregate
;; fires only after the underlying tuples are settled.
(define
dl-aggregate-dep-edge
(fn
(lit)
(cond
((dl-aggregate? lit)
(let ((goal (nth lit 3)))
(cond
((and (list? goal) (> (len goal) 0))
(let ((rel (dl-rel-name goal)))
(if (nil? rel) nil {:rel rel :neg true})))
(else nil))))
(else nil))))

View File

@@ -262,6 +262,19 @@
missing
" — they must be bound by an earlier "
"positive body literal")))))))
(define
dl-process-agg!
(fn
(lit)
(let
((result-var (nth lit 1)))
;; Aggregate goal vars are existentially quantified within
;; the aggregate; nothing required from outer context. The
;; result var becomes bound after the aggregate fires.
(when
(dl-var? result-var)
(dl-add-bound! (list (symbol->string result-var)))))))
(define
dl-process-lit!
(fn
@@ -271,6 +284,7 @@
(cond
((and (dict? lit) (has-key? lit :neg))
(dl-process-neg! lit))
((dl-aggregate? lit) (dl-process-agg! lit))
((dl-eq? lit) (dl-process-eq! lit))
((dl-is? lit) (dl-process-is! lit))
((dl-comparison? lit) (dl-process-cmp! lit))

View File

@@ -9,6 +9,7 @@ PRELOADS=(
lib/datalog/unify.sx
lib/datalog/db.sx
lib/datalog/builtins.sx
lib/datalog/aggregates.sx
lib/datalog/strata.sx
lib/datalog/eval.sx
)
@@ -21,4 +22,5 @@ SUITES=(
"builtins:lib/datalog/tests/builtins.sx:(dl-builtins-tests-run!)"
"semi_naive:lib/datalog/tests/semi_naive.sx:(dl-semi-naive-tests-run!)"
"negation:lib/datalog/tests/negation.sx:(dl-negation-tests-run!)"
"aggregates:lib/datalog/tests/aggregates.sx:(dl-aggregates-tests-run!)"
)

View File

@@ -206,6 +206,7 @@
(cond
((and (dict? lit) (has-key? lit :neg))
(dl-match-negation (get lit :neg) db subst))
((dl-aggregate? lit) (dl-eval-aggregate lit db subst))
((dl-builtin? lit)
(let
((s (dl-eval-builtin lit subst)))

View File

@@ -1,8 +1,8 @@
{
"lang": "datalog",
"total_passed": 124,
"total_passed": 134,
"total_failed": 0,
"total": 124,
"total": 134,
"suites": [
{"name":"tokenize","passed":26,"failed":0,"total":26},
{"name":"parse","passed":18,"failed":0,"total":18},
@@ -10,7 +10,8 @@
{"name":"eval","passed":15,"failed":0,"total":15},
{"name":"builtins","passed":19,"failed":0,"total":19},
{"name":"semi_naive","passed":8,"failed":0,"total":8},
{"name":"negation","passed":10,"failed":0,"total":10}
{"name":"negation","passed":10,"failed":0,"total":10},
{"name":"aggregates","passed":10,"failed":0,"total":10}
],
"generated": "2026-05-08T08:20:41+00:00"
"generated": "2026-05-08T08:28:29+00:00"
}

View File

@@ -1,6 +1,6 @@
# datalog scoreboard
**124 / 124 passing** (0 failure(s)).
**134 / 134 passing** (0 failure(s)).
| Suite | Passed | Total | Status |
|-------|--------|-------|--------|
@@ -11,3 +11,4 @@
| builtins | 19 | 19 | ok |
| semi_naive | 8 | 8 | ok |
| negation | 10 | 10 | ok |
| aggregates | 10 | 10 | ok |

View File

@@ -33,22 +33,30 @@
(for-each
(fn
(lit)
(let
((target
(cond
((and (dict? lit) (has-key? lit :neg))
(dl-rel-name (get lit :neg)))
((dl-builtin? lit) nil)
((and (list? lit) (> (len lit) 0))
(dl-rel-name lit))
(else nil)))
(neg?
(and (dict? lit) (has-key? lit :neg))))
(when
(not (nil? target))
(append!
existing
{:rel target :neg neg?}))))
(cond
((dl-aggregate? lit)
(let
((edge (dl-aggregate-dep-edge lit)))
(when
(not (nil? edge))
(append! existing edge))))
(else
(let
((target
(cond
((and (dict? lit) (has-key? lit :neg))
(dl-rel-name (get lit :neg)))
((dl-builtin? lit) nil)
((and (list? lit) (> (len lit) 0))
(dl-rel-name lit))
(else nil)))
(neg?
(and (dict? lit) (has-key? lit :neg))))
(when
(not (nil? target))
(append!
existing
{:rel target :neg neg?}))))))
(get rule :body)))))))
(dl-rules db))
g))))
@@ -79,6 +87,9 @@
(let
((t
(cond
((dl-aggregate? lit)
(let ((edge (dl-aggregate-dep-edge lit)))
(if (nil? edge) nil (get edge :rel))))
((and (dict? lit) (has-key? lit :neg))
(dl-rel-name (get lit :neg)))
((dl-builtin? lit) nil)

View File

@@ -0,0 +1,223 @@
;; lib/datalog/tests/aggregates.sx — count / sum / min / max.
(define dl-at-pass 0)
(define dl-at-fail 0)
(define dl-at-failures (list))
(define
dl-at-deep=?
(fn
(a b)
(cond
((and (list? a) (list? b))
(and (= (len a) (len b)) (dl-at-deq-l? a b 0)))
((and (dict? a) (dict? b))
(let ((ka (keys a)) (kb (keys b)))
(and (= (len ka) (len kb)) (dl-at-deq-d? a b ka 0))))
((and (number? a) (number? b)) (= a b))
(else (equal? a b)))))
(define
dl-at-deq-l?
(fn
(a b i)
(cond
((>= i (len a)) true)
((not (dl-at-deep=? (nth a i) (nth b i))) false)
(else (dl-at-deq-l? a b (+ i 1))))))
(define
dl-at-deq-d?
(fn
(a b ka i)
(cond
((>= i (len ka)) true)
((let ((k (nth ka i)))
(not (dl-at-deep=? (get a k) (get b k))))
false)
(else (dl-at-deq-d? a b ka (+ i 1))))))
(define
dl-at-set=?
(fn
(a b)
(and
(= (len a) (len b))
(dl-at-subset? a b)
(dl-at-subset? b a))))
(define
dl-at-subset?
(fn
(xs ys)
(cond
((= (len xs) 0) true)
((not (dl-at-contains? ys (first xs))) false)
(else (dl-at-subset? (rest xs) ys)))))
(define
dl-at-contains?
(fn
(xs target)
(cond
((= (len xs) 0) false)
((dl-at-deep=? (first xs) target) true)
(else (dl-at-contains? (rest xs) target)))))
(define
dl-at-test!
(fn
(name got expected)
(if
(dl-at-deep=? got expected)
(set! dl-at-pass (+ dl-at-pass 1))
(do
(set! dl-at-fail (+ dl-at-fail 1))
(append!
dl-at-failures
(str
name
"\n expected: " expected
"\n got: " got))))))
(define
dl-at-test-set!
(fn
(name got expected)
(if
(dl-at-set=? got expected)
(set! dl-at-pass (+ dl-at-pass 1))
(do
(set! dl-at-fail (+ dl-at-fail 1))
(append!
dl-at-failures
(str
name
"\n expected (set): " expected
"\n got: " got))))))
(define
dl-at-throws?
(fn
(thunk)
(let
((threw false))
(do
(guard
(e (#t (set! threw true)))
(thunk))
threw))))
(define
dl-at-run-all!
(fn
()
(do
;; count
(dl-at-test-set! "count siblings"
(dl-query
(dl-program
"parent(p, bob). parent(p, alice). parent(p, charlie).
sibling(X, Y) :- parent(P, X), parent(P, Y), !=(X, Y).
sib_count(N) :- count(N, S, sibling(bob, S)).")
(list (quote sib_count) (quote N)))
(list {:N 2}))
;; sum
(dl-at-test-set! "sum prices"
(dl-query
(dl-program
"price(apple, 5). price(pear, 7). price(plum, 3).
total(T) :- sum(T, X, price(F, X)).")
(list (quote total) (quote T)))
(list {:T 15}))
;; min
(dl-at-test-set! "min score"
(dl-query
(dl-program
"score(alice, 80). score(bob, 65). score(carol, 92).
lo(M) :- min(M, S, score(P, S)).")
(list (quote lo) (quote M)))
(list {:M 65}))
;; max
(dl-at-test-set! "max score"
(dl-query
(dl-program
"score(alice, 80). score(bob, 65). score(carol, 92).
hi(M) :- max(M, S, score(P, S)).")
(list (quote hi) (quote M)))
(list {:M 92}))
;; count over derived relation (stratification needed).
(dl-at-test-set! "count over derived"
(dl-query
(dl-program
"parent(a, b). parent(a, c). parent(b, d). parent(c, e).
ancestor(X, Y) :- parent(X, Y).
ancestor(X, Z) :- parent(X, Y), ancestor(Y, Z).
num_ancestors(N) :- count(N, X, ancestor(a, X)).")
(list (quote num_ancestors) (quote N)))
(list {:N 4}))
;; count with no matches → 0.
(dl-at-test-set! "count empty"
(dl-query
(dl-program
"p(1). p(2).
zero(N) :- count(N, X, q(X)).")
(list (quote zero) (quote N)))
(list {:N 0}))
;; sum with no matches → 0.
(dl-at-test-set! "sum empty"
(dl-query
(dl-program
"p(1). p(2).
total(T) :- sum(T, X, q(X)).")
(list (quote total) (quote T)))
(list {:T 0}))
;; min with no matches → rule does not fire.
(dl-at-test-set! "min empty"
(dl-query
(dl-program
"p(1). p(2).
lo(M) :- min(M, X, q(X)).")
(list (quote lo) (quote M)))
(list))
;; Aggregate with comparison filter on result.
(dl-at-test-set! "popularity threshold"
(dl-query
(dl-program
"post(p1). post(p2).
liked(u1, p1). liked(u2, p1). liked(u3, p1).
liked(u1, p2). liked(u2, p2).
popular(P) :- post(P), count(N, U, liked(U, P)), >=(N, 3).")
(list (quote popular) (quote P)))
(list {:P (quote p1)}))
;; Aggregate vs single distinct.
(dl-at-test-set! "distinct counted once"
(dl-query
(dl-program
"rated(alice, x). rated(alice, y). rated(bob, x).
rater_count(N) :- count(N, U, rated(U, F)).")
(list (quote rater_count) (quote N)))
(list {:N 2})))))
(define
dl-aggregates-tests-run!
(fn
()
(do
(set! dl-at-pass 0)
(set! dl-at-fail 0)
(set! dl-at-failures (list))
(dl-at-run-all!)
{:passed dl-at-pass
:failed dl-at-fail
:total (+ dl-at-pass dl-at-fail)
:failures dl-at-failures})))

View File

@@ -185,12 +185,27 @@ large graphs.
rejection, and a negation safety violation.
### Phase 8 — aggregation (Datalog+)
- [ ] `count(X, Goal)` → number of distinct X satisfying Goal
- [ ] `sum(X, Goal)` → sum of X values satisfying Goal
- [ ] `min(X, Goal)` / `max(X, Goal)` → min/max of X satisfying Goal
- [ ] `group-by` semantics: `count(X, sibling(bob, X))` → count of bob's siblings
- [ ] Aggregation breaks stratification — evaluate in a separate post-fixpoint pass
- [ ] Tests: social network statistics, grade aggregation, inventory sums
- [x] `(count R V Goal)`, `(sum R V Goal)`, `(min R V Goal)`,
`(max R V Goal)` — first arg is the result variable, second is the
aggregated variable, third is the goal literal. Live in
`lib/datalog/aggregates.sx`.
- [x] `dl-eval-aggregate`: runs `dl-find-bindings` on the goal under the
current subst (which provides outer-context bindings), collects
distinct values of the aggregated var, applies the aggregate.
`count`/`sum` produce 0 when no matches; `min`/`max` produce no
binding (rule fails) when empty.
- [x] Group-by emerges naturally: outer-context vars in the goal are
substituted from the current subst, so `popular(P) :- post(P),
count(N, U, liked(U, P)), >=(N, 3).` correctly counts per-post.
- [x] Stratification: `dl-aggregate-dep-edge` returns a negation-like
edge so the aggregate's goal relation is fully derived before the
aggregate fires. Non-monotonicity respected.
- [x] Safety: aggregate body lit binds the result var; goal-internal
vars are existentially quantified and don't need outer binding.
- [x] Tests in `lib/datalog/tests/aggregates.sx` (10): count siblings,
sum prices, min/max scores, count over derived relation,
empty-input cases for each operator, popularity threshold with
group-by, distinct-counted-once.
### Phase 9 — SX embedding API
- [ ] `(dl-program facts rules)` → database from SX data directly (no parsing required)
@@ -230,6 +245,19 @@ large graphs.
_Newest first._
- 2026-05-08 — Phase 8 done. New `lib/datalog/aggregates.sx` (~110
LOC): count / sum / min / max. Each is a body literal of shape
`(op R V Goal)` — `dl-eval-aggregate` runs `dl-find-bindings` on
the goal under the outer subst (so outer vars in the goal get
substituted, giving group-by-style aggregation), collects the
distinct values of `V`, and binds `R`. Empty input: count/sum
return 0; min/max produce no binding (rule fails). Stratifier
extended via `dl-aggregate-dep-edge` so the aggregate's goal
relation is fully derived before the aggregate fires. Safety check
treats goal-internal vars as existentials (no outer binding
required); only the result var becomes bound. Conformance now
134 / 134.
- 2026-05-08 — Phase 7 done (Phase 6 magic sets deferred — opt-in,
semi-naive default suffices for current test suite). New
`lib/datalog/strata.sx` (~290 LOC): dep graph build, Floyd-Warshall