feed: TF-IDF content ranking over :tags — tag-df/idf, tfidf-score, by-relevance + 15 tests
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 43s
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 43s
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -13,7 +13,7 @@ if [ ! -x "$SX_SERVER" ]; then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
SUITES=(basic fanout rank integration)
|
SUITES=(basic fanout rank integration content)
|
||||||
|
|
||||||
OUT_JSON="lib/feed/scoreboard.json"
|
OUT_JSON="lib/feed/scoreboard.json"
|
||||||
OUT_MD="lib/feed/scoreboard.md"
|
OUT_MD="lib/feed/scoreboard.md"
|
||||||
@@ -37,6 +37,7 @@ run_suite() {
|
|||||||
(load "lib/feed/rank.sx")
|
(load "lib/feed/rank.sx")
|
||||||
(load "lib/feed/acl.sx")
|
(load "lib/feed/acl.sx")
|
||||||
(load "lib/feed/fed.sx")
|
(load "lib/feed/fed.sx")
|
||||||
|
(load "lib/feed/content.sx")
|
||||||
(epoch 2)
|
(epoch 2)
|
||||||
(eval "(define feed-test-pass 0)")
|
(eval "(define feed-test-pass 0)")
|
||||||
(eval "(define feed-test-fail 0)")
|
(eval "(define feed-test-fail 0)")
|
||||||
|
|||||||
68
lib/feed/content.sx
Normal file
68
lib/feed/content.sx
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
; feed/content — TF-IDF relevance over activity :tags. Rare tags carry more
|
||||||
|
; signal, so an activity matching an uncommon tag ranks above one matching a
|
||||||
|
; common tag. Composes with rank.sx: feed/tfidf-score is just another scorer.
|
||||||
|
;
|
||||||
|
; Requires: lib/feed/normalize.sx, lib/feed/stream.sx, lib/feed/fanout.sx
|
||||||
|
; (feed/-distinct), lib/feed/rank.sx (feed/rank).
|
||||||
|
|
||||||
|
; feed/tag-df — document frequency over a stream.
; Returns a dict: tag -> number of activities in `stream` whose :tags contain
; that tag. Each activity's tags go through feed/-distinct before counting, so
; a tag repeated inside one activity adds 1 to its df, not its repeat count.
(define feed/tag-df
  (fn (stream)
    (let ((bump (fn (counts tag) (assoc counts tag (+ (get counts tag 0) 1)))))
      (reduce
        ; fold one activity's de-duplicated tags into the running counts
        (fn (counts activity)
          (reduce bump counts (feed/-distinct (get activity :tags))))
        {}
        (feed/items stream)))))
|
||||||
|
|
||||||
|
; feed/tag-idf — inverse document frequency over a stream.
; Returns a dict: tag -> log(N / df), where N is (feed/count stream) and df is
; the tag's entry in (feed/tag-df stream). Only tags that appear in df are
; emitted, and every df entry is at least 1, so the division never sees 0.
; NOTE(review): assumes `/` yields a fractional result for non-divisible
; integers (e.g. 4/3) — confirm for this dialect.
(define feed/tag-idf
  (fn (stream)
    (let ((total (feed/count stream))
          (doc-freq (feed/tag-df stream)))
      (reduce
        (fn (acc tag)
          (assoc acc tag (log (/ total (get doc-freq tag)))))
        {}
        (keys doc-freq)))))
|
||||||
|
|
||||||
|
; feed/-tf — term frequency for a single activity.
; Returns a dict: tag -> number of times the tag occurs in the activity's
; :tags. Unlike document frequency, repeats within the activity are counted.
(define feed/-tf
  (fn (activity)
    (reduce
      (fn (counts tag)
        ; a tag not seen yet starts from 0
        (assoc counts tag (+ (get counts tag 0) 1)))
      {}
      (get activity :tags))))
|
||||||
|
|
||||||
|
; feed/tfidf-score — build a scorer from a precomputed idf dict and a query.
;   idf   : dict tag -> idf weight (e.g. the result of feed/tag-idf)
;   query : list of tags
; Returns a one-argument fn over an activity whose value is the sum, over the
; query tags, of tf(tag in activity) * idf(tag). A tag missing from either the
; activity or the idf dict contributes 0. A tag listed twice in `query` is
; summed twice.
(define feed/tfidf-score
  (fn (idf query)
    (fn (activity)
      (let ((tf (feed/-tf activity)))
        (let ((weight (fn (tag) (* (get tf tag 0) (get idf tag 0)))))
          (reduce
            (fn (total tag) (+ total (weight tag)))
            0
            query))))))
|
||||||
|
|
||||||
|
; feed/by-relevance — rank `stream` by TF-IDF relevance to the `query` tags.
; The idf weights are computed from the same stream that is being ranked, then
; the resulting scorer is handed to feed/rank.
(define feed/by-relevance
  (fn (stream query)
    (let ((scorer (feed/tfidf-score (feed/tag-idf stream) query)))
      (feed/rank stream scorer))))
|
||||||
@@ -3,9 +3,10 @@
|
|||||||
"basic": {"pass": 30, "fail": 0},
|
"basic": {"pass": 30, "fail": 0},
|
||||||
"fanout": {"pass": 29, "fail": 0},
|
"fanout": {"pass": 29, "fail": 0},
|
||||||
"rank": {"pass": 24, "fail": 0},
|
"rank": {"pass": 24, "fail": 0},
|
||||||
"integration": {"pass": 22, "fail": 0}
|
"integration": {"pass": 22, "fail": 0},
|
||||||
|
"content": {"pass": 15, "fail": 0}
|
||||||
},
|
},
|
||||||
"total_pass": 105,
|
"total_pass": 120,
|
||||||
"total_fail": 0,
|
"total_fail": 0,
|
||||||
"total": 105
|
"total": 120
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,4 +8,5 @@ _Generated by `lib/feed/conformance.sh`_
|
|||||||
| fanout | 29 | 0 | 29 |
|
| fanout | 29 | 0 | 29 |
|
||||||
| rank | 24 | 0 | 24 |
|
| rank | 24 | 0 | 24 |
|
||||||
| integration | 22 | 0 | 22 |
|
| integration | 22 | 0 | 22 |
|
||||||
| **Total** | **105** | **0** | **105** |
|
| content | 15 | 0 | 15 |
|
||||||
|
| **Total** | **120** | **0** | **120** |
|
||||||
|
|||||||
85
lib/feed/tests/content.sx
Normal file
85
lib/feed/tests/content.sx
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
; Follow-up — TF-IDF content ranking over :tags. Each case is written as (feed-test name got expected).
|
||||||
|
|
||||||
|
; Shared fixture: four activities from actor "u" with increasing :at.
; Tag spread: "cats" in o1/o2/o4, "news" in o2/o3, "funny" only in o1,
; "politics" only in o3.
(define corpus
  (feed/stream
    (list
      (feed/normalize {:actor "u" :object "o1" :at 10 :tags (list "cats" "funny")})
      (feed/normalize {:actor "u" :object "o2" :at 20 :tags (list "cats" "news")})
      (feed/normalize {:actor "u" :object "o3" :at 30 :tags (list "politics" "news")})
      (feed/normalize {:actor "u" :object "o4" :at 40 :tags (list "cats")}))))
|
||||||
|
|
||||||
|
; ---------- document frequency ----------
|
||||||
|
|
||||||
|
; Per-tag document frequency, looked up by string key.
(feed-test "df cats" (get (feed/tag-df corpus) "cats") 3)
(feed-test "df news" (get (feed/tag-df corpus) "news") 2)
(feed-test "df funny" (get (feed/tag-df corpus) "funny") 1)
(feed-test "df politics" (get (feed/tag-df corpus) "politics") 1)
; Whole df dict at once.
; NOTE(review): the expected dict uses keyword keys while the lookups above use
; string keys; presumably both name the same dict entry in this dialect — confirm.
(feed-test
  "df full"
  (feed/tag-df corpus)
  {:news 2 :funny 1 :politics 1 :cats 3})
|
||||||
|
|
||||||
|
; ---------- inverse document frequency ----------
|
||||||
|
|
||||||
|
; N = 4 activities; "news" has df 2 and "funny" has df 1.
; NOTE(review): these compare log results for exact equality, which assumes
; (/ 4 2) and (/ 4 1) evaluate to exactly 2 and 4 — confirm.
(feed-test
  "idf news = log(4/2)"
  (get (feed/tag-idf corpus) "news")
  (log 2))
(feed-test
  "idf funny = log(4/1)"
  (get (feed/tag-idf corpus) "funny")
  (log 4))
; "funny" (df 1) must outweigh "cats" (df 3).
(feed-test
  "rarer tag has higher idf"
  (> (get (feed/tag-idf corpus) "funny")
     (get (feed/tag-idf corpus) "cats"))
  true)
|
||||||
|
|
||||||
|
; ---------- tf-idf scoring ----------
|
||||||
|
|
||||||
|
; idf weights of the fixture corpus, reused by the scorer tests below.
(define idf (feed/tag-idf corpus))

; The activity scored here is a stand-in (:object "x") carrying the same tags
; as o1; tf("funny") = 1, so the score is idf("funny") = log 4.
(feed-test
  "score query funny on o1"
  ((feed/tfidf-score idf (list "funny"))
   (feed/normalize {:actor "u" :object "x" :tags (list "cats" "funny")}))
  (log 4))
; The activity lacks the query tag -> tf is 0 -> score 0.
(feed-test
  "score query funny on non-match"
  ((feed/tfidf-score idf (list "funny"))
   (feed/normalize {:actor "u" :object "x" :tags (list "cats")}))
  0)
; The query tag is absent from both the idf dict and the activity -> score 0.
(feed-test
  "unknown query tag scores 0"
  ((feed/tfidf-score idf (list "zzz"))
   (feed/normalize {:actor "u" :object "x" :tags (list "cats")}))
  0)
|
||||||
|
|
||||||
|
; ---------- ranking by relevance ----------
|
||||||
|
|
||||||
|
; Query "news": o2 and o3 match (each scores log 2), o1 and o4 score 0.
; Equal scores are expected to fall back to :at descending.
(feed-test
  "by-relevance news order"
  (map
    (fn (item) (get item :object))
    (feed/items (feed/by-relevance corpus (list "news"))))
  (list "o3" "o2" "o4" "o1"))

; Query "funny": o1 is the only match, so it must come first.
(feed-test
  "by-relevance funny first"
  (get (nth (feed/items (feed/by-relevance corpus (list "funny"))) 0) :object)
  "o1")

; Query "cats" + "news": o2 is the only activity with both tags, giving it the
; highest combined tf-idf.
(feed-test
  "by-relevance cats+news top"
  (get
    (nth (feed/items (feed/by-relevance corpus (list "cats" "news"))) 0)
    :object)
  "o2")

; Ranking reorders the stream; it must not drop or add activities.
(feed-test
  "by-relevance preserves count"
  (feed/count (feed/by-relevance corpus (list "cats")))
  4)
|
||||||
@@ -14,7 +14,7 @@ APL, ACL visibility filtering via `lib/acl/`, federation via fed-sx.
|
|||||||
|
|
||||||
## Status (rolling)
|
## Status (rolling)
|
||||||
|
|
||||||
`bash lib/feed/conformance.sh` → **105/105** (Phases 1–4 complete)
|
`bash lib/feed/conformance.sh` → **120/120** (Phases 1–4 + TF-IDF complete)
|
||||||
|
|
||||||
## Ground rules
|
## Ground rules
|
||||||
|
|
||||||
@@ -138,7 +138,9 @@ are function parameters. Real acl-sx / fed-sx wire in at the call site unchanged
|
|||||||
|
|
||||||
- Wire real acl-sx once `lib/acl/` exists (swap injected `permit?`).
|
- Wire real acl-sx once `lib/acl/` exists (swap injected `permit?`).
|
||||||
- Wire real fed-sx transport (swap `send-fn`/`fetch-fn`).
|
- Wire real fed-sx transport (swap `send-fn`/`fetch-fn`).
|
||||||
- TF-IDF over `:tags` for content ranking (sketch mentions it; not yet built).
|
- [x] TF-IDF over `:tags` for content ranking — `content.sx`: `feed/tag-df`,
|
||||||
|
`feed/tag-idf` (log N/df), `feed/tfidf-score`, `feed/by-relevance`; 15 tests.
|
||||||
|
Composes as a scorer with rank.sx. (120/120 total.)
|
||||||
- Notification feed (verb-filtered, per-recipient) as a thin layer over fanout.
|
- Notification feed (verb-filtered, per-recipient) as a thin layer over fanout.
|
||||||
|
|
||||||
(none)
|
(none)
|
||||||
|
|||||||
Reference in New Issue
Block a user