; Follow-up — TF-IDF content ranking over :tags.
; Test helper shape: (feed-test name got expected)
;
; NOTE: every comment sits on its own line. `;` comments run to end of line,
; so a comment sharing a line with code silently disables that code.

; Shared fixture: four activities by the same actor.
;   o1 @10  cats funny
;   o2 @20  cats news
;   o3 @30  politics news
;   o4 @40  cats
(define corpus
  (feed/stream
    (list
      (feed/normalize {:actor "u" :object "o1" :at 10 :tags (list "cats" "funny")})
      (feed/normalize {:actor "u" :object "o2" :at 20 :tags (list "cats" "news")})
      (feed/normalize {:actor "u" :object "o3" :at 30 :tags (list "politics" "news")})
      (feed/normalize {:actor "u" :object "o4" :at 40 :tags (list "cats")}))))

; ---------- document frequency ----------
; Expected df = number of fixture activities carrying the tag.
(feed-test "df cats"
  (get (feed/tag-df corpus) "cats")
  3)
(feed-test "df news"
  (get (feed/tag-df corpus) "news")
  2)
(feed-test "df funny"
  (get (feed/tag-df corpus) "funny")
  1)
(feed-test "df politics"
  (get (feed/tag-df corpus) "politics")
  1)
(feed-test "df full"
  (feed/tag-df corpus)
  {:news 2 :funny 1 :politics 1 :cats 3})

; ---------- inverse document frequency ----------
; Expected idf = log(N / df) with N = 4 activities in the fixture.
(feed-test "idf news = log(4/2)"
  (get (feed/tag-idf corpus) "news")
  (log 2))
(feed-test "idf funny = log(4/1)"
  (get (feed/tag-idf corpus) "funny")
  (log 4))
; funny has df 1, cats has df 3, so funny must weigh more.
(feed-test "rarer tag has higher idf"
  (> (get (feed/tag-idf corpus) "funny")
     (get (feed/tag-idf corpus) "cats"))
  true)

; ---------- tf-idf scoring ----------
; idf table computed once and reused by the scoring tests below.
(define idf (feed/tag-idf corpus))

; The probe activity uses object "x" but carries the same tags as o1.
(feed-test "score query funny on o1"
  ((feed/tfidf-score idf (list "funny"))
   (feed/normalize {:actor "u" :object "x" :tags (list "cats" "funny")}))
  (log 4))
(feed-test "score query funny on non-match"
  ((feed/tfidf-score idf (list "funny"))
   (feed/normalize {:actor "u" :object "x" :tags (list "cats")}))
  0)
; "zzz" does not occur anywhere in the fixture.
(feed-test "unknown query tag scores 0"
  ((feed/tfidf-score idf (list "zzz"))
   (feed/normalize {:actor "u" :object "x" :tags (list "cats")}))
  0)

; ---------- ranking by relevance ----------
; query news: o2,o3 match (score log2), o1,o4 don't (0); ties break by :at desc
(feed-test "by-relevance news order"
  (map (fn (a) (get a :object))
       (feed/items (feed/by-relevance corpus (list "news"))))
  (list "o3" "o2" "o4" "o1"))

; query funny: only o1 matches -> ranks first
(feed-test "by-relevance funny first"
  (get (nth (feed/items (feed/by-relevance corpus (list "funny"))) 0) :object)
  "o1")

; query (cats news): o2 carries both tags -> highest combined tf-idf
(feed-test "by-relevance cats+news top"
  (get (nth (feed/items (feed/by-relevance corpus (list "cats" "news"))) 0) :object)
  "o2")

; Ranking reorders the stream; it must not drop non-matching activities.
(feed-test "by-relevance preserves count"
  (feed/count (feed/by-relevance corpus (list "cats")))
  4)