diff --git a/lib/gitea/conformance.sh b/lib/gitea/conformance.sh index ef31fa05..b33884da 100644 --- a/lib/gitea/conformance.sh +++ b/lib/gitea/conformance.sh @@ -23,6 +23,7 @@ VERBOSE="${1:-}" ISSUES_EXTRAS="lib/relations/schema.sx;lib/relations/engine.sx;lib/relations/api.sx;lib/smalltalk/tokenizer.sx;lib/smalltalk/parser.sx;lib/guest/reflective/class-chain.sx;lib/smalltalk/runtime.sx;lib/guest/reflective/env.sx;lib/smalltalk/eval.sx;lib/content/block.sx;lib/content/doc.sx;lib/content/render.sx;lib/content/api.sx;lib/content/meta.sx;lib/content/text.sx;lib/content/section.sx;lib/content/table.sx;lib/content/markdown.sx;lib/content/md-import.sx;lib/gitea/issues.sx" PR_EXTRAS="$ISSUES_EXTRAS;lib/guest/lex.sx;lib/guest/reflective/quoting.sx;lib/scheme/parser.sx;lib/scheme/eval.sx;lib/scheme/runtime.sx;lib/flow/spec.sx;lib/flow/store.sx;lib/flow/remote.sx;lib/flow/host.sx;lib/flow/api.sx;lib/gitea/pr.sx" +SEARCH_EXTRAS_TAIL="lib/haskell/tokenizer.sx;lib/haskell/layout.sx;lib/haskell/parser.sx;lib/haskell/desugar.sx;lib/haskell/runtime.sx;lib/haskell/match.sx;lib/haskell/eval.sx;lib/haskell/map.sx;lib/haskell/set.sx;lib/haskell/testlib.sx;lib/search/tokenize.sx;lib/search/index.sx;lib/search/query.sx;lib/search/parse.sx;lib/search/rank.sx;lib/search/rankq.sx;lib/search/testlib.sx;lib/gitea/search.sx" ACT_EXTRAS="$PR_EXTRAS;lib/apl/runtime.sx;lib/feed/normalize.sx;lib/feed/stream.sx;lib/feed/api.sx;lib/feed/fanout.sx;lib/feed/dedupe.sx;lib/feed/aggregate.sx;lib/feed/rank.sx;lib/feed/acl.sx;lib/feed/mute.sx;lib/feed/page.sx;lib/feed/notify.sx;lib/feed/home.sx;lib/feed/fed.sx;lib/events/notify.sx;lib/gitea/activity.sx" SUITES=( @@ -32,6 +33,8 @@ SUITES=( "issues|gitea-issues-pass|gitea-issues-fail|gitea-issues-fails|$ISSUES_EXTRAS" "pr|gitea-pr-pass|gitea-pr-fail|gitea-pr-fails|$PR_EXTRAS" "activity|gitea-act-pass|gitea-act-fail|gitea-act-fails|$ACT_EXTRAS" + "search|gitea-search-pass|gitea-search-fail|gitea-search-fails|$PR_EXTRAS;$SEARCH_EXTRAS_TAIL" + "fed|gitea-fed-pass|gitea-fed-fail|gitea-fed-fails|$ACT_EXTRAS;lib/gitea/fed.sx" ) OUT_JSON="lib/gitea/scoreboard.json" @@ -102,7 +105,7 @@ run_suite() { } > "$TMP" local OUTPUT - OUTPUT=$(timeout 600 "$SX_SERVER" < "$TMP" 2>/dev/null) + OUTPUT=$(timeout 900 "$SX_SERVER" < "$TMP" 2>/dev/null) rm -f "$TMP" local LINE diff --git a/lib/gitea/search.sx b/lib/gitea/search.sx new file mode 100644 index 00000000..62a7e28d --- /dev/null +++ b/lib/gitea/search.sx @@ -0,0 +1,383 @@ +; lib/gitea/search.sx — sx-gitea Phase 7: code + issue/PR search. +; +; search-on-sx (haskell-on-sx) does the heavy lifting: the forge builds a +; document corpus SX-side — code files from the default branch head +; (path + blob text), issues (title + body + comment bodies), PRs (title +; + body + review bodies) — embeds it as one Haskell program, and asks +; searchRankTfIdf for ranked doc ids. Queries speak the search query +; language (terms, AND/OR/NOT, "phrases"). +; +; Cost model: ONE evaluation parses the Haskell search layers (~seconds), +; extra queries are nearly free. So the core primitive is +; gitea/search-multi — ANY number of corpora and queries in a single +; evaluation (each corpus becomes an idxN binding) — and everything else +; sugars over it. Only the six layers searchRankTfIdf needs are compiled +; (tokenize/index/query/parse/rank/rankq), not the full search/src. +; +; Requires: lib/gitea/{repo,access,web,issues,pr}.sx and their stacks, +; lib/haskell/* + lib/search/{tokenize,index,query,parse,rank,rankq, +; testlib}.sx (search-hk->sx, hk-core, hk-eval-program). + +; ── corpus ─────────────────────────────────────────────────────────── + +(define + gitea/code-docs + (fn + (forge owner name) + (let + ((rec (gitea/repo-get forge owner name))) + (if + (nil? rec) + (list) + (let + ((grepo (gitea/repo-git forge owner name))) + (let + ((head-cid (git/branch-get grepo (get rec :default-branch)))) + (if + (nil? head-cid) + (list) + (let + ((files (git/commit-files grepo head-cid))) + (map + (fn (p) {:text (str p " " (get files p)) :kind "code" :ref p}) + (artdag/sort-strings (keys files))))))))))) + +(define + gitea/issue-docs + (fn + (forge owner name) + (map (fn (r) {:text (str (get r :title) " " (get r :body) " " (join " " (map (fn (c) (get c :body)) (get r :comments)))) :kind "issue" :ref (str (get r :number))}) (gitea/issue-records forge owner name)))) + +(define + gitea/pr-search-docs + (fn + (forge owner name) + (map (fn (r) {:text (str (get r :title) " " (get r :body) " " (join " " (map (fn (v) (get v :body)) (get r :reviews)))) :kind "pr" :ref (str (get r :number))}) (gitea/pr-records forge owner name)))) + +(define gitea/search-kinds-all (list "code" "issue" "pr")) + +(define + gitea/repo-docs + (fn + (forge owner name kinds) + (filter + (fn (d) (contains? kinds (get d :kind))) + (concat + (gitea/code-docs forge owner name) + (concat + (gitea/issue-docs forge owner name) + (gitea/pr-search-docs forge owner name)))))) + +(define + gitea/visible-docs + (fn + (forge user kinds) + (reduce + (fn + (acc full) + (let + ((p (gitea/split-full full))) + (concat + acc + (map + (fn (d) (assoc d :repo full)) + (gitea/repo-docs forge (get p :owner) (get p :name) kinds))))) + (list) + (gitea/visible-repos forge user)))) + +; ── haskell program assembly ───────────────────────────────────────── + +; only the layers searchRankTfIdf needs — parsing is the dominant cost +(define + gitea/search-src + (str + search/tokenize-src + "\n" + search/index-src + "\n" + search/query-src + "\n" + search/parse-src + "\n" + search/rank-src + "\n" + search/rankq-src + "\n")) + +(define + gitea/hk-escape + (fn + (s) + (replace + (replace (replace (replace (or s "") "\\" "\\\\") "\"" "\\\"") "\n" "\\n") + "\t" + "\\t"))) + +; = indexDoc k "text_k" ( ... (indexDoc 1 "text_1" emptyIndex)) +(define + gitea/index-binding + (fn + (idxname docs) + (str + idxname + " = " + (reduce + (fn + (acc pair) + (str + "indexDoc " + (first pair) + " \"" + (gitea/hk-escape (get (nth pair 1) :text)) + "\" (" + acc + ")")) + "emptyIndex" + (map-indexed (fn (i d) (list (+ i 1) d)) docs)) + "\n"))) + +(define + gitea/hk-search-eval + (fn + (extra) + (search-hk->sx + (hk-deep-force + (get + (hk-eval-program (hk-core (str gitea/search-src extra))) + "result"))))) + +; THE core: corpora = (docs ...), specs = ({:corpus i :query q :n n} ...). +; One haskell evaluation; => one ranked {:kind :ref (:repo)} list per spec. +(define + gitea/search-multi + (fn + (corpora specs) + (let + ((bindings (join "" (map-indexed (fn (i docs) (gitea/index-binding (str "idx" i) docs)) corpora))) + (exprs + (map + (fn + (s) + (if + (empty? (nth corpora (get s :corpus))) + "[]" + (str + "take " + (get s :n) + " (searchRankTfIdf \"" + (gitea/hk-escape (get s :query)) + "\" idx" + (get s :corpus) + ")"))) + specs))) + (let + ((id-lists (gitea/hk-search-eval (str bindings "result = [" (join ", " exprs) "]\n")))) + (map-indexed + (fn + (si ids) + (let + ((docs (nth corpora (get (nth specs si) :corpus)))) + (map + (fn + (i) + (let + ((d (nth docs (- i 1)))) + (if (get d :repo) {:repo (get d :repo) :kind (get d :kind) :ref (get d :ref)} {:kind (get d :kind) :ref (get d :ref)}))) + ids))) + id-lists))))) + +; many queries against ONE corpus, still one evaluation +(define + gitea/search-many + (fn + (docs queries n) + (if + (empty? docs) + (map (fn (q) (list)) queries) + (gitea/search-multi (list docs) (map (fn (q) {:n n :query q :corpus 0}) queries))))) + +; ── repo-scoped search ─────────────────────────────────────────────── + +(define + gitea/search-repo + (fn + (forge owner name query kinds n) + (first + (gitea/search-many + (gitea/repo-docs forge owner name kinds) + (list query) + n)))) + +(define + gitea/search-code + (fn + (forge owner name query n) + (gitea/search-repo forge owner name query (list "code") n))) + +(define + gitea/search-issues + (fn + (forge owner name query n) + (gitea/search-repo forge owner name query (list "issue") n))) + +(define + gitea/search-prs + (fn + (forge owner name query n) + (gitea/search-repo forge owner name query (list "pr") n))) + +; ── global search over visible repos ───────────────────────────────── + +(define + gitea/search-visible + (fn + (forge user query kinds n) + (first + (gitea/search-many + (gitea/visible-docs forge user kinds) + (list query) + n)))) + +; ── web ────────────────────────────────────────────────────────────── + +(define gitea/w-query-param (fn (req name) (get (get req :query) name))) + +(define + gitea/w-search-kinds + (fn + (req) + (let + ((k (gitea/w-query-param req "kind"))) + (if + (contains? gitea/search-kinds-all k) + (list k) + gitea/search-kinds-all)))) + +(define + gitea/w-search-item + (fn + (owner name r) + (let + ((kind (get r :kind)) (ref (get r :ref))) + (cond + ((= kind "code") + (str + "
  • " + (dream-escape ref) + "
  • ")) + ((= kind "issue") + (str + "
  • #" + ref + "
  • ")) + (else + (str + "
  • #" + ref + "
  • ")))))) + +(define + gitea/w-search-page + (fn + (forge req) + (let + ((owner (dream-param req "owner")) + (name (dream-param req "name")) + (q (gitea/w-query-param req "q"))) + (cond + ((not (gitea/w-readable? forge req owner name)) (dream-not-found)) + ((or (nil? q) (= q "")) + (gitea/w-json-status 400 {:error "missing-query"})) + (else + (gitea/w-page + (str "search: " q) + (str + "

    Search

    " + (dream-escape q) + "

    "))))))) + +(define + gitea/w-api-search-repo + (fn + (forge req) + (let + ((owner (dream-param req "owner")) + (name (dream-param req "name")) + (q (gitea/w-query-param req "q"))) + (cond + ((not (gitea/w-readable? forge req owner name)) (dream-not-found)) + ((or (nil? q) (= q "")) + (gitea/w-json-status 400 {:error "missing-query"})) + (else + (dream-json-value + (gitea/search-repo + forge + owner + name + q + (gitea/w-search-kinds req) + 20))))))) + +(define + gitea/w-api-search + (fn + (forge req) + (let + ((q (gitea/w-query-param req "q"))) + (if + (or (nil? q) (= q "")) + (gitea/w-json-status 400 {:error "missing-query"}) + (dream-json-value + (gitea/search-visible + forge + (gitea/w-user forge req) + q + (gitea/w-search-kinds req) + 20)))))) + +(define + gitea/search-routes + (fn + (forge) + (list + (dream-get + "/:owner/:name/search" + (fn (req) (gitea/w-search-page forge req))) + (dream-get + "/api/repos/:owner/:name/search" + (fn (req) (gitea/w-api-search-repo forge req))) + (dream-get "/api/search" (fn (req) (gitea/w-api-search forge req)))))) + +(set! + gitea/route-packs + (append gitea/route-packs (list gitea/search-routes))) diff --git a/lib/gitea/tests/search.sx b/lib/gitea/tests/search.sx new file mode 100644 index 00000000..0d1bca61 --- /dev/null +++ b/lib/gitea/tests/search.sx @@ -0,0 +1,266 @@ +; lib/gitea/tests/search.sx — Phase 7: code + issue/PR search over +; search-on-sx. All SX-level queries run as ONE haskell evaluation +; (gitea/search-multi over five corpora); the web tests add one +; evaluation per request. + +(st-bootstrap-classes!) +(content/bootstrap!) +(content-bootstrap-markdown!) +(content-bootstrap-table!) + +(define gitea-search-pass 0) +(define gitea-search-fail 0) +(define gitea-search-fails (list)) + +(define + gitea-search-test + (fn + (name actual expected) + (if + (= actual expected) + (set! gitea-search-pass (+ gitea-search-pass 1)) + (begin + (set! gitea-search-fail (+ gitea-search-fail 1)) + (set! + gitea-search-fails + (append gitea-search-fails (list {:name name :expected (inspect expected) :actual (inspect actual)}))))))) + +; ── escaping ───────────────────────────────────────────────────────── + +(gitea-search-test "hk-escape quotes" (gitea/hk-escape "a\"b") "a\\\"b") +(gitea-search-test "hk-escape newline" (gitea/hk-escape "a\nb") "a\\nb") +(gitea-search-test "hk-escape backslash" (gitea/hk-escape "a\\b") "a\\\\b") +(gitea-search-test "hk-escape nil" (gitea/hk-escape nil) "") + +; ── setup ──────────────────────────────────────────────────────────── + +(define gs-db (persist/mem-backend)) +(define gs-forge (gitea/forge gs-db)) +(gitea/user-create! gs-forge "alice") +(gitea/user-create! gs-forge "bob") +(gitea/token-create! gs-forge "alice" "tok-a") +(gitea/repo-create! gs-forge "alice" "proj" {}) +(gitea/repo-create! gs-forge "alice" "sec" {:visibility "private"}) +(gitea/repo-create! gs-forge "alice" "empty" {}) + +(define gs-g (gitea/repo-git gs-forge "alice" "proj")) +(git/add! gs-g "README.md" "SX forge documentation overview") +(git/add! gs-g "src/main.sx" "(define forge-main entry) banana runtime") +(git/add! gs-g "docs/guide.md" "how to search the forge") +(git/commit! gs-g {:message "c1" :time 1 :author "alice"}) +(git/branch! gs-g "feat") +(git/checkout! gs-g "feat") +(git/add! gs-g "src/extra.sx" "cherry helpers") +(git/commit! gs-g {:message "c2" :time 2 :author "alice"}) +(git/checkout! gs-g "main") + +(define gs-gs (gitea/repo-git gs-forge "alice" "sec")) +(git/add! gs-gs "secret.txt" "banana secret stash") +(git/commit! gs-gs {:message "s1" :time 3 :author "alice"}) + +(gitea/issue-create! + gs-forge + "alice" + "proj" + "alice" + "Crash in search" + "the search crashes on banana input" + {:created-at 10}) +(gitea/issue-comment! + gs-forge + "alice" + "proj" + 1 + "bob" + "reproduced with cherry too" + {:at 11}) +(gitea/issue-create! + gs-forge + "alice" + "proj" + "alice" + "Docs update" + "documentation needs love" + {:created-at 12}) +(gitea/pr-create! + gs-forge + "alice" + "proj" + "bob" + "Improve search ranking" + "feat" + "main" + "tfidf ranking for results" + {}) + +; ── corpus construction ────────────────────────────────────────────── + +(define gs-corp-code (gitea/repo-docs gs-forge "alice" "proj" (list "code"))) +(define + gs-corp-ip + (gitea/repo-docs gs-forge "alice" "proj" (list "issue" "pr"))) +(define + gs-corp-all + (gitea/repo-docs gs-forge "alice" "proj" gitea/search-kinds-all)) +(define gs-corp-anon (gitea/visible-docs gs-forge nil gitea/search-kinds-all)) +(define + gs-corp-own + (gitea/visible-docs gs-forge "alice" gitea/search-kinds-all)) + +(gitea-search-test "code docs" (len gs-corp-code) 3) +(gitea-search-test "issue+pr docs" (len gs-corp-ip) 3) +(gitea-search-test "all docs" (len gs-corp-all) 6) +(gitea-search-test + "empty repo corpus" + (gitea/code-docs gs-forge "alice" "empty") + (list)) +(gitea-search-test + "anon corpus excludes private" + (len (filter (fn (d) (= (get d :repo) "alice/sec")) gs-corp-anon)) + 0) +(gitea-search-test + "owner corpus includes private" + (len (filter (fn (d) (= (get d :repo) "alice/sec")) gs-corp-own)) + 1) + +; ── ONE evaluation, thirteen queries over five corpora ─────────────── + +(define + gs-R + (gitea/search-multi + (list gs-corp-code gs-corp-ip gs-corp-all gs-corp-anon gs-corp-own) + (list + {:n 20 :query "banana" :corpus 0} + {:n 20 :query "forge" :corpus 0} + {:n 20 :query "banana OR cherry" :corpus 0} + {:n 20 :query "nosuchterm" :corpus 0} + {:n 20 :query "banana" :corpus 1} + {:n 20 :query "cherry" :corpus 1} + {:n 20 :query "ranking" :corpus 1} + {:n 20 :query "search AND banana" :corpus 1} + {:n 20 :query "documentation" :corpus 1} + {:n 20 :query "search" :corpus 2} + {:n 2 :query "search" :corpus 2} + {:n 20 :query "banana" :corpus 3} + {:n 20 :query "banana" :corpus 4}))) + +(gitea-search-test + "code: unique term" + (nth gs-R 0) + (list {:kind "code" :ref "src/main.sx"})) +(gitea-search-test + "code: common term hits all three" + (len (nth gs-R 1)) + 3) +(gitea-search-test "code: OR query" (nth gs-R 2) (list {:kind "code" :ref "src/main.sx"})) +(gitea-search-test "code: no hits" (nth gs-R 3) (list)) + +(gitea-search-test + "issues: body term" + (nth gs-R 4) + (list {:kind "issue" :ref "1"})) +(gitea-search-test + "issues: comment text indexed" + (nth gs-R 5) + (list {:kind "issue" :ref "1"})) +(gitea-search-test "prs: body term" (nth gs-R 6) (list {:kind "pr" :ref "3"})) +(gitea-search-test + "issues: AND query" + (nth gs-R 7) + (list {:kind "issue" :ref "1"})) +(gitea-search-test + "issues: title+body of second" + (len (filter (fn (r) (= r {:kind "issue" :ref "2"})) (nth gs-R 8))) + 1) + +(gitea-search-test + "mixed kinds found" + (len (nth gs-R 9)) + 3) +(gitea-search-test + "take-n limits results" + (len (nth gs-R 10)) + 2) + +(gitea-search-test + "global anon excludes private" + (len + (filter (fn (r) (= (get r :repo) "alice/sec")) (nth gs-R 11))) + 0) +(gitea-search-test + "global anon finds public code+issue" + (len (nth gs-R 11)) + 2) +(gitea-search-test + "global owner sees private" + (len + (filter (fn (r) (= (get r :repo) "alice/sec")) (nth gs-R 12))) + 1) +(gitea-search-test + "global owner total" + (len (nth gs-R 12)) + 3) + +; empty corpora short-circuit without an evaluation +(gitea-search-test + "empty corpus searches empty" + (gitea/search-repo + gs-forge + "alice" + "empty" + "banana" + gitea/search-kinds-all + 10) + (list)) + +; ── web (one haskell evaluation per request) ───────────────────────── + +(define gs-app (gitea/app gs-forge)) +(define gs-hdr (fn (tok) (if (nil? tok) {} {:authorization (str "Bearer " tok)}))) +(define + gs-get + (fn (target tok) (gs-app (dream-request "GET" target (gs-hdr tok) "")))) + +(define gs-page (gs-get "/alice/proj/search?q=banana" nil)) +(gitea-search-test "search page 200" (dream-status gs-page) 200) +(gitea-search-test + "search page links the hit" + (contains? (dream-resp-body gs-page) "src/main.sx") + true) + +(gitea-search-test + "search page kind filter" + (contains? + (dream-resp-body (gs-get "/alice/proj/search?q=banana&kind=issue" nil)) + "issues/1") + true) +(gitea-search-test + "search page missing q 400" + (dream-status (gs-get "/alice/proj/search" nil)) + 400) +(gitea-search-test + "private search anon 404" + (dream-status (gs-get "/alice/sec/search?q=banana" nil)) + 404) + +(gitea-search-test + "api repo search" + (dream-json-parse + (dream-resp-body + (gs-get "/api/repos/alice/proj/search?q=cherry&kind=issue" nil))) + (list {:kind "issue" :ref "1"})) +(gitea-search-test + "api repo search missing q 400" + (dream-status (gs-get "/api/repos/alice/proj/search" nil)) + 400) + +(define gs-glob-anon (gs-get "/api/search?q=banana" nil)) +(gitea-search-test + "api global anon count" + (len (dream-json-parse (dream-resp-body gs-glob-anon))) + 2) +(define gs-glob-own (gs-get "/api/search?q=banana" "tok-a")) +(gitea-search-test + "api global authed count" + (len (dream-json-parse (dream-resp-body gs-glob-own))) + 3)