diff --git a/plans/jit-cache-architecture.md b/plans/jit-cache-architecture.md
new file mode 100644
index 00000000..09900550
--- /dev/null
+++ b/plans/jit-cache-architecture.md
@@ -0,0 +1,223 @@
+# JIT Cache Architecture — Tiered + LRU + Reset API
+
+## Problem statement
+
+The OCaml WASM kernel JIT-compiles every lambda body on first call and caches
+the resulting `vm_closure` in a mutable slot on the lambda itself
+(`Lambda.l_compiled`, `Component.c_compiled`, `Island.i_compiled`). Cache
+growth is unbounded — there is no eviction, no threshold, no reset.
+
+**Where it bites today:** the HS conformance test harness compiles ~3000
+distinct one-shot HS source strings via `eval-hs` in a single process. Each
+compilation creates a fresh lambda → fresh `vm_closure`. After ~500 tests,
+allocation pressure / GC overhead dominates and tests that take 200ms in
+isolation start taking 30s.
+
+**Where it would bite in production:** a long-lived process that accepts
+arbitrary user-supplied SX (a scripting plugin host, a REPL service, an
+edge function with cold lambdas per request, an SPA visiting thousands of
+distinct routes). Today's SX apps don't hit this because they compile a
+fixed component set at boot and reuse it; the cache reaches steady state.
+
+## Architecture
+
+Three coordinated mechanisms, deployed in order:
+
+### 1. Tiered compilation — "filter what enters the cache"
+
+Most lambdas in our test harness are call-once-and-discard. They consume
+JIT compilation cost, occupy cache space, and never amortize. Solution:
+don't JIT until a lambda has been called K times.
+
+**OCaml changes:**
+
+```ocaml
+(* sx_types.ml *)
+type lambda = {
+  ...
+  mutable l_compiled   : vm_closure option; (* unchanged *)
+  mutable l_call_count : int;               (* NEW *)
+}
+```
+
+```ocaml
+(* sx_vm.ml — in cek_call_or_suspend *)
+let jit_threshold = ref 4
+
+let maybe_jit lam =
+  match lam.l_compiled with
+  | Some _ -> () (* already compiled *)
+  | None ->
+    lam.l_call_count <- lam.l_call_count + 1;
+    if lam.l_call_count >= !jit_threshold then
+      (* jit_compile_ref and globals are provided by sx_vm *)
+      lam.l_compiled <- Some (!jit_compile_ref lam globals)
+```
+
+**Tunable via primitive:** `(jit-set-threshold! N)` (default 4; 1 restores
+the old compile-on-first-call behavior; a very large N effectively disables
+the JIT).
+
+**Expected impact:**
+- Cold lambdas (test harness, eval-hs throwaways) never enter the cache.
+- Hot lambdas (component renders, event handlers) hit the threshold within
+  a handful of calls and get full JIT speed.
+- Eliminates the test-harness pathology entirely without touching cache size.
+
+### 2. LRU eviction — "bound memory regardless of input"
+
+Even with tiered compilation, a long-lived process eventually compiles
+enough hot lambdas to exceed the memory budget. Pure LRU eviction with a
+fixed budget gives a predictable ceiling.
+
+**OCaml changes:**
+
+```ocaml
+(* sx_jit_cache.ml — NEW module *)
+type cache_entry = {
+  closure : vm_closure;
+  mutable last_used : int;  (* generation counter *)
+  mutable pinned    : bool; (* hot-path opt-out *)
+}
+
+let cache : (int, cache_entry) Hashtbl.t = Hashtbl.create 256
+let cache_budget = ref 5000 (* lambdas, not bytes — easy to reason about *)
+let generation = ref 0
+
+let lookup lambda_id = ...
+let insert lambda_id closure =
+  incr generation;
+  Hashtbl.replace cache lambda_id
+    { closure; last_used = !generation; pinned = false };
+  if Hashtbl.length cache > !cache_budget then evict_oldest ()
+let pin lambda_id = ...
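+
+(* Assumed sketch, not final code: the eviction step referenced by
+   insert above (it would sit before insert in the real module). Note
+   that lookup must refresh last_used on every hit for the LRU order to
+   mean anything. A linear scan is O(n) per eviction, acceptable while
+   n <= budget; swap in an intrusive list if profiles say otherwise. *)
+let evict_oldest () =
+  let victim = ref None in
+  Hashtbl.iter
+    (fun id e ->
+      if not e.pinned then
+        match !victim with
+        | Some (_, v) when v.last_used <= e.last_used -> ()
+        | _ -> victim := Some (id, e))
+    cache;
+  match !victim with
+  | Some (id, _) -> Hashtbl.remove cache id
+  | None -> () (* everything pinned: over budget, but nothing to evict *)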
+```
+
+**Migration:** `Lambda.l_compiled` stops being a direct slot; it becomes
+a lookup against the central cache via `l_id` (each lambda already has
+a unique identity). Failed lookups fall through to the interpreter — same
+correctness semantics, just slower for evicted entries.
+
+**Tunable:** `(jit-set-budget! N)` (default 5000; 0 = disable cache).
+
+**Pinning:** `(jit-pin! 'fn-name)` keeps a function from ever being evicted.
+Use for stdlib helpers, hot rendering paths.
+
+### 3. Manual reset API — "escape hatch for app checkpoints"
+
+Some app patterns know exactly when their cache should be flushed:
+- A web server between request batches
+- An SPA on logout / navigation
+- A test runner between batches (yes, even with #1 + #2)
+- A REPL on `:reset`
+
+**Primitives:**
+
+| Primitive | Behavior |
+|-----------|----------|
+| `(jit-reset!)` | Drop all cache entries. Hot paths re-JIT on next call. |
+| `(jit-clear-cold! N)` | Drop only entries that haven't been used in the last N generations. |
+| `(jit-stats)` | Returns dict: `{:size N :budget M :threshold T :hits H :misses I :evictions E}`. |
+| `(jit-set-threshold! N)` | Raise/lower compilation threshold at runtime. |
+| `(jit-set-budget! N)` | Raise/lower cache size budget. |
+| `(jit-pin! sym)` | Pin a named function against eviction. |
+| `(jit-unpin! sym)` | Unpin. |
+
+All near-zero cost when not called — just a few integer counter increments
+on the hot path.
+
+## Where it lives
+
+The JIT is host-specific (OCaml WASM kernel). The plan splits across
+three layers:
+
+```
+hosts/ocaml/lib/sx_jit_cache.ml   NEW      — cache data structure + LRU
+hosts/ocaml/lib/sx_vm.ml          Modified — call counter, lookup integration
+hosts/ocaml/lib/sx_types.ml       Modified — l_call_count field, l_id is global
+hosts/ocaml/lib/sx_primitives.ml  Modified — register jit-* primitives
+spec/primitives.sx                Modified — declarative spec for jit-* primitives
+lib/jit.sx                        NEW      — SX-level helpers + macros
+```
+
+**lib/jit.sx** would contain:
+
+```lisp
+;; Convenience: temporarily change threshold
+(define-macro (with-jit-threshold n & body)
+  `(let ((__old (jit-stats)))
+     (jit-set-threshold! ,n)
+     (let ((__r (do ,@body)))
+       (jit-set-threshold! (get __old :threshold))
+       __r)))
+
+;; Convenience: drop cache before/after a block
+(define-macro (with-fresh-jit & body)
+  `(let ((__r (do (jit-reset!) ,@body))) (jit-reset!) __r))
+
+;; Monitoring helper for dev mode
+(define jit-report
+  (fn ()
+    (let ((s (jit-stats)))
+      (str "jit: " (get s :size) "/" (get s :budget) " entries, "
+           (get s :hits) " hits / " (get s :misses) " misses ("
+           ;; multiply before dividing so integer division keeps precision
+           (/ (* 100 (get s :hits)) (max 1 (+ (get s :hits) (get s :misses))))
+           "%)"))))
+```
+
+This is shared SX — every host language (HS, Common Lisp, Erlang, etc.)
+gets the same API for free.
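+
+As a smoke test of the API shape, here is a hypothetical batch-runner
+sketch; `run-suite` and `suites` are placeholder names, not existing
+functions:
+
+```lisp
+;; Hypothetical sketch: flush suite-local compilations between batches.
+(define run-all
+  (fn (suites)
+    (jit-set-threshold! 4)   ; one-shot test lambdas never get compiled
+    (map (fn (s)
+           (let ((r (run-suite s)))
+             (jit-reset!)    ; drop this suite's cache entries
+             r))
+         suites)))
+```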
+
+## Rollout
+
+**Phase 1: Tiered compilation (1-2 days)**
+- Add `l_call_count` to lambda type
+- Wire counter increment in `cek_call_or_suspend`
+- Add `jit-set-threshold!` primitive
+- Default threshold = 1 (no change in behavior)
+- Bump default to 4 once test suite confirms stability
+- Verify: HS conformance full-suite run completes without JIT saturation
+
+**Phase 2: LRU cache (3-5 days)**
+- Extract `Lambda.l_compiled` into central `sx_jit_cache.ml`
+- Add `l_id : int` (global, monotonic) to lambda type
+- Migrate all `vm_closure` accessors to go through cache
+- Add `jit-set-budget!`, `jit-pin!`, `jit-unpin!` primitives
+- Verify: same full-suite run with budget=100 (deliberately small, to
+  exercise eviction); cache hit/miss ratio stays reasonable
+
+**Phase 3: Reset API + monitoring (1 day)**
+- Add `jit-reset!`, `jit-clear-cold!`, `jit-stats` primitives
+- Add `lib/jit.sx` SX-level wrappers
+- Integrate into HS test runner: call `jit-reset!` between batches as
+  belt-and-suspenders
+- Document in CLAUDE.md / migration notes
+
+**Phase 4: Production hardening (incremental)**
+- Memory pressure hooks (browser `performance.measureUserAgentSpecificMemory`)
+- Bytecode interning (dedupe identical `vm_closure` bodies across lambdas)
+- Generational sweep on idle (browser `requestIdleCallback`)
+- These are nice-to-have, not required for correctness.
+
+## Testing
+
+Each phase ships with:
+- Unit tests in `spec/tests/test-jit-cache.sx` (new file)
+- Conformance must remain 100% per-suite
+- Wall-clock benchmark: full HS suite single-process before/after
+
+Phase 1 acceptance criterion: HS conformance suite completes in a single
+process under 10 minutes wall time.
+
+Phase 2 acceptance: same as Phase 1 but with budget=500. Cache size stays
+bounded throughout the run; hit rate >90% on hot paths.
+
+Phase 3 acceptance: `jit-reset!` between batches reduces test-harness
+wall time by >50% vs no reset (hot stdlib re-JITs within a few calls
+after each flush, while test-specific lambdas never accumulate).
+
+## Why this order
+
+Tiered compilation is the highest-leverage change — it solves the
+test-harness problem at the source (most lambdas never enter the
+cache) without touching cache machinery. LRU is the safety net
+(unbounded growth is still possible if every lambda is hot, e.g. a
+huge dynamic component graph). Reset is the escape hatch for situations
+neither mechanism can handle (logout, hard memory pressure, app
+restart without process restart).
+
+Doing them in reverse order would invert the value — reset alone fixes
+nothing without app-level integration, and LRU without tiered
+compilation churns the cache constantly on cold lambdas.
diff --git a/tests/hs-run-batched.js b/tests/hs-run-batched.js
new file mode 100755
index 00000000..0b88d2f7
--- /dev/null
+++ b/tests/hs-run-batched.js
@@ -0,0 +1,151 @@
+#!/usr/bin/env node
+/**
+ * Batched HS conformance runner — option 2 (per-process kernel isolation).
+ *
+ * Each batch spawns a fresh Node process running tests/hs-run-filtered.js
+ * with HS_START/HS_END set, so the WASM kernel's JIT cache starts empty.
+ * Avoids the cumulative slowdown that hits the 1-process runner around
+ * test 500-700 (compiled lambdas accumulate, allocation stalls).
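+ *
+ * Batch size is the main tuning knob: smaller batches restart the kernel
+ * (and its JIT cache) more often, trading extra WASM boot cost for
+ * flatter per-test times.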
+ *
+ * Usage:
+ *   node tests/hs-run-batched.js
+ *   HS_BATCH_SIZE=100 node tests/hs-run-batched.js
+ *   HS_PARALLEL=4 node tests/hs-run-batched.js
+ */
+const { spawnSync, spawn } = require('child_process');
+const path = require('path');
+
+const FILTERED = path.join(__dirname, 'hs-run-filtered.js');
+const TOTAL = parseInt(process.env.HS_TOTAL || '1496', 10);
+const FROM = parseInt(process.env.HS_FROM || '0', 10);
+const BATCH_SIZE = parseInt(process.env.HS_BATCH_SIZE || '150', 10);
+const PARALLEL = parseInt(process.env.HS_PARALLEL || '1', 10);
+const VERBOSE = !!process.env.HS_VERBOSE;
+
+function makeBatches() {
+  const batches = [];
+  for (let i = FROM; i < TOTAL; i += BATCH_SIZE) {
+    batches.push({ start: i, end: Math.min(i + BATCH_SIZE, TOTAL) });
+  }
+  return batches;
+}
+
+function runBatch({ start, end }) {
+  const t0 = Date.now();
+  const r = spawnSync('node', [FILTERED], {
+    env: { ...process.env, HS_START: String(start), HS_END: String(end) },
+    encoding: 'utf8',
+    timeout: 1800_000, // 30 min per batch hard cap
+  });
+  const out = (r.stdout || '') + (r.stderr || '');
+  const elapsed = Date.now() - t0;
+  return { start, end, elapsed, out, code: r.status };
+}
+
+function parseBatch(out) {
+  const result = { pass: 0, fail: 0, failures: [], slow: [], timeouts: [] };
+  const m = out.match(/Results:\s+(\d+)\/(\d+)/);
+  if (m) {
+    result.pass = parseInt(m[1], 10);
+    const total = parseInt(m[2], 10);
+    result.fail = total - result.pass;
+  }
+  // Capture each "[suite] name: error" failure line
+  const failSection = out.split('All failures:')[1] || '';
+  for (const line of failSection.split('\n')) {
+    const fm = line.match(/^\s*\[([^\]]+)\]\s+(.+?):\s*(.*)$/);
+    if (fm) result.failures.push({ suite: fm[1], name: fm[2], err: fm[3] || '(empty)' });
+  }
+  for (const line of out.split('\n')) {
+    const sm = line.match(/SLOW: test (\d+) took (\d+)ms \[([^\]]+)\] (.+)$/);
+    if (sm) result.slow.push({ idx: +sm[1], ms: +sm[2], suite: sm[3], name: sm[4] });
+    const tm = line.match(/TIMEOUT: test (\d+) \[([^\]]+)\] (.+)$/);
+    if (tm) result.timeouts.push({ idx: +tm[1], suite: tm[2], name: tm[3] });
+  }
+  return result;
+}
+
+function fmtTime(ms) {
+  if (ms < 1000) return `${ms}ms`;
+  if (ms < 60_000) return `${(ms / 1000).toFixed(1)}s`;
+  return `${Math.floor(ms / 60_000)}m${Math.round((ms % 60_000) / 1000)}s`;
+}
+
+async function runParallel(batches, concurrency) {
+  const results = new Array(batches.length);
+  let cursor = 0;
+  async function worker() {
+    while (cursor < batches.length) {
+      const i = cursor++;
+      results[i] = await new Promise((resolve) => {
+        const t0 = Date.now();
+        let out = '';
+        const child = spawn('node', [FILTERED], {
+          env: { ...process.env, HS_START: String(batches[i].start), HS_END: String(batches[i].end) },
+        });
+        child.stdout.on('data', d => out += d);
+        child.stderr.on('data', d => out += d);
+        child.on('exit', (code) => resolve({ ...batches[i], elapsed: Date.now() - t0, out, code }));
+      });
+      const r = parseBatch(results[i].out);
+      process.stderr.write(`  batch ${batches[i].start}-${batches[i].end}: ${r.pass}/${r.pass + r.fail} (${fmtTime(results[i].elapsed)})\n`);
+    }
+  }
+  await Promise.all(Array.from({ length: concurrency }, worker));
+  return results;
+}
+
+(async () => {
+  const batches = makeBatches();
+  const t0 = Date.now();
+  process.stderr.write(`Running ${TOTAL} tests in ${batches.length} batches of ${BATCH_SIZE} (parallelism=${PARALLEL})\n`);
+
+  let results;
+  if (PARALLEL > 1) {
+    results = await runParallel(batches, PARALLEL);
+  } else {
+    results = [];
+    for (const b of batches) {
+      const r = runBatch(b);
+      results.push(r);
+      const p = parseBatch(r.out);
+      process.stderr.write(`  batch ${b.start}-${b.end}: ${p.pass}/${p.pass + p.fail} (${fmtTime(r.elapsed)})\n`);
+    }
+  }
+
+  let totalPass = 0, totalFail = 0;
+  const allFailures = [];
+  const allTimeouts = [];
+  const slowest = [];
+  for (const r of results) {
+    const p = parseBatch(r.out);
+    totalPass += p.pass;
+    totalFail += p.fail;
+    allFailures.push(...p.failures);
+    allTimeouts.push(...p.timeouts);
+    slowest.push(...p.slow);
+    if (VERBOSE) process.stdout.write(r.out);
+  }
+
+  const totalElapsed = Date.now() - t0;
+  process.stdout.write(`\n=== Conformance ===\n`);
+  process.stdout.write(`Total: ${totalPass}/${totalPass + totalFail} (${(100 * totalPass / (totalPass + totalFail)).toFixed(2)}%)\n`);
+  process.stdout.write(`Wall: ${fmtTime(totalElapsed)} across ${batches.length} batches\n`);
+
+  if (allFailures.length) {
+    process.stdout.write(`\nFailures (${allFailures.length}):\n`);
+    for (const f of allFailures) process.stdout.write(`  [${f.suite}] ${f.name}: ${f.err}\n`);
+  }
+  // Timeouts also show up in the failure list; only print this section
+  // when the counts differ, i.e. when it adds information.
+  if (allTimeouts.length && allTimeouts.length !== allFailures.length) {
+    process.stdout.write(`\nTimeouts (${allTimeouts.length}):\n`);
+    for (const t of allTimeouts) process.stdout.write(`  [${t.suite}] ${t.name}\n`);
+  }
+  slowest.sort((a, b) => b.ms - a.ms);
+  if (slowest.length) {
+    process.stdout.write(`\nSlowest 10 tests:\n`);
+    for (const s of slowest.slice(0, 10)) process.stdout.write(`  ${s.ms}ms [${s.suite}] ${s.name}\n`);
+  }
+
+  process.exit(totalFail > 0 ? 1 : 0);
+})();