vm-ext: fix serving-JIT perform-in-HO-callback miscompile (host bug)

Root cause (found via bin/repro_jit_resume.ml, 9 surgical cases): when a
`perform` (durable kv read) fires inside a native HO-primitive callback
(map/filter/reduce/for-each/some/every?), the VmSuspended unwound through
the primitive's native OCaml loop (List.map etc.), destroying the loop's
iteration state. The remaining elements were dropped and the stack left
misaligned, so the NEXT CALL_PRIM (map/rest/drop) read wrong args —
"map: expected (fn list)", "rest: 1 list arg", "drop: list and number".
Only triggers in the http-listen + cek_run_with_io serving path (epoch
eval has no synchronous resolver, so conformance was 271/271).

(A) lib/sx_vm.ml call_closure_reuse: when a callback suspends AND a
synchronous IO resolver is installed (serving mode), resolve the
callback's IO inline and run it to completion right there, returning its
value to the native loop — so the loop is never unwound. Scoped to the
resolver-set path; the CEK-driven path (flow/reactive/async tests) keeps
its existing reuse_stack behaviour, so nothing else changes. reuse_stack
is isolated across the nested resume.

(A') lib/sx_vm.ml resume_vm: re-assert _active_vm := Some vm for the
duration of the resumed run (mirrors call_closure). call_closure restored
_active_vm to the caller when VmSuspended unwound, so HO callbacks during
a resume could land on the wrong VM. Latent-bug fix.

(B) bin/sx_server.ml register_jit_hook: the resolve_loop runs inside the
VmSuspended handler, so a non-VmSuspended exception from resume_vm escaped
to the http handler (→ 500). Catch it and fall back to CEK for THIS call
(mark jit_failed, return None → interpreter re-runs it). Self-heals on the
first hit, not a retry. Defense-in-depth; with (A) it shouldn't trigger.

Verification: repro 9/9 (incl. host shape: map[cb→interpreted-helper
perform]→drop = (7 8); reduce; nested map). Standard + --full OCaml
conformance unchanged at 4834/1110 (baseline identical — the 1110 are
pre-existing environmental: host-call-fn/browser-platform symbols,
rational display, tw/regex). Host loop to re-verify 271/271 serving and
drop its (jit-exclude! "host/*" "dream-*" "dr/*") band-aid.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-28 19:44:12 +00:00
parent fed58b2814
commit 81177d0ebd
4 changed files with 287 additions and 27 deletions

View File

@@ -1,5 +1,5 @@
(executables (executables
(names run_tests debug_set sx_server integration_tests bench_cek bench_inspect bench_vm) (names run_tests debug_set sx_server integration_tests bench_cek bench_inspect bench_vm repro_jit_resume)
(libraries sx unix threads.posix otfm yojson)) (libraries sx unix threads.posix otfm yojson))
(executable (executable

View File

@@ -0,0 +1,202 @@
(* Surgical repro for the serving-JIT OP_PERFORM/resume stack misalignment.
Mirrors what register_jit_hook's resolve_loop does: call_closure, catch
VmSuspended, resolve IO (return Nil), resume_vm — looping on re-suspend.
No CEK evaluator needed for the direct/multi-frame/reuse paths. *)
open Sx_types
let req_dict () =
let h = Hashtbl.create 1 in
Hashtbl.replace h "op" (String "noop");
Dict h
(* Mirror the serving hook's resolve loop exactly. *)
let drive cl =
let globals = cl.vm_closure_env |> ignore; cl.vm_env_ref in
let rec resolve_loop req vm =
let _ = req in
(try Sx_vm.resume_vm vm Nil
with Sx_vm.VmSuspended (r2, v2) -> resolve_loop r2 v2)
in
try Sx_vm.call_closure cl [] globals
with Sx_vm.VmSuspended (req, vm) -> resolve_loop req vm
let mk_code ~locals ~bc ~consts = {
vc_arity = 0; vc_rest_arity = -1; vc_locals = locals;
vc_bytecode = Array.of_list bc;
vc_constants = Array.of_list consts;
vc_bytecode_list = None; vc_constants_list = None;
}
let mk_cl ?(name="tf") ?(env=Hashtbl.create 64) code =
{ vm_code = code; vm_upvalues = [||]; vm_name = Some name;
vm_env_ref = env; vm_closure_env = None }
let report label v =
Printf.printf "%-28s => %s\n%!" label (Sx_runtime.value_to_str v)
let run label f =
(try report label (f ())
with
| Eval_error m -> Printf.printf "%-28s => ERROR: %s\n%!" label m
| e -> Printf.printf "%-28s => EXN: %s\n%!" label (Printexc.to_string e))
(* opcodes *)
let _const i = [1; i land 0xff; (i lsr 8) land 0xff]
let _perform = [112]
let _pop = [5]
let _call_prim idx argc = [52; idx land 0xff; (idx lsr 8) land 0xff; argc]
let _call argc = [48; argc]
let _return = [50]
let () =
(* Serving mode: a synchronous IO resolver is installed (mirrors
sx_server's http setup). Our mock resolves every request to Nil. *)
Sx_types._cek_io_resolver := Some (fun _req _ -> Nil);
(* Case 1: direct OP_PERFORM then a list prim in the SAME frame.
(do (perform {..}) (rest (list 1 2 3))) => (2 3) *)
run "1.direct perform→rest" (fun () ->
let consts = [ req_dict (); List [Number 1.; Number 2.; Number 3.]; String "rest" ] in
let bc = _const 0 @ _perform @ _pop @ _const 1 @ _call_prim 2 1 @ _return in
drive (mk_cl (mk_code ~locals:0 ~bc ~consts)));
(* Case 2: direct perform then map (2-arg prim).
(do (perform {..}) (map inc (list 1 2 3))) — needs a fn; use a NativeFn const *)
run "2.direct perform→map" (fun () ->
let inc = NativeFn ("inc1", function [Number n] -> Number (n +. 1.) | _ -> Nil) in
let consts = [ req_dict (); inc; List [Number 1.; Number 2.; Number 3.]; String "map" ] in
(* push fn, push list, CALL_PRIM map 2 *)
let bc = _const 0 @ _perform @ _pop @ _const 1 @ _const 2 @ _call_prim 3 2 @ _return in
drive (mk_cl (mk_code ~locals:0 ~bc ~consts)));
(* Case 3: multi-frame — outer calls a JIT'd helper that performs, THEN outer maps.
helper: (do (perform {..}) 99)
outer: (do (helper) (map inc (list 1 2 3))) *)
run "3.multiframe perform→map" (fun () ->
let env = Hashtbl.create 64 in
let helper_code = mk_code ~locals:0
~bc:(_const 0 @ _perform @ _pop @ _const 1 @ _return)
~consts:[ req_dict (); Number 99. ] in
let helper_cl = mk_cl ~name:"helper" ~env helper_code in
let inc = NativeFn ("inc1", function [Number n] -> Number (n +. 1.) | _ -> Nil) in
let consts = [ VmClosure helper_cl; inc; List [Number 1.; Number 2.; Number 3.]; String "map" ] in
(* push helper-closure, CALL 0, POP its result, push inc, push list, CALL_PRIM map 2 *)
let bc = _const 0 @ _call 0 @ _pop @ _const 1 @ _const 2 @ _call_prim 3 2 @ _return in
drive (mk_cl ~name:"outer" ~env (mk_code ~locals:0 ~bc ~consts)));
(* Case 4: map whose CALLBACK performs (reuse_stack path), then a trailing prim.
callback: (do (perform {..}) (inc e)) — but callback gets arg e in slot 0
outer: (do (map cb (list 1 2 3)) (rest (list 7 8 9))) *)
run "4.map-callback-perform" (fun () ->
let env = Hashtbl.create 64 in
(* callback arity 1: slot0 = e. body: (perform {..}); (inc e) ; return
LOCAL_GET 0 then CALL_PRIM inc... use NativeFn inc via CALL_PRIM *)
let cb_code = {
vc_arity = 1; vc_rest_arity = -1; vc_locals = 1;
vc_bytecode = Array.of_list (_const 0 @ _perform @ _pop
@ [16;0] (* LOCAL_GET 0 *)
@ _call_prim 1 1 @ _return);
vc_constants = [| req_dict (); String "inc" |];
vc_bytecode_list = None; vc_constants_list = None } in
let cb_cl = mk_cl ~name:"cb" ~env cb_code in
let consts = [ VmClosure cb_cl; List [Number 1.; Number 2.; Number 3.]; String "map";
List [Number 7.; Number 8.; Number 9.]; String "rest" ] in
(* push cb, push list, CALL_PRIM map 2, POP, push list2, CALL_PRIM rest 1, RETURN *)
let bc = _const 0 @ _const 1 @ _call_prim 2 2 @ _pop @ _const 3 @ _call_prim 4 1 @ _return in
drive (mk_cl ~name:"outer4" ~env (mk_code ~locals:0 ~bc ~consts)));
(* Case 5: THE HOST CASE — perform via an INTERPRETED helper (pending_cek path),
then a list prim. helper is a Lambda (l_compiled = jit_failed) whose body
performs; vm_call routes it through cek_call_or_suspend → pending_cek.
helper: (perform {..}) [interpreted via CEK]
outer: (do (helper) (rest (list 1 2 3))) => (2 3) *)
run "5.pending_cek perform→rest" (fun () ->
let env = Sx_types.make_env () in
let helper = Lambda {
l_params = []; l_body = List [Symbol "perform"; req_dict ()];
l_closure = env; l_name = Some "kvread";
l_compiled = Some Sx_vm.jit_failed_sentinel; l_call_count = 0;
l_uid = Sx_types.next_lambda_uid () } in
let consts = [ helper; List [Number 1.; Number 2.; Number 3.]; String "rest" ] in
(* push helper, CALL 0, POP, push list, CALL_PRIM rest 1, RETURN *)
let bc = _const 0 @ _call 0 @ _pop @ _const 1 @ _call_prim 2 1 @ _return in
drive (mk_cl ~name:"outer5" ~env:(Hashtbl.create 64) (mk_code ~locals:0 ~bc ~consts)));
(* Case 6: pending_cek perform → MAP (2-arg), the exact host shape. *)
run "6.pending_cek perform→map" (fun () ->
let env = Sx_types.make_env () in
let helper = Lambda {
l_params = []; l_body = List [Symbol "perform"; req_dict ()];
l_closure = env; l_name = Some "kvread";
l_compiled = Some Sx_vm.jit_failed_sentinel; l_call_count = 0;
l_uid = Sx_types.next_lambda_uid () } in
let inc = NativeFn ("inc1", function [Number n] -> Number (n +. 1.) | _ -> Nil) in
let consts = [ helper; inc; List [Number 1.; Number 2.; Number 3.]; String "map" ] in
(* push helper, CALL 0, POP, push inc, push list, CALL_PRIM map 2, RETURN *)
let bc = _const 0 @ _call 0 @ _pop @ _const 1 @ _const 2 @ _call_prim 3 2 @ _return in
drive (mk_cl ~name:"outer6" ~env:(Hashtbl.create 64) (mk_code ~locals:0 ~bc ~consts)));
(* Case 7: THE HOST SHAPE — map whose callback calls an INTERPRETED helper
that performs (kv read via persist helper inside a map), THEN a trailing
prim. callback(e): (do (kvread) e) — kvread suspends via pending_cek.
outer: (do (map cb (list 1 2 3)) (drop (list 5 6 7 8) 2)) => (7 8) *)
run "7.HOST: map[cb→helper perform]→drop" (fun () ->
let genv = Sx_types.make_env () in
let helper = Lambda {
l_params = []; l_body = List [Symbol "perform"; req_dict ()];
l_closure = genv; l_name = Some "kvread";
l_compiled = Some Sx_vm.jit_failed_sentinel; l_call_count = 0;
l_uid = Sx_types.next_lambda_uid () } in
let env = Hashtbl.create 64 in
(* cb(e): push helper, CALL 0, POP, LOCAL_GET 0, RETURN *)
let cb_code = {
vc_arity = 1; vc_rest_arity = -1; vc_locals = 1;
vc_bytecode = Array.of_list (_const 0 @ _call 0 @ _pop @ [16;0] @ _return);
vc_constants = [| helper |]; vc_bytecode_list=None; vc_constants_list=None } in
let cb_cl = mk_cl ~name:"cb7" ~env cb_code in
let consts = [ VmClosure cb_cl; List [Number 1.; Number 2.; Number 3.]; String "map";
List [Number 5.; Number 6.; Number 7.; Number 8.]; Number 2.; String "drop" ] in
(* push cb, push list, CALL_PRIM map 2, POP, push list2, push 2, CALL_PRIM drop 2, RETURN *)
let bc = _const 0 @ _const 1 @ _call_prim 2 2 @ _pop
@ _const 3 @ _const 4 @ _call_prim 5 2 @ _return in
drive (mk_cl ~name:"outer7" ~env (mk_code ~locals:0 ~bc ~consts)));
(* Case 8: reduce whose callback performs. (reduce + 0 (list 1 2 3)) with a
perform in the reducer => 6 *)
run "8.reduce[acc→perform]" (fun () ->
let env = Hashtbl.create 64 in
(* reducer(acc e): (do (perform {..}) (+ acc e)). slots: 0=acc 1=e *)
let rd_code = {
vc_arity = 2; vc_rest_arity = -1; vc_locals = 2;
vc_bytecode = Array.of_list (_const 0 @ _perform @ _pop
@ [16;0] @ [16;1] @ _call_prim 1 2 @ _return);
vc_constants = [| req_dict (); String "+" |];
vc_bytecode_list=None; vc_constants_list=None } in
let rd_cl = mk_cl ~name:"rd" ~env rd_code in
let consts = [ VmClosure rd_cl; Number 0.; List [Number 1.; Number 2.; Number 3.]; String "reduce" ] in
(* push reducer, push 0, push list, CALL_PRIM reduce 3, RETURN *)
let bc = _const 0 @ _const 1 @ _const 2 @ _call_prim 3 3 @ _return in
drive (mk_cl ~name:"outer8" ~env (mk_code ~locals:0 ~bc ~consts)));
(* Case 9: nested map — outer map callback runs an inner map whose callback
performs. outer over (list 1 2), inner over (list 10 20) performing.
cb_outer(x): (map cb_inner (list 10 20)) ; cb_inner(y): (do (perform) y)
=> ((10 20) (10 20)) *)
run "9.nested map[inner→perform]" (fun () ->
let env = Hashtbl.create 64 in
let inner_code = {
vc_arity = 1; vc_rest_arity = -1; vc_locals = 1;
vc_bytecode = Array.of_list (_const 0 @ _perform @ _pop @ [16;0] @ _return);
vc_constants = [| req_dict () |]; vc_bytecode_list=None; vc_constants_list=None } in
let inner_cl = mk_cl ~name:"cbin" ~env inner_code in
(* outer cb(x): push inner_cl, push (10 20), CALL_PRIM map 2, RETURN *)
let outer_cb_code = {
vc_arity = 1; vc_rest_arity = -1; vc_locals = 1;
vc_bytecode = Array.of_list (_const 0 @ _const 1 @ _call_prim 2 2 @ _return);
vc_constants = [| VmClosure inner_cl; List [Number 10.; Number 20.]; String "map" |];
vc_bytecode_list=None; vc_constants_list=None } in
let outer_cb_cl = mk_cl ~name:"cbout" ~env outer_cb_code in
let consts = [ VmClosure outer_cb_cl; List [Number 1.; Number 2.]; String "map" ] in
let bc = _const 0 @ _const 1 @ _call_prim 2 2 @ _return in
drive (mk_cl ~name:"outer9" ~env (mk_code ~locals:0 ~bc ~consts)))

View File

@@ -1216,7 +1216,23 @@ let register_jit_hook env =
let rec resolve_loop req vm = let rec resolve_loop req vm =
let result = resolver req (Nil) in let result = resolver req (Nil) in
(try Some (Sx_vm.resume_vm vm result) (try Some (Sx_vm.resume_vm vm result)
with Sx_vm.VmSuspended (req2, vm2) -> resolve_loop req2 vm2) with
| Sx_vm.VmSuspended (req2, vm2) -> resolve_loop req2 vm2
| e ->
(* (B) Resume raised mid-execution. resolve_loop runs inside
the VmSuspended handler, so without catching here the
error escapes to the http handler (→ 500). Recover THIS
call on the CEK instead: mark jit_failed and return None
so the interpreter re-runs it (idempotent for the host's
durable reads). Self-heals on the first hit, not a retry. *)
let fn_name = match l.l_name with Some n -> n | None -> "?" in
if not (Hashtbl.mem _jit_warned fn_name) then begin
Hashtbl.replace _jit_warned fn_name true;
Printf.eprintf "[jit] %s resume fallback to CEK: %s\n%!"
fn_name (Printexc.to_string e)
end;
l.l_compiled <- Some Sx_vm.jit_failed_sentinel;
None)
in in
resolve_loop request saved_vm resolve_loop request saved_vm
| None -> Some (make_vm_suspend_marker request saved_vm)) | None -> Some (make_vm_suspend_marker request saved_vm))
@@ -1249,7 +1265,16 @@ let register_jit_hook env =
let rec resolve_loop req vm = let rec resolve_loop req vm =
let result = resolver req (Nil) in let result = resolver req (Nil) in
(try Some (Sx_vm.resume_vm vm result) (try Some (Sx_vm.resume_vm vm result)
with Sx_vm.VmSuspended (req2, vm2) -> resolve_loop req2 vm2) with
| Sx_vm.VmSuspended (req2, vm2) -> resolve_loop req2 vm2
| e ->
(* (B) See note above — recover a failed resume on the
CEK instead of escaping to the handler (→ 500). *)
Printf.eprintf "[jit] %s resume fallback to CEK: %s\n%!"
fn_name (Printexc.to_string e);
Hashtbl.replace _jit_warned fn_name true;
l.l_compiled <- Some Sx_vm.jit_failed_sentinel;
None)
in in
resolve_loop request saved_vm resolve_loop request saved_vm
| None -> Some (make_vm_suspend_marker request saved_vm)) | None -> Some (make_vm_suspend_marker request saved_vm))

View File

@@ -336,30 +336,51 @@ and call_closure_reuse cl args =
push_closure_frame vm cl args; push_closure_frame vm cl args;
let saved_frames = List.tl vm.frames in let saved_frames = List.tl vm.frames in
vm.frames <- [List.hd vm.frames]; vm.frames <- [List.hd vm.frames];
(try run vm let result =
(try run vm;
(* Normal completion: result sits at the top of the stack.
OP_RETURN normally leaves sp = saved_sp + 1, but the
bytecode-exhausted path (or a callee that returns a closure whose
own RETURN leaves extra stack residue) can leave sp inconsistent.
Read the result at the expected slot. *)
if vm.sp > saved_sp then vm.stack.(vm.sp - 1) else Nil
with with
| VmSuspended _ as e -> | VmSuspended (req, _) as e ->
(* IO suspension: save the caller's continuation on the reuse stack. (match !Sx_types._cek_io_resolver with
DON'T merge frames — that corrupts the frame chain with nested | Some resolver ->
closures. On resume, restore_reuse in resume_vm processes these (* Serving path: a `perform` fired inside this HO-primitive
in innermost-first order after the callback finishes. *) callback (map/filter/reduce/for-each/…). The primitive's native
OCaml loop sits between us and the resume point, so we CANNOT
unwind it and resume later (the loop state would be lost and the
remaining elements dropped — corrupting the stack so the next
CALL_PRIM sees wrong args). Instead resolve the callback's IO
inline and run it to completion right here, returning its value
to the native loop exactly as a non-suspending callback would.
reuse_stack is isolated so an outer suspension's saved
continuations aren't consumed by this nested resume. *)
let saved_reuse = vm.reuse_stack in
vm.reuse_stack <- [];
let rec settle req =
let r = resolver req Nil in
(try resume_vm vm r
with VmSuspended (req2, _) -> settle req2)
in
let cb = settle req in
vm.reuse_stack <- saved_reuse;
cb
| None ->
(* CEK-driven path (no synchronous resolver): preserve the existing
behaviour — save the caller's continuation on the reuse stack and
re-raise so resume_vm restores it after the callback finishes.
DON'T merge frames — that corrupts the frame chain. *)
vm.reuse_stack <- (saved_frames, saved_sp) :: vm.reuse_stack; vm.reuse_stack <- (saved_frames, saved_sp) :: vm.reuse_stack;
raise e raise e)
| e -> | e ->
vm.frames <- saved_frames; vm.frames <- saved_frames;
vm.sp <- saved_sp; vm.sp <- saved_sp;
raise e); raise e)
vm.frames <- saved_frames;
(* Snapshot/restore sp around the popped result.
OP_RETURN normally leaves sp = saved_sp + 1, but the bytecode-exhausted
path (or a callee that returns a closure whose own RETURN leaves extra
stack residue) can leave sp inconsistent. Read the result at the
expected slot and reset sp explicitly so the parent frame's
intermediate values are not corrupted. *)
let result =
if vm.sp > saved_sp then vm.stack.(vm.sp - 1)
else Nil
in in
vm.frames <- saved_frames;
vm.sp <- saved_sp; vm.sp <- saved_sp;
result result
| None -> | None ->
@@ -938,7 +959,17 @@ and run vm =
After the callback finishes, restores any call_closure_reuse After the callback finishes, restores any call_closure_reuse
continuations saved on vm.reuse_stack (innermost first). *) continuations saved on vm.reuse_stack (innermost first). *)
let resume_vm vm result = and resume_vm vm result =
(* The resumed execution runs on [vm]; HO primitives (map/filter/…) called
during the resume reach for [!_active_vm] to run their callbacks on the
same stack. call_closure restored [_active_vm] to the *caller* when the
original VmSuspended unwound through it, so without re-asserting it here
the resumed run's callbacks land on the wrong VM (or allocate a fresh
one), corrupting the stack. Mirror call_closure's save/set/restore. *)
let prev_active = !_active_vm in
_active_vm := Some vm;
let restore () = _active_vm := prev_active in
(try
(match vm.pending_cek with (match vm.pending_cek with
| Some cek_state -> | Some cek_state ->
vm.pending_cek <- None; vm.pending_cek <- None;
@@ -1010,7 +1041,9 @@ let resume_vm vm result =
let pending = List.rev vm.reuse_stack in let pending = List.rev vm.reuse_stack in
vm.reuse_stack <- []; vm.reuse_stack <- [];
restore_reuse pending; restore_reuse pending;
pop vm let r = pop vm in
restore (); r
with e -> restore (); raise e)
(** Execute a compiled module (top-level bytecode). *) (** Execute a compiled module (top-level bytecode). *)
let execute_module code globals = let execute_module code globals =