vm-ext: fix serving-JIT perform-in-HO-callback miscompile (host bug)
Root cause (found via bin/repro_jit_resume.ml, 9 surgical cases): when a `perform` (durable kv read) fires inside a native HO-primitive callback (map/filter/reduce/for-each/some/every?), the VmSuspended unwound through the primitive's native OCaml loop (List.map etc.), destroying the loop's iteration state. The remaining elements were dropped and the stack left misaligned, so the NEXT CALL_PRIM (map/rest/drop) read wrong args — "map: expected (fn list)", "rest: 1 list arg", "drop: list and number". Only triggers in the http-listen + cek_run_with_io serving path (epoch eval has no synchronous resolver, so conformance was 271/271). (A) lib/sx_vm.ml call_closure_reuse: when a callback suspends AND a synchronous IO resolver is installed (serving mode), resolve the callback's IO inline and run it to completion right there, returning its value to the native loop — so the loop is never unwound. Scoped to the resolver-set path; the CEK-driven path (flow/reactive/async tests) keeps its existing reuse_stack behaviour, so nothing else changes. reuse_stack is isolated across the nested resume. (A') lib/sx_vm.ml resume_vm: re-assert _active_vm := Some vm for the duration of the resumed run (mirrors call_closure). call_closure restored _active_vm to the caller when VmSuspended unwound, so HO callbacks during a resume could land on the wrong VM. Latent-bug fix. (B) bin/sx_server.ml register_jit_hook: the resolve_loop runs inside the VmSuspended handler, so a non-VmSuspended exception from resume_vm escaped to the http handler (→ 500). Catch it and fall back to CEK for THIS call (mark jit_failed, return None → interpreter re-runs it). Self-heals on the first hit, not a retry. Defense-in-depth; with (A) it shouldn't trigger. Verification: repro 9/9 (incl. host shape: map[cb→interpreted-helper perform]→drop = (7 8); reduce; nested map). Standard + --full OCaml conformance unchanged at 4834/1110 (baseline identical — the 1110 are pre-existing environmental: host-call-fn/browser-platform symbols, rational display, tw/regex). Host loop to re-verify 271/271 serving and drop its (jit-exclude! "host/*" "dream-*" "dr/*") band-aid. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -336,30 +336,51 @@ and call_closure_reuse cl args =
|
||||
push_closure_frame vm cl args;
|
||||
let saved_frames = List.tl vm.frames in
|
||||
vm.frames <- [List.hd vm.frames];
|
||||
(try run vm
|
||||
with
|
||||
| VmSuspended _ as e ->
|
||||
(* IO suspension: save the caller's continuation on the reuse stack.
|
||||
DON'T merge frames — that corrupts the frame chain with nested
|
||||
closures. On resume, restore_reuse in resume_vm processes these
|
||||
in innermost-first order after the callback finishes. *)
|
||||
vm.reuse_stack <- (saved_frames, saved_sp) :: vm.reuse_stack;
|
||||
raise e
|
||||
| e ->
|
||||
vm.frames <- saved_frames;
|
||||
vm.sp <- saved_sp;
|
||||
raise e);
|
||||
vm.frames <- saved_frames;
|
||||
(* Snapshot/restore sp around the popped result.
|
||||
OP_RETURN normally leaves sp = saved_sp + 1, but the bytecode-exhausted
|
||||
path (or a callee that returns a closure whose own RETURN leaves extra
|
||||
stack residue) can leave sp inconsistent. Read the result at the
|
||||
expected slot and reset sp explicitly so the parent frame's
|
||||
intermediate values are not corrupted. *)
|
||||
let result =
|
||||
if vm.sp > saved_sp then vm.stack.(vm.sp - 1)
|
||||
else Nil
|
||||
(try run vm;
|
||||
(* Normal completion: result sits at the top of the stack.
|
||||
OP_RETURN normally leaves sp = saved_sp + 1, but the
|
||||
bytecode-exhausted path (or a callee that returns a closure whose
|
||||
own RETURN leaves extra stack residue) can leave sp inconsistent.
|
||||
Read the result at the expected slot. *)
|
||||
if vm.sp > saved_sp then vm.stack.(vm.sp - 1) else Nil
|
||||
with
|
||||
| VmSuspended (req, _) as e ->
|
||||
(match !Sx_types._cek_io_resolver with
|
||||
| Some resolver ->
|
||||
(* Serving path: a `perform` fired inside this HO-primitive
|
||||
callback (map/filter/reduce/for-each/…). The primitive's native
|
||||
OCaml loop sits between us and the resume point, so we CANNOT
|
||||
unwind it and resume later (the loop state would be lost and the
|
||||
remaining elements dropped — corrupting the stack so the next
|
||||
CALL_PRIM sees wrong args). Instead resolve the callback's IO
|
||||
inline and run it to completion right here, returning its value
|
||||
to the native loop exactly as a non-suspending callback would.
|
||||
reuse_stack is isolated so an outer suspension's saved
|
||||
continuations aren't consumed by this nested resume. *)
|
||||
let saved_reuse = vm.reuse_stack in
|
||||
vm.reuse_stack <- [];
|
||||
let rec settle req =
|
||||
let r = resolver req Nil in
|
||||
(try resume_vm vm r
|
||||
with VmSuspended (req2, _) -> settle req2)
|
||||
in
|
||||
let cb = settle req in
|
||||
vm.reuse_stack <- saved_reuse;
|
||||
cb
|
||||
| None ->
|
||||
(* CEK-driven path (no synchronous resolver): preserve the existing
|
||||
behaviour — save the caller's continuation on the reuse stack and
|
||||
re-raise so resume_vm restores it after the callback finishes.
|
||||
DON'T merge frames — that corrupts the frame chain. *)
|
||||
vm.reuse_stack <- (saved_frames, saved_sp) :: vm.reuse_stack;
|
||||
raise e)
|
||||
| e ->
|
||||
vm.frames <- saved_frames;
|
||||
vm.sp <- saved_sp;
|
||||
raise e)
|
||||
in
|
||||
vm.frames <- saved_frames;
|
||||
vm.sp <- saved_sp;
|
||||
result
|
||||
| None ->
|
||||
@@ -938,7 +959,17 @@ and run vm =
|
||||
|
||||
After the callback finishes, restores any call_closure_reuse
|
||||
continuations saved on vm.reuse_stack (innermost first). *)
|
||||
let resume_vm vm result =
|
||||
and resume_vm vm result =
|
||||
(* The resumed execution runs on [vm]; HO primitives (map/filter/…) called
|
||||
during the resume reach for [!_active_vm] to run their callbacks on the
|
||||
same stack. call_closure restored [_active_vm] to the *caller* when the
|
||||
original VmSuspended unwound through it, so without re-asserting it here
|
||||
the resumed run's callbacks land on the wrong VM (or allocate a fresh
|
||||
one), corrupting the stack. Mirror call_closure's save/set/restore. *)
|
||||
let prev_active = !_active_vm in
|
||||
_active_vm := Some vm;
|
||||
let restore () = _active_vm := prev_active in
|
||||
(try
|
||||
(match vm.pending_cek with
|
||||
| Some cek_state ->
|
||||
vm.pending_cek <- None;
|
||||
@@ -1010,7 +1041,9 @@ let resume_vm vm result =
|
||||
let pending = List.rev vm.reuse_stack in
|
||||
vm.reuse_stack <- [];
|
||||
restore_reuse pending;
|
||||
pop vm
|
||||
let r = pop vm in
|
||||
restore (); r
|
||||
with e -> restore (); raise e)
|
||||
|
||||
(** Execute a compiled module (top-level bytecode). *)
|
||||
let execute_module code globals =
|
||||
|
||||
Reference in New Issue
Block a user