diff --git a/hosts/ocaml/bin/dune b/hosts/ocaml/bin/dune index 1d892dd4..06a54810 100644 --- a/hosts/ocaml/bin/dune +++ b/hosts/ocaml/bin/dune @@ -1,5 +1,5 @@ (executables - (names run_tests debug_set sx_server integration_tests bench_cek bench_inspect bench_vm) + (names run_tests debug_set sx_server integration_tests bench_cek bench_inspect bench_vm repro_jit_resume) (libraries sx unix threads.posix otfm yojson)) (executable diff --git a/hosts/ocaml/bin/repro_jit_resume.ml b/hosts/ocaml/bin/repro_jit_resume.ml new file mode 100644 index 00000000..4d5f58bf --- /dev/null +++ b/hosts/ocaml/bin/repro_jit_resume.ml @@ -0,0 +1,202 @@ +(* Surgical repro for the serving-JIT OP_PERFORM/resume stack misalignment. + Mirrors what register_jit_hook's resolve_loop does: call_closure, catch + VmSuspended, resolve IO (return Nil), resume_vm — looping on re-suspend. + No CEK evaluator needed for the direct/multi-frame/reuse paths. *) + +open Sx_types + +let req_dict () = + let h = Hashtbl.create 1 in + Hashtbl.replace h "op" (String "noop"); + Dict h + +(* Mirror the serving hook's resolve loop exactly. *) +let drive cl = + let globals = cl.vm_closure_env |> ignore; cl.vm_env_ref in + let rec resolve_loop req vm = + let _ = req in + (try Sx_vm.resume_vm vm Nil + with Sx_vm.VmSuspended (r2, v2) -> resolve_loop r2 v2) + in + try Sx_vm.call_closure cl [] globals + with Sx_vm.VmSuspended (req, vm) -> resolve_loop req vm + +let mk_code ~locals ~bc ~consts = { + vc_arity = 0; vc_rest_arity = -1; vc_locals = locals; + vc_bytecode = Array.of_list bc; + vc_constants = Array.of_list consts; + vc_bytecode_list = None; vc_constants_list = None; +} + +let mk_cl ?(name="tf") ?(env=Hashtbl.create 64) code = + { vm_code = code; vm_upvalues = [||]; vm_name = Some name; + vm_env_ref = env; vm_closure_env = None } + +let report label v = + Printf.printf "%-28s => %s\n%!" label (Sx_runtime.value_to_str v) + +let run label f = + (try report label (f ()) + with + | Eval_error m -> Printf.printf "%-28s => ERROR: %s\n%!" label m + | e -> Printf.printf "%-28s => EXN: %s\n%!" label (Printexc.to_string e)) + +(* opcodes *) +let _const i = [1; i land 0xff; (i lsr 8) land 0xff] +let _perform = [112] +let _pop = [5] +let _call_prim idx argc = [52; idx land 0xff; (idx lsr 8) land 0xff; argc] +let _call argc = [48; argc] +let _return = [50] + +let () = + (* Serving mode: a synchronous IO resolver is installed (mirrors + sx_server's http setup). Our mock resolves every request to Nil. *) + Sx_types._cek_io_resolver := Some (fun _req _ -> Nil); + + (* Case 1: direct OP_PERFORM then a list prim in the SAME frame. + (do (perform {..}) (rest (list 1 2 3))) => (2 3) *) + run "1.direct perform→rest" (fun () -> + let consts = [ req_dict (); List [Number 1.; Number 2.; Number 3.]; String "rest" ] in + let bc = _const 0 @ _perform @ _pop @ _const 1 @ _call_prim 2 1 @ _return in + drive (mk_cl (mk_code ~locals:0 ~bc ~consts))); + + (* Case 2: direct perform then map (2-arg prim). + (do (perform {..}) (map inc (list 1 2 3))) — needs a fn; use a NativeFn const *) + run "2.direct perform→map" (fun () -> + let inc = NativeFn ("inc1", function [Number n] -> Number (n +. 1.) | _ -> Nil) in + let consts = [ req_dict (); inc; List [Number 1.; Number 2.; Number 3.]; String "map" ] in + (* push fn, push list, CALL_PRIM map 2 *) + let bc = _const 0 @ _perform @ _pop @ _const 1 @ _const 2 @ _call_prim 3 2 @ _return in + drive (mk_cl (mk_code ~locals:0 ~bc ~consts))); + + (* Case 3: multi-frame — outer calls a JIT'd helper that performs, THEN outer maps. + helper: (do (perform {..}) 99) + outer: (do (helper) (map inc (list 1 2 3))) *) + run "3.multiframe perform→map" (fun () -> + let env = Hashtbl.create 64 in + let helper_code = mk_code ~locals:0 + ~bc:(_const 0 @ _perform @ _pop @ _const 1 @ _return) + ~consts:[ req_dict (); Number 99. ] in + let helper_cl = mk_cl ~name:"helper" ~env helper_code in + let inc = NativeFn ("inc1", function [Number n] -> Number (n +. 1.) | _ -> Nil) in + let consts = [ VmClosure helper_cl; inc; List [Number 1.; Number 2.; Number 3.]; String "map" ] in + (* push helper-closure, CALL 0, POP its result, push inc, push list, CALL_PRIM map 2 *) + let bc = _const 0 @ _call 0 @ _pop @ _const 1 @ _const 2 @ _call_prim 3 2 @ _return in + drive (mk_cl ~name:"outer" ~env (mk_code ~locals:0 ~bc ~consts))); + + (* Case 4: map whose CALLBACK performs (reuse_stack path), then a trailing prim. + callback: (do (perform {..}) (inc e)) — but callback gets arg e in slot 0 + outer: (do (map cb (list 1 2 3)) (rest (list 7 8 9))) *) + run "4.map-callback-perform" (fun () -> + let env = Hashtbl.create 64 in + (* callback arity 1: slot0 = e. body: (perform {..}); (inc e) ; return + LOCAL_GET 0 then CALL_PRIM inc... use NativeFn inc via CALL_PRIM *) + let cb_code = { + vc_arity = 1; vc_rest_arity = -1; vc_locals = 1; + vc_bytecode = Array.of_list (_const 0 @ _perform @ _pop + @ [16;0] (* LOCAL_GET 0 *) + @ _call_prim 1 1 @ _return); + vc_constants = [| req_dict (); String "inc" |]; + vc_bytecode_list = None; vc_constants_list = None } in + let cb_cl = mk_cl ~name:"cb" ~env cb_code in + let consts = [ VmClosure cb_cl; List [Number 1.; Number 2.; Number 3.]; String "map"; + List [Number 7.; Number 8.; Number 9.]; String "rest" ] in + (* push cb, push list, CALL_PRIM map 2, POP, push list2, CALL_PRIM rest 1, RETURN *) + let bc = _const 0 @ _const 1 @ _call_prim 2 2 @ _pop @ _const 3 @ _call_prim 4 1 @ _return in + drive (mk_cl ~name:"outer4" ~env (mk_code ~locals:0 ~bc ~consts))); + + (* Case 5: THE HOST CASE — perform via an INTERPRETED helper (pending_cek path), + then a list prim. helper is a Lambda (l_compiled = jit_failed) whose body + performs; vm_call routes it through cek_call_or_suspend → pending_cek. + helper: (perform {..}) [interpreted via CEK] + outer: (do (helper) (rest (list 1 2 3))) => (2 3) *) + run "5.pending_cek perform→rest" (fun () -> + let env = Sx_types.make_env () in + let helper = Lambda { + l_params = []; l_body = List [Symbol "perform"; req_dict ()]; + l_closure = env; l_name = Some "kvread"; + l_compiled = Some Sx_vm.jit_failed_sentinel; l_call_count = 0; + l_uid = Sx_types.next_lambda_uid () } in + let consts = [ helper; List [Number 1.; Number 2.; Number 3.]; String "rest" ] in + (* push helper, CALL 0, POP, push list, CALL_PRIM rest 1, RETURN *) + let bc = _const 0 @ _call 0 @ _pop @ _const 1 @ _call_prim 2 1 @ _return in + drive (mk_cl ~name:"outer5" ~env:(Hashtbl.create 64) (mk_code ~locals:0 ~bc ~consts))); + + (* Case 6: pending_cek perform → MAP (2-arg), the exact host shape. *) + run "6.pending_cek perform→map" (fun () -> + let env = Sx_types.make_env () in + let helper = Lambda { + l_params = []; l_body = List [Symbol "perform"; req_dict ()]; + l_closure = env; l_name = Some "kvread"; + l_compiled = Some Sx_vm.jit_failed_sentinel; l_call_count = 0; + l_uid = Sx_types.next_lambda_uid () } in + let inc = NativeFn ("inc1", function [Number n] -> Number (n +. 1.) | _ -> Nil) in + let consts = [ helper; inc; List [Number 1.; Number 2.; Number 3.]; String "map" ] in + (* push helper, CALL 0, POP, push inc, push list, CALL_PRIM map 2, RETURN *) + let bc = _const 0 @ _call 0 @ _pop @ _const 1 @ _const 2 @ _call_prim 3 2 @ _return in + drive (mk_cl ~name:"outer6" ~env:(Hashtbl.create 64) (mk_code ~locals:0 ~bc ~consts))); + + (* Case 7: THE HOST SHAPE — map whose callback calls an INTERPRETED helper + that performs (kv read via persist helper inside a map), THEN a trailing + prim. callback(e): (do (kvread) e) — kvread suspends via pending_cek. + outer: (do (map cb (list 1 2 3)) (drop (list 5 6 7 8) 2)) => (7 8) *) + run "7.HOST: map[cb→helper perform]→drop" (fun () -> + let genv = Sx_types.make_env () in + let helper = Lambda { + l_params = []; l_body = List [Symbol "perform"; req_dict ()]; + l_closure = genv; l_name = Some "kvread"; + l_compiled = Some Sx_vm.jit_failed_sentinel; l_call_count = 0; + l_uid = Sx_types.next_lambda_uid () } in + let env = Hashtbl.create 64 in + (* cb(e): push helper, CALL 0, POP, LOCAL_GET 0, RETURN *) + let cb_code = { + vc_arity = 1; vc_rest_arity = -1; vc_locals = 1; + vc_bytecode = Array.of_list (_const 0 @ _call 0 @ _pop @ [16;0] @ _return); + vc_constants = [| helper |]; vc_bytecode_list=None; vc_constants_list=None } in + let cb_cl = mk_cl ~name:"cb7" ~env cb_code in + let consts = [ VmClosure cb_cl; List [Number 1.; Number 2.; Number 3.]; String "map"; + List [Number 5.; Number 6.; Number 7.; Number 8.]; Number 2.; String "drop" ] in + (* push cb, push list, CALL_PRIM map 2, POP, push list2, push 2, CALL_PRIM drop 2, RETURN *) + let bc = _const 0 @ _const 1 @ _call_prim 2 2 @ _pop + @ _const 3 @ _const 4 @ _call_prim 5 2 @ _return in + drive (mk_cl ~name:"outer7" ~env (mk_code ~locals:0 ~bc ~consts))); + + (* Case 8: reduce whose callback performs. (reduce + 0 (list 1 2 3)) with a + perform in the reducer => 6 *) + run "8.reduce[acc→perform]" (fun () -> + let env = Hashtbl.create 64 in + (* reducer(acc e): (do (perform {..}) (+ acc e)). slots: 0=acc 1=e *) + let rd_code = { + vc_arity = 2; vc_rest_arity = -1; vc_locals = 2; + vc_bytecode = Array.of_list (_const 0 @ _perform @ _pop + @ [16;0] @ [16;1] @ _call_prim 1 2 @ _return); + vc_constants = [| req_dict (); String "+" |]; + vc_bytecode_list=None; vc_constants_list=None } in + let rd_cl = mk_cl ~name:"rd" ~env rd_code in + let consts = [ VmClosure rd_cl; Number 0.; List [Number 1.; Number 2.; Number 3.]; String "reduce" ] in + (* push reducer, push 0, push list, CALL_PRIM reduce 3, RETURN *) + let bc = _const 0 @ _const 1 @ _const 2 @ _call_prim 3 3 @ _return in + drive (mk_cl ~name:"outer8" ~env (mk_code ~locals:0 ~bc ~consts))); + + (* Case 9: nested map — outer map callback runs an inner map whose callback + performs. outer over (list 1 2), inner over (list 10 20) performing. + cb_outer(x): (map cb_inner (list 10 20)) ; cb_inner(y): (do (perform) y) + => ((10 20) (10 20)) *) + run "9.nested map[inner→perform]" (fun () -> + let env = Hashtbl.create 64 in + let inner_code = { + vc_arity = 1; vc_rest_arity = -1; vc_locals = 1; + vc_bytecode = Array.of_list (_const 0 @ _perform @ _pop @ [16;0] @ _return); + vc_constants = [| req_dict () |]; vc_bytecode_list=None; vc_constants_list=None } in + let inner_cl = mk_cl ~name:"cbin" ~env inner_code in + (* outer cb(x): push inner_cl, push (10 20), CALL_PRIM map 2, RETURN *) + let outer_cb_code = { + vc_arity = 1; vc_rest_arity = -1; vc_locals = 1; + vc_bytecode = Array.of_list (_const 0 @ _const 1 @ _call_prim 2 2 @ _return); + vc_constants = [| VmClosure inner_cl; List [Number 10.; Number 20.]; String "map" |]; + vc_bytecode_list=None; vc_constants_list=None } in + let outer_cb_cl = mk_cl ~name:"cbout" ~env outer_cb_code in + let consts = [ VmClosure outer_cb_cl; List [Number 1.; Number 2.]; String "map" ] in + let bc = _const 0 @ _const 1 @ _call_prim 2 2 @ _return in + drive (mk_cl ~name:"outer9" ~env (mk_code ~locals:0 ~bc ~consts))) diff --git a/hosts/ocaml/bin/sx_server.ml b/hosts/ocaml/bin/sx_server.ml index 1547ab1a..85e56908 100644 --- a/hosts/ocaml/bin/sx_server.ml +++ b/hosts/ocaml/bin/sx_server.ml @@ -1216,7 +1216,23 @@ let register_jit_hook env = let rec resolve_loop req vm = let result = resolver req (Nil) in (try Some (Sx_vm.resume_vm vm result) - with Sx_vm.VmSuspended (req2, vm2) -> resolve_loop req2 vm2) + with + | Sx_vm.VmSuspended (req2, vm2) -> resolve_loop req2 vm2 + | e -> + (* (B) Resume raised mid-execution. resolve_loop runs inside + the VmSuspended handler, so without catching here the + error escapes to the http handler (→ 500). Recover THIS + call on the CEK instead: mark jit_failed and return None + so the interpreter re-runs it (idempotent for the host's + durable reads). Self-heals on the first hit, not a retry. *) + let fn_name = match l.l_name with Some n -> n | None -> "?" in + if not (Hashtbl.mem _jit_warned fn_name) then begin + Hashtbl.replace _jit_warned fn_name true; + Printf.eprintf "[jit] %s resume fallback to CEK: %s\n%!" + fn_name (Printexc.to_string e) + end; + l.l_compiled <- Some Sx_vm.jit_failed_sentinel; + None) in resolve_loop request saved_vm | None -> Some (make_vm_suspend_marker request saved_vm)) @@ -1249,7 +1265,16 @@ let register_jit_hook env = let rec resolve_loop req vm = let result = resolver req (Nil) in (try Some (Sx_vm.resume_vm vm result) - with Sx_vm.VmSuspended (req2, vm2) -> resolve_loop req2 vm2) + with + | Sx_vm.VmSuspended (req2, vm2) -> resolve_loop req2 vm2 + | e -> + (* (B) See note above — recover a failed resume on the + CEK instead of escaping to the handler (→ 500). *) + Printf.eprintf "[jit] %s resume fallback to CEK: %s\n%!" + fn_name (Printexc.to_string e); + Hashtbl.replace _jit_warned fn_name true; + l.l_compiled <- Some Sx_vm.jit_failed_sentinel; + None) in resolve_loop request saved_vm | None -> Some (make_vm_suspend_marker request saved_vm)) diff --git a/hosts/ocaml/lib/sx_vm.ml b/hosts/ocaml/lib/sx_vm.ml index 3ba4529e..4763acae 100644 --- a/hosts/ocaml/lib/sx_vm.ml +++ b/hosts/ocaml/lib/sx_vm.ml @@ -336,30 +336,51 @@ and call_closure_reuse cl args = push_closure_frame vm cl args; let saved_frames = List.tl vm.frames in vm.frames <- [List.hd vm.frames]; - (try run vm - with - | VmSuspended _ as e -> - (* IO suspension: save the caller's continuation on the reuse stack. - DON'T merge frames — that corrupts the frame chain with nested - closures. On resume, restore_reuse in resume_vm processes these - in innermost-first order after the callback finishes. *) - vm.reuse_stack <- (saved_frames, saved_sp) :: vm.reuse_stack; - raise e - | e -> - vm.frames <- saved_frames; - vm.sp <- saved_sp; - raise e); - vm.frames <- saved_frames; - (* Snapshot/restore sp around the popped result. - OP_RETURN normally leaves sp = saved_sp + 1, but the bytecode-exhausted - path (or a callee that returns a closure whose own RETURN leaves extra - stack residue) can leave sp inconsistent. Read the result at the - expected slot and reset sp explicitly so the parent frame's - intermediate values are not corrupted. *) let result = - if vm.sp > saved_sp then vm.stack.(vm.sp - 1) - else Nil + (try run vm; + (* Normal completion: result sits at the top of the stack. + OP_RETURN normally leaves sp = saved_sp + 1, but the + bytecode-exhausted path (or a callee that returns a closure whose + own RETURN leaves extra stack residue) can leave sp inconsistent. + Read the result at the expected slot. *) + if vm.sp > saved_sp then vm.stack.(vm.sp - 1) else Nil + with + | VmSuspended (req, _) as e -> + (match !Sx_types._cek_io_resolver with + | Some resolver -> + (* Serving path: a `perform` fired inside this HO-primitive + callback (map/filter/reduce/for-each/…). The primitive's native + OCaml loop sits between us and the resume point, so we CANNOT + unwind it and resume later (the loop state would be lost and the + remaining elements dropped — corrupting the stack so the next + CALL_PRIM sees wrong args). Instead resolve the callback's IO + inline and run it to completion right here, returning its value + to the native loop exactly as a non-suspending callback would. + reuse_stack is isolated so an outer suspension's saved + continuations aren't consumed by this nested resume. *) + let saved_reuse = vm.reuse_stack in + vm.reuse_stack <- []; + let rec settle req = + let r = resolver req Nil in + (try resume_vm vm r + with VmSuspended (req2, _) -> settle req2) + in + let cb = settle req in + vm.reuse_stack <- saved_reuse; + cb + | None -> + (* CEK-driven path (no synchronous resolver): preserve the existing + behaviour — save the caller's continuation on the reuse stack and + re-raise so resume_vm restores it after the callback finishes. + DON'T merge frames — that corrupts the frame chain. *) + vm.reuse_stack <- (saved_frames, saved_sp) :: vm.reuse_stack; + raise e) + | e -> + vm.frames <- saved_frames; + vm.sp <- saved_sp; + raise e) in + vm.frames <- saved_frames; vm.sp <- saved_sp; result | None -> @@ -938,7 +959,17 @@ and run vm = After the callback finishes, restores any call_closure_reuse continuations saved on vm.reuse_stack (innermost first). *) -let resume_vm vm result = +and resume_vm vm result = + (* The resumed execution runs on [vm]; HO primitives (map/filter/…) called + during the resume reach for [!_active_vm] to run their callbacks on the + same stack. call_closure restored [_active_vm] to the *caller* when the + original VmSuspended unwound through it, so without re-asserting it here + the resumed run's callbacks land on the wrong VM (or allocate a fresh + one), corrupting the stack. Mirror call_closure's save/set/restore. *) + let prev_active = !_active_vm in + _active_vm := Some vm; + let restore () = _active_vm := prev_active in + (try (match vm.pending_cek with | Some cek_state -> vm.pending_cek <- None; @@ -1010,7 +1041,9 @@ let resume_vm vm result = let pending = List.rev vm.reuse_stack in vm.reuse_stack <- []; restore_reuse pending; - pop vm + let r = pop vm in + restore (); r + with e -> restore (); raise e) (** Execute a compiled module (top-level bytecode). *) let execute_module code globals =