From 076b8ae7f7219d6a7701a0d50be51259eea34a20 Mon Sep 17 00:00:00 2001 From: giles Date: Fri, 5 Jun 2026 06:56:31 +0000 Subject: [PATCH] =?UTF-8?q?fed-sx-m1:=20Step=203b=20codec=20=E2=80=94=20ne?= =?UTF-8?q?xt/kernel/term=5Fcodec.erl=20encode/decode=20+=2018=20round-tri?= =?UTF-8?q?p=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- next/kernel/term_codec.erl | 105 +++++++++++++++++++++++ next/tests/term_codec.sh | 160 ++++++++++++++++++++++++++++++++++++ plans/fed-sx-milestone-1.md | 3 +- 3 files changed, 267 insertions(+), 1 deletion(-) create mode 100644 next/kernel/term_codec.erl create mode 100755 next/tests/term_codec.sh diff --git a/next/kernel/term_codec.erl b/next/kernel/term_codec.erl new file mode 100644 index 00000000..03f74d02 --- /dev/null +++ b/next/kernel/term_codec.erl @@ -0,0 +1,105 @@ +-module(term_codec). +-export([encode/1, decode/1]). + +%% Erlang-side term <-> binary codec, built on the substrate fixes from +%% commits 24e3bf53 (binary_to_list / list_to_binary), 3d80bd8c ($X char +%% literals), 4852cca9 (atom_to_list / integer_to_list charlists). +%% +%% Wire format (netstring-ish; all length headers ASCII decimal): +%% +%% atom $a Len $: NameBytes +%% integer $i Len $: DecimalBytes (negative ints carry leading $-) +%% binary $b Len $: RawBytes +%% tuple $t Count $: Enc1 Enc2 ... Encn +%% list $l Count $: Enc1 Enc2 ... Encn (proper list) +%% nil $l $0 $: (empty list) +%% +%% Each Enc is itself one of these forms — recursive. The format is +%% byte-clean: binary bodies may contain any byte (newlines, NULs, etc.), +%% so callers can frame entries with a 4-byte big-endian length prefix +%% (Step 3b on-disk segment writer's job). 
+
+%% encode/1: term -> binary
+%% Handles atoms, integers, binaries, tuples and lists; container elements
+%% are encoded recursively. No clause matches any other kind of term.
+encode(T) when is_atom(T) ->
+    Cs = atom_to_list(T),
+    frame($a, length(Cs), Cs);
+encode(T) when is_integer(T) ->
+    %% integer_to_list/1 keeps the leading $- of a negative number, so the
+    %% sign is counted in the body length.
+    Cs = integer_to_list(T),
+    frame($i, length(Cs), Cs);
+encode(T) when is_binary(T) ->
+    frame($b, byte_size(T), T);
+encode(T) when is_tuple(T) ->
+    Es = tuple_to_list(T),
+    frame($t, length(Es), [encode(E) || E <- Es]);
+encode([]) ->
+    frame($l, 0, []);
+encode(T) when is_list(T) ->
+    frame($l, length(T), [encode(E) || E <- T]).
+
+%% frame/3: build one wire frame: Tag byte, decimal Size, $:, then Body.
+%% Body may be a charlist, a binary, or a list of already-encoded binaries;
+%% list_to_binary/1 flattens it.
+frame(Tag, Size, Body) ->
+    list_to_binary([Tag, integer_to_list(Size), $:, Body]).
+
+%% decode/1: binary -> {ok, Term, RestBinary} | {error, badform}
+%% On success returns the remaining unconsumed bytes so callers can
+%% stream-decode multiple frames from one buffer.
+%% The helpers report truncated or malformed frames (missing length or $:,
+%% body shorter than its header, bad nested element) by raising
+%% {badform, _}, and list_to_integer/1 raises badarg on a non-numeric
+%% integer body. Both are converted here so that every malformed input
+%% yields {error, badform}, not just an unknown tag byte.
+decode(B) when is_binary(B) ->
+    try
+        decode_chars(binary_to_list(B))
+    catch
+        error:{badform, _} -> {error, badform};
+        error:badarg -> {error, badform}
+    end.
+
+%% decode_chars/1: charlist form of decode/1. Dispatches on the tag byte and
+%% hands the unconsumed remainder back as a binary. Returns {error, badform}
+%% for an unknown tag; deeper problems surface as exceptions from the
+%% helpers (decode/1 converts them).
+decode_chars([$a | Rest]) ->
+    {Name, Tail} = take_body(Rest),
+    %% NOTE(review): list_to_atom/1 creates atoms from wire data. If frames
+    %% can come from an untrusted source, consider list_to_existing_atom/1
+    %% so a hostile stream cannot grow the atom table without bound.
+    {ok, list_to_atom(Name), list_to_binary(Tail)};
+decode_chars([$i | Rest]) ->
+    {Digits, Tail} = take_body(Rest),
+    {ok, list_to_integer(Digits), list_to_binary(Tail)};
+decode_chars([$b | Rest]) ->
+    {Bytes, Tail} = take_body(Rest),
+    {ok, list_to_binary(Bytes), list_to_binary(Tail)};
+decode_chars([$t | Rest]) ->
+    {Elems, Tail} = take_elems(Rest),
+    {ok, list_to_tuple(Elems), list_to_binary(Tail)};
+decode_chars([$l | Rest]) ->
+    {Elems, Tail} = take_elems(Rest),
+    {ok, Elems, list_to_binary(Tail)};
+decode_chars(_) ->
+    {error, badform}.
+
+%% read_header/1: parse "<digits>:" -> {N, RestChars}.
+%% At least one digit is required: read_len/2 on its own returns 0 when no
+%% digit is present, which would let "a:" decode as an empty atom.
+read_header([D | T]) when D >= $0, D =< $9 ->
+    {N, Rest} = read_len([D | T], 0),
+    {N, strip_colon(Rest)};
+read_header(Other) ->
+    erlang:error({badform, Other}).
+
+%% take_body/1: header followed by Len raw bytes -> {BodyChars, RestChars}.
+take_body(Chars) ->
+    {Len, Rest} = read_header(Chars),
+    split_at(Len, Rest).
+
+%% take_elems/1: header followed by Count nested terms -> {Terms, RestChars}.
+take_elems(Chars) ->
+    {Count, Rest} = read_header(Chars),
+    decode_n(Count, Rest, []).
+
+%% read_len/2: fold leading ASCII digits into Acc -> {Value, RestChars}.
+%% Any other byte (including a stray $-) ends the number and stays in Rest.
+read_len([C | Rest], Acc) when C >= $0, C =< $9 ->
+    read_len(Rest, Acc * 10 + (C - $0));
+read_len(Rest, Acc) ->
+    {Acc, Rest}.
+
+%% strip_colon/1: drop the $: ending a header, else raise {badform, Chars}.
+strip_colon([$: | Rest]) -> Rest;
+strip_colon(Other) -> erlang:error({badform, Other}).
+
+%% split_at/2: {FirstN, Rest}; raises {badform, short} if fewer than N remain.
+split_at(N, Chars) -> split_acc(N, Chars, []).
+
+split_acc(0, Rest, Acc) -> {lists:reverse(Acc), Rest};
+split_acc(N, [H | T], Acc) -> split_acc(N - 1, T, [H | Acc]);
+split_acc(_, [], _) -> erlang:error({badform, short}).
+
+%% decode_n/3: decode N consecutive terms -> {TermsInOrder, RestChars}.
+decode_n(0, Rest, Acc) ->
+    {lists:reverse(Acc), Rest};
+decode_n(N, Bytes, Acc) ->
+    {Term, Rest} = decode_one(Bytes),
+    decode_n(N - 1, Rest, [Term | Acc]).
+
+%% decode_one/1: one term off a charlist; {error, R} becomes {badform, R}.
+decode_one(Bytes) ->
+    case decode_chars(Bytes) of
+        {ok, Term, RestBin} -> {Term, binary_to_list(RestBin)};
+        {error, R} -> erlang:error({badform, R})
+    end.
diff --git a/next/tests/term_codec.sh b/next/tests/term_codec.sh
new file mode 100755
index 00000000..d9bcac22
--- /dev/null
+++ b/next/tests/term_codec.sh
@@ -0,0 +1,160 @@
+#!/usr/bin/env bash
+# next/tests/term_codec.sh — Step 3b term codec acceptance test.
+#
+# Exercises encode/1 + decode/1 for atoms, integers, binaries, tuples,
+# lists, nesting, and round-trip equivalence. Built on the substrate-fix
+# trio: binary_to_list/list_to_binary (24e3bf53), $X literals (3d80bd8c),
+# atom_to_list/integer_to_list charlists (4852cca9).
+
+set -uo pipefail
+cd "$(git rev-parse --show-toplevel)"
+
+SX_SERVER="${SX_SERVER:-hosts/ocaml/_build/default/bin/sx_server.exe}"
+if [ ! -x "$SX_SERVER" ]; then
+  SX_SERVER="/root/rose-ash/hosts/ocaml/_build/default/bin/sx_server.exe"
+fi
+if [ ! -x "$SX_SERVER" ]; then
+  echo "ERROR: sx_server.exe not found." >&2
+  exit 1
+fi
+
+VERBOSE="${1:-}"
+PASS=0; FAIL=0; ERRORS=""
+TMPFILE=$(mktemp); trap 'rm -f "$TMPFILE"' EXIT  # single quotes: expand (quoted) when the trap fires
+
+cat > "$TMPFILE" <<'EPOCHS'
+(epoch 1)
+(load "lib/erlang/tokenizer.sx")
+(load "lib/erlang/parser.sx")
+(load "lib/erlang/parser-core.sx")
+(load "lib/erlang/parser-expr.sx")
+(load "lib/erlang/parser-module.sx")
+(load "lib/erlang/transpile.sx")
+(load "lib/erlang/runtime.sx")
+(load "lib/erlang/vm/dispatcher.sx")
+
+(epoch 2)
+(eval "(get (erlang-load-module (file-read \"next/kernel/term_codec.erl\")) :name)")
+
+;; --- encode produces correct headers ---
+
+;; atom 'ok' -> bytes "a2:ok"
+(epoch 10)
+(eval "(get (erlang-eval-ast \"term_codec:encode(ok) =:= <<97, 50, 58, 111, 107>>\") :name)")
+
+;; integer 42 -> "i2:42"
+(epoch 11)
+(eval "(get (erlang-eval-ast \"term_codec:encode(42) =:= <<105, 50, 58, 52, 50>>\") :name)")
+
+;; negative integer -99 -> "i3:-99"
+(epoch 12)
+(eval "(get (erlang-eval-ast \"term_codec:encode(-99) =:= <<105, 51, 58, 45, 57, 57>>\") :name)")
+
+;; binary <<1,2,3>> -> "b3:" + 1,2,3
+(epoch 13)
+(eval "(get (erlang-eval-ast \"term_codec:encode(<<1, 2, 3>>) =:= <<98, 51, 58, 1, 2, 3>>\") :name)")
+
+;; empty list -> "l0:"
+(epoch 14)
+(eval "(get (erlang-eval-ast \"term_codec:encode([]) =:= <<108, 48, 58>>\") :name)")
+
+;; tuple {a, b} -> "t2:" + enc(a) + enc(b) = "t2:a1:aa1:b"
+(epoch 15)
+(eval "(get (erlang-eval-ast \"term_codec:encode({a, b}) =:= <<116, 50, 58, 97, 49, 58, 97, 97, 49, 58, 98>>\") :name)")
+
+;; --- round-trip: encode then decode returns original term ---
+
+(epoch 20)
+(eval "(get (erlang-eval-ast \"{ok, T, _} = term_codec:decode(term_codec:encode(ok)), T =:= ok\") :name)")
+
+(epoch 21)
+(eval "(get (erlang-eval-ast \"{ok, T, _} = term_codec:decode(term_codec:encode(42)), T =:= 42\") :name)")
+
+(epoch 22)
+(eval "(get (erlang-eval-ast \"{ok, T, _} = term_codec:decode(term_codec:encode(-99)), T =:= -99\") :name)")
+
+(epoch 23)
+(eval "(get (erlang-eval-ast \"{ok, T, _} = term_codec:decode(term_codec:encode(<<1, 2, 3, 4, 5>>)), T =:= <<1, 2, 3, 4, 5>>\") :name)")
+
+(epoch 24)
+(eval "(get (erlang-eval-ast \"{ok, T, _} = term_codec:decode(term_codec:encode([])), T =:= []\") :name)")
+
+(epoch 25)
+(eval "(get (erlang-eval-ast \"{ok, T, _} = term_codec:decode(term_codec:encode({a, b, c})), T =:= {a, b, c}\") :name)")
+
+(epoch 26)
+(eval "(get (erlang-eval-ast \"{ok, T, _} = term_codec:decode(term_codec:encode([1, 2, 3])), T =:= [1, 2, 3]\") :name)")
+
+;; --- nested: activity-shaped term (atoms, ints, binaries, nested tuple+list) ---
+
+(epoch 30)
+(eval "(get (erlang-eval-ast \"Act = {create, [{id, 1}, {actor, alice}, {payload, <<104, 105>>}]}, {ok, T, _} = term_codec:decode(term_codec:encode(Act)), T =:= Act\") :name)")
+
+;; --- decode returns remainder so multiple frames can be streamed ---
+
+(epoch 31)
+(eval "(get (erlang-eval-ast \"E1 = term_codec:encode(foo), E2 = term_codec:encode(42), Both = list_to_binary([E1, E2]), {ok, T1, Rest} = term_codec:decode(Both), {ok, T2, _} = term_codec:decode(Rest), {T1, T2} =:= {foo, 42}\") :name)")
+
+;; --- binary content with embedded zero / newline bytes round-trips ---
+
+(epoch 32)
+(eval "(get (erlang-eval-ast \"B = <<0, 10, 0, 10, 0>>, {ok, T, _} = term_codec:decode(term_codec:encode(B)), T =:= B\") :name)")
+
+;; --- bad form returns {error, _} not a crash (compared in Erlang so an "(error 40 ...)" crash line cannot pass) ---
+
+(epoch 40)
+(eval "(get (erlang-eval-ast \"element(1, term_codec:decode(<<122, 122, 122>>)) =:= error\") :name)")
+EPOCHS
+
+OUTPUT=$(timeout 60 "$SX_SERVER" < "$TMPFILE" 2>/dev/null)
+
+check() {  # check EPOCH DESC EXPECTED: pass when the epoch's result line contains EXPECTED
+  local epoch="$1" desc="$2" expected="$3"
+  local actual
+  actual=$(echo "$OUTPUT" | grep -A1 "^(ok-len $epoch " | tail -1 || true)  # value is on the line after (ok-len N ...)
+  if echo "$actual" | grep -q "^(ok-len"; then actual=""; fi  # header was the last line: no value followed
+  if [ -z "$actual" ]; then
+    actual=$(echo "$OUTPUT" | grep "^(ok $epoch " | head -1 || true)
+  fi
+  if [ -z "$actual" ]; then
+    actual=$(echo "$OUTPUT" | grep "^(error $epoch " | head -1 || true)  # kept so failures show the server error
+  fi
+  [ -z "$actual" ] && actual="<no output for epoch $epoch>"
+
+  if echo "$actual" | grep -qF -- "$expected"; then
+    PASS=$((PASS+1))
+    [ "$VERBOSE" = "-v" ] && echo " ok $desc"
+  else
+    FAIL=$((FAIL+1))
+    ERRORS+=" FAIL [$desc] (epoch $epoch) expected: $expected | actual: $actual
+"
+  fi
+}
+
+check 2 "module loads" "term_codec"
+check 10 "encode atom" "true"
+check 11 "encode int" "true"
+check 12 "encode neg int" "true"
+check 13 "encode binary" "true"
+check 14 "encode []" "true"
+check 15 "encode tuple" "true"
+check 20 "round-trip atom" "true"
+check 21 "round-trip int" "true"
+check 22 "round-trip neg int" "true"
+check 23 "round-trip binary" "true"
+check 24 "round-trip []" "true"
+check 25 "round-trip tuple" "true"
+check 26 "round-trip list" "true"
+check 30 "round-trip nested activity" "true"
+check 31 "streaming two frames" "true"
+check 32 "binary w/ embedded NUL+LF" "true"
+check 40 "bad form -> error tag" "true"
+
+TOTAL=$((PASS+FAIL))
+if [ $FAIL -eq 0 ]; then
+  echo "ok $PASS/$TOTAL term_codec tests passed"
+else
+  echo "FAIL $PASS/$TOTAL passed, $FAIL failed:"
+  echo "$ERRORS"
+fi
+[ $FAIL -eq 0 ]
diff --git a/plans/fed-sx-milestone-1.md b/plans/fed-sx-milestone-1.md index e2d057a1..1e59531b 100644 --- a/plans/fed-sx-milestone-1.md +++ b/plans/fed-sx-milestone-1.md @@ -197,7 +197,7 @@ verify_signature(Activity, ActorState) -> **Sub-deliverables:** - [x] **3a** — `log:open/2` + `log:append/2` + `log:tip/1` + `log:replay/3` + `log:entries/1` over an in-memory log state (per-actor seq; replay in append order; round-trip the stored activity). `next/tests/log_memory.sh` (12 cases). -- [ ] **3b** — *Parked behind substrate gap (see Blockers below).* Term codec + on-disk persistence: serializer/parser writing each activity as a JSONL-style line; restart-resumes-tip from the segment file.
+- [~] **3b** — Term codec landed (`next/kernel/term_codec.erl`): `encode/1`/`decode/1` for atoms, integers, binaries, tuples, lists, nesting; netstring-ish framing (`a/i/b/t/l` tag + length + body); byte-clean (binary bodies may contain NUL/LF). 18 round-trip + streaming + bad-form tests in `next/tests/term_codec.sh`. On-disk segment writer (open/2 reads existing, append/2 writes-through, replay/3 reads from disk) is the next sub-step — codec is the load-bearing piece. - [ ] **3c** — Segment rotation at size threshold + gen_server-mediated concurrent appends. **Blockers (Step 3b) — byte-level path resolved 2026-06-04:** `binary_to_list/1` and `list_to_binary/1` are now registered Erlang BIFs in `lib/erlang/runtime.sx` (Step 3b substrate fix, +9 ffi tests, 738/738 conformance). `list_to_binary` is iolist-aware: accepts nested cons of integer bytes (0-255) and/or binaries; `binary_to_list` returns a proper Erlang charlist of integers. Round-trip verified: `list_to_binary(binary_to_list(B)) =:= B`. On-disk segment writer (3b) can now build segment bytes from `[Header, IoListPayload]` and reconstruct on read — option (c) of the original workaround menu is now cheap. `$X` char literals now decode correctly **as of 2026-06-04**: the Erlang tokenizer's `(= ch "$")` branch (`lib/erlang/tokenizer.sx`) now emits the decimal char code as the token value instead of the raw `$X` text (which `parse-number` couldn't decode → nil). Plain chars use `char->integer` of the first char; the standard escape table (`\n=10 \t=9 \r=13 \s=32 \b=8 \e=27 \f=12 \v=11 \d=127 \0=0 \\=92 \"=34 \'=39`) handles `$\X` forms. So `[$h, $i | T]` patterns and `list_to_binary([$f,$e,$d])` both work end-to-end. +12 eval tests, 750/750. Combined with 3b's `binary_to_list`/`list_to_binary`, Erlang code can now read/write byte sequences and string-shaped char lists fluently. 
**All three substrate gaps resolved as of 2026-06-05.** `atom_to_list/1` and `integer_to_list/1` now return Erlang charlists (cons of int char codes — standard Erlang semantics) via a new `er-string->charlist` helper in `transpile.sx`. `list_to_atom/1` and `list_to_integer/1` accept either charlists OR SX strings (back-compat via the existing `er-source-to-string` coercer). Composition works end-to-end: `list_to_binary(atom_to_list(hello)) =:= <<104,101,108,108,111>>` and `integer_to_list(N)` round-trips through `list_to_integer`. 5 existing eval tests rewritten to charlist semantics, 8 new charlist-aware tests added (759/759). The full term-codec primitive set — `binary_to_list`, `list_to_binary`, `$X`, `atom_to_list`, `integer_to_list` charlist semantics, plus existing `file:read_file`/`write_file`/`list_dir` — is now in place. @@ -1003,6 +1003,7 @@ A few things still under-specified; resolve as work begins. Newest first. One line per sub-deliverable commit. Erlang conformance gate (`bash lib/erlang/conformance.sh`) must remain 729/729 on every entry. +- **2026-06-05** — Step 3b codec landed: `next/kernel/term_codec.erl` with `encode/1` + `decode/1` over a netstring-ish wire format (`a` atom / `i` int / `b` binary / `t` tuple / `l` list, each as `tag + decimal-length + ":" + body`; nil = `l0:`). Byte-clean — binary bodies may contain NUL, LF, or any byte; encoding stays parseable. Built end-to-end on the three substrate fixes (binary_to_list/list_to_binary + $X + atom_to_list/integer_to_list charlists). `decode/1` returns `{ok, Term, RestBinary}` so callers can stream multiple frames from one buffer. 18 acceptance tests in `next/tests/term_codec.sh`: encode bytes for every leaf type, round-trip for each, nested activity-shaped term (`{create, [{id,1},{actor,alice},{payload,<<104,105>>}]}`), 2-frame streaming, binary with embedded NUL+LF, bad-form returns `{error, badform}` not crash. Erlang conformance **759/759** unchanged (codec is in `next/`, not lib/erlang/). 
Step 3b on-disk segment writer (the second half — open/append/replay reading/writing the actual segment file) is the natural next iteration: encode each activity with `term_codec`, frame with a 4-byte big-endian length prefix, append to disk. - **2026-06-05** — Step 3b substrate fix #3 (final): `atom_to_list/1` and `integer_to_list/1` now return Erlang charlists (cons-of-int-char-codes) instead of SX strings — standard Erlang semantics. New helper `er-string->charlist` in `transpile.sx`. `list_to_atom/1` and `list_to_integer/1` accept either charlists OR SX strings (back-compat via the existing `er-source-to-string` coercer, which already handles both shapes). 5 existing eval tests rewritten to match new semantics (e.g. `length(atom_to_list(hello)) =:= 5`, `hd(integer_to_list(42)) =:= 52`). 8 new charlist-coverage tests demonstrating composition: `list_to_binary(atom_to_list(ok)) =:= <<111,107>>`; `list_to_atom([$f,$o,$o])` round-trips; `list_to_integer([$1,$0,$0]) =:= 100`. Erlang conformance **759/759** (eval 397→406, +9 net). The full term-codec primitive set — `binary_to_list`/`list_to_binary` (24e3bf53), `$X` literals (3d80bd8c), and now `atom_to_list`/`integer_to_list` charlists — is in place; Step 3b on-disk segment writer can encode arbitrary Erlang activity terms (atoms, ints, binaries, tuples, lists) into byte sequences using only Erlang-native primitives. - **2026-06-04** — Step 3b substrate fix #2: `$X` char-literal decoding. Patched the Erlang tokenizer's `(= ch "$")` branch in `lib/erlang/tokenizer.sx` to emit the decimal char code as the integer token value instead of the raw `$X` source text (which `parse-number` couldn't decode → nil). Plain `$c` uses `char->integer` of the first char; `$\C` consults the standard Erlang escape table (`\n=10 \t=9 \r=13 \s=32 \b=8 \e=27 \f=12 \v=11 \d=127 \0=0 \\=92 \"=34 \'=39`). End-of-file after `$` decodes to 0 defensively. 
Probes: `$A→65`, `$0→48`, `$\n→10`, `$\\→92`, `[$h,$i]` → cons of 104/105, `list_to_binary([$f,$e,$d])` → `<<102,101,100>>`. +12 eval tests (single chars, each escape, list/binary composition with previous BIFs). Combined with substrate fix #1, Erlang code in fed-sx-m1 can now write `[$h, $i | T]` patterns AND construct/deconstruct binaries — a full term-codec primitive set. Erlang conformance **750/750** (eval 385→397). Plan Blockers note updated; remaining `atom_to_list`/`integer_to_list` charlist gap noted as low-priority for Milestone 1. - **2026-06-04** — Step 3b substrate fix: registered `erlang:binary_to_list/1` and `erlang:list_to_binary/1` in `lib/erlang/runtime.sx` — the byte-level half of the term-codec gap. `binary_to_list` returns a proper Erlang charlist (`er-mk-cons` chain of byte ints). `list_to_binary` is iolist-aware via a recursive `er-iolist-walk!` that accepts nil / cons / binary / integer 0-255 and flattens nested iolists (e.g. `[1, <<2,3>>, [4, [5]]]` → `<<1,2,3,4,5>>`); out-of-range bytes or non-iolist elements raise `error:badarg`. Round-trip verified: `list_to_binary(binary_to_list(B)) =:= B`. +9 ffi tests (length, hd, empty→[], flat byte_size, nested-iolist, round-trip, 3 badarg paths). On-disk segment writer (3b) now has a complete `[Header | IoListPayload] → Binary` path; the remaining two substrate gaps (`atom_to_list`/`integer_to_list` as Erlang charlists, `$X` char-literal decoding) are still parked but no longer block 3b implementation if the encoding uses byte ints directly. Erlang conformance **738/738** (ffi 28→37). Plan Blockers note for Step 3b updated to reflect the partial resolution.