From 559b0df900b40c8679e4616b782a48dedd7a47d0 Mon Sep 17 00:00:00 2001 From: giles Date: Wed, 6 May 2026 23:06:12 +0000 Subject: [PATCH] =?UTF-8?q?GUEST:=20step=203=20=E2=80=94=20lib/guest/lex.s?= =?UTF-8?q?x=20character-class=20+=20token=20primitives?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extracted shared tokeniser primitives: - Char-class predicates: lex-digit?, lex-hex-digit?, lex-alpha? (alias lex-letter?), lex-alnum?, lex-ident-start?, lex-ident-char?, lex-space? (no newline), lex-whitespace? (incl newline). All nil-safe. - Token record: lex-make-token, lex-make-token-spanning, accessors. Ported lib/lua/tokenizer.sx and lib/tcl/tokenizer.sx — 7 lua definitions (6 predicates plus make-token) and 5 tcl predicate definitions collapsed into prefix-rename calls that alias lua-/tcl- names to lex- primitives. Test scripts (lua/test.sh, tcl/test.sh, tcl/conformance.sh) load lib/guest/lex.sx and prefix.sx before the per-language tokenizer. Verification: - lua/test.sh: 185/185 = baseline - tcl/test.sh: 342/342 (parse 67 + eval 169 + error 39 + namespace 22 + coro 20 + idiom 25) - tcl/conformance.sh: 3/4 = baseline (event-loop failure is pre-existing) Two consumers verified — step complete. Co-Authored-By: Claude Opus 4.7 (1M context) --- lib/guest/lex.sx | 67 ++++++++++++++++++++++++++++++++++++++++++ lib/lua/test.sh | 2 ++ lib/lua/tokenizer.sx | 35 +++++----------------- lib/tcl/conformance.sh | 2 ++ lib/tcl/test.sh | 2 ++ lib/tcl/tokenizer.sx | 21 ++++--------- 6 files changed, 87 insertions(+), 42 deletions(-) create mode 100644 lib/guest/lex.sx diff --git a/lib/guest/lex.sx b/lib/guest/lex.sx new file mode 100644 index 00000000..5894dffa --- /dev/null +++ b/lib/guest/lex.sx @@ -0,0 +1,67 @@ +;; lib/guest/lex.sx — character-class predicates and token primitives shared +;; across guest tokenisers. +;; +;; All predicates are nil-safe — they accept nil (end-of-input) and return +;; false. 
This matches the convention used by the existing per-language +;; tokenisers (cur returns nil at EOF). +;; +;; Char classes +;; ------------ +;; lex-digit? — 0-9 +;; lex-hex-digit? — 0-9, a-f, A-F +;; lex-alpha? — a-z, A-Z (alias: lex-letter?) +;; lex-alnum? — alpha or digit +;; lex-ident-start? — alpha or underscore +;; lex-ident-char? — ident-start or digit +;; lex-space? — " ", "\t", "\r" (no newline) +;; lex-whitespace? — " ", "\t", "\r", "\n" (includes newline) +;; +;; Token record +;; ------------ +;; (lex-make-token TYPE VALUE POS) — {:type :value :pos} +;; (lex-make-token-spanning TYPE VALUE POS END) +;; — {:type :value :pos :end} +;; (lex-token-type TOK) +;; (lex-token-value TOK) +;; (lex-token-pos TOK) + +(define lex-digit? (fn (c) (and (not (= c nil)) (>= c "0") (<= c "9")))) + +(define + lex-hex-digit? + (fn + (c) + (and + (not (= c nil)) + (or + (lex-digit? c) + (and (>= c "a") (<= c "f")) + (and (>= c "A") (<= c "F")))))) + +(define + lex-alpha? + (fn + (c) + (and + (not (= c nil)) + (or (and (>= c "a") (<= c "z")) (and (>= c "A") (<= c "Z")))))) + +(define lex-letter? lex-alpha?) + +(define lex-alnum? (fn (c) (or (lex-alpha? c) (lex-digit? c)))) + +(define lex-ident-start? (fn (c) (or (lex-alpha? c) (= c "_")))) + +(define lex-ident-char? (fn (c) (or (lex-ident-start? c) (lex-digit? c)))) + +(define lex-space? (fn (c) (or (= c " ") (= c "\t") (= c "\r")))) + +(define lex-whitespace? (fn (c) (or (lex-space? 
c) (= c "\n")))) + +(define lex-make-token (fn (type value pos) {:pos pos :value value :type type})) + +(define lex-make-token-spanning (fn (type value pos end) {:pos pos :end end :value value :type type})) + +(define lex-token-type (fn (tok) (get tok :type))) +(define lex-token-value (fn (tok) (get tok :value))) +(define lex-token-pos (fn (tok) (get tok :pos))) diff --git a/lib/lua/test.sh b/lib/lua/test.sh index 719f3750..13dabffc 100755 --- a/lib/lua/test.sh +++ b/lib/lua/test.sh @@ -28,6 +28,8 @@ trap "rm -f $TMPFILE" EXIT cat > "$TMPFILE" << 'EPOCHS' (epoch 1) +(load "lib/guest/lex.sx") +(load "lib/guest/prefix.sx") (load "lib/lua/tokenizer.sx") (epoch 2) (load "lib/lua/parser.sx") diff --git a/lib/lua/tokenizer.sx b/lib/lua/tokenizer.sx index 6a09788d..32512705 100644 --- a/lib/lua/tokenizer.sx +++ b/lib/lua/tokenizer.sx @@ -1,31 +1,12 @@ -(define lua-make-token (fn (type value pos) {:pos pos :value value :type type})) +(prefix-rename "lua-" + '((make-token lex-make-token) + (digit? lex-digit?) + (hex-digit? lex-hex-digit?) + (letter? lex-alpha?) + (ident-start? lex-ident-start?) + (ident-char? lex-ident-char?) + (ws? lex-whitespace?))) -(define lua-digit? (fn (c) (and (not (= c nil)) (>= c "0") (<= c "9")))) - -(define - lua-hex-digit? - (fn - (c) - (and - (not (= c nil)) - (or - (lua-digit? c) - (and (>= c "a") (<= c "f")) - (and (>= c "A") (<= c "F")))))) - -(define - lua-letter? - (fn - (c) - (and - (not (= c nil)) - (or (and (>= c "a") (<= c "z")) (and (>= c "A") (<= c "Z")))))) - -(define lua-ident-start? (fn (c) (or (lua-letter? c) (= c "_")))) - -(define lua-ident-char? (fn (c) (or (lua-ident-start? c) (lua-digit? c)))) - -(define lua-ws? 
(fn (c) (or (= c " ") (= c "\t") (= c "\n") (= c "\r")))) (define lua-keywords diff --git a/lib/tcl/conformance.sh b/lib/tcl/conformance.sh index 50d0f8d0..23ce41fb 100755 --- a/lib/tcl/conformance.sh +++ b/lib/tcl/conformance.sh @@ -63,6 +63,8 @@ for tcl_file in "${TCL_FILES[@]}"; do # Build epoch input using quoted heredoc for static parts; helper path via variable cat > "$tmpfile" << EPOCHS (epoch 1) +(load "lib/guest/lex.sx") +(load "lib/guest/prefix.sx") (load "lib/tcl/tokenizer.sx") (epoch 2) (load "lib/tcl/parser.sx") diff --git a/lib/tcl/test.sh b/lib/tcl/test.sh index 445db51d..8b85edde 100755 --- a/lib/tcl/test.sh +++ b/lib/tcl/test.sh @@ -33,6 +33,8 @@ HELPER_EOF cat > "$TMPFILE" << EPOCHS (epoch 1) +(load "lib/guest/lex.sx") +(load "lib/guest/prefix.sx") (load "lib/tcl/tokenizer.sx") (epoch 2) (load "lib/tcl/parser.sx") diff --git a/lib/tcl/tokenizer.sx b/lib/tcl/tokenizer.sx index 6ad455ac..bc094ff3 100644 --- a/lib/tcl/tokenizer.sx +++ b/lib/tcl/tokenizer.sx @@ -1,19 +1,10 @@ -(define tcl-ws? (fn (c) (or (= c " ") (= c "\t") (= c "\r")))) +(prefix-rename "tcl-" + '((ws? lex-space?) + (alpha? lex-alpha?) + (digit? lex-digit?) + (ident-start? lex-ident-start?) + (ident-char? lex-ident-char?))) -(define tcl-alpha? - (fn (c) - (and - (not (= c nil)) - (or (and (>= c "a") (<= c "z")) (and (>= c "A") (<= c "Z")))))) - -(define tcl-digit? - (fn (c) (and (not (= c nil)) (>= c "0") (<= c "9")))) - -(define tcl-ident-start? - (fn (c) (or (tcl-alpha? c) (= c "_")))) - -(define tcl-ident-char? - (fn (c) (or (tcl-ident-start? c) (tcl-digit? c)))) (define tcl-tokenize (fn (src)