GUEST: step 3 — lib/guest/lex.sx character-class + token primitives
Extracted shared tokeniser primitives:
- Char-class predicates: lex-digit?, lex-hex-digit?, lex-alpha?
(alias lex-letter?), lex-alnum?, lex-ident-start?, lex-ident-char?,
lex-space? (no newline), lex-whitespace? (incl newline). All nil-safe.
- Token record: lex-make-token, lex-make-token-spanning, accessors.
Ported lib/lua/tokenizer.sx and lib/tcl/tokenizer.sx — 7 lua and 5 tcl
predicate definitions collapsed into prefix-rename calls that alias
lua-/tcl- names to lex- primitives. Test scripts (lua/test.sh,
tcl/test.sh, tcl/conformance.sh) load lib/guest/lex.sx and prefix.sx
before the per-language tokenizer.
Verification:
- lua/test.sh: 185/185 = baseline
- tcl/test.sh: 342/342 (parse 67 + eval 169 + error 39 + namespace 22
+ coro 20 + idiom 25)
- tcl/conformance.sh: 3/4 = baseline (event-loop failure is pre-existing)
Two consumers verified — step complete.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
67
lib/guest/lex.sx
Normal file
67
lib/guest/lex.sx
Normal file
@@ -0,0 +1,67 @@
;; lib/guest/lex.sx — character-class predicates and token primitives shared
;; across guest tokenisers.
;;
;; All predicates are nil-safe — they accept nil (end-of-input) and return
;; false. This matches the convention used by the existing per-language
;; tokenisers (cur returns nil at EOF).
;;
;; Char classes
;; ------------
;; lex-digit?       — 0-9
;; lex-hex-digit?   — 0-9, a-f, A-F
;; lex-alpha?       — a-z, A-Z (alias: lex-letter?)
;; lex-alnum?       — alpha or digit
;; lex-ident-start? — alpha or underscore
;; lex-ident-char?  — ident-start or digit
;; lex-space?       — " ", "\t", "\r" (no newline)
;; lex-whitespace?  — " ", "\t", "\r", "\n" (includes newline)
;;
;; Token record
;; ------------
;; (lex-make-token TYPE VALUE POS)              — {:type :value :pos}
;; (lex-make-token-spanning TYPE VALUE POS END) — {:type :value :pos :end}
;; (lex-token-type TOK)
;; (lex-token-value TOK)
;; (lex-token-pos TOK)

;; lex-digit? — true when ch is an ASCII decimal digit "0".."9".
;; nil (end-of-input) is rejected up front, so the string comparisons
;; below never see nil.
(define lex-digit?
  (fn (ch)
    (and (not (= ch nil))
         (>= ch "0")
         (<= ch "9"))))
;; lex-hex-digit? — true when ch is a hexadecimal digit: 0-9, a-f, or A-F.
;; The leading nil guard keeps the range comparisons nil-safe; the three
;; alternatives are checked cheapest-first (decimal digits via lex-digit?).
(define lex-hex-digit?
  (fn (ch)
    (and (not (= ch nil))
         (or (lex-digit? ch)
             (and (>= ch "a") (<= ch "f"))
             (and (>= ch "A") (<= ch "F"))))))
;; lex-alpha? — true when ch is an ASCII letter, lowercase or uppercase.
;; nil yields false via the guard before the range checks.
(define lex-alpha?
  (fn (ch)
    (and (not (= ch nil))
         (or (and (>= ch "a") (<= ch "z"))
             (and (>= ch "A") (<= ch "Z"))))))
;; lex-letter? — alias for lex-alpha?, kept so per-language tokenisers can
;; prefix-rename to their historical predicate name.
(define lex-letter? lex-alpha?)
;; lex-alnum? — letter or decimal digit. Both sub-predicates are nil-safe,
;; so no explicit nil guard is needed here.
(define lex-alnum?
  (fn (ch)
    (or (lex-alpha? ch)
        (lex-digit? ch))))
;; lex-ident-start? — valid first character of an identifier:
;; a letter or underscore (nil-safe via lex-alpha?).
(define lex-ident-start?
  (fn (ch)
    (or (lex-alpha? ch)
        (= ch "_"))))
;; lex-ident-char? — valid continuation character of an identifier:
;; anything that may start one, plus decimal digits.
(define lex-ident-char?
  (fn (ch)
    (or (lex-ident-start? ch)
        (lex-digit? ch))))
;; lex-space? — horizontal whitespace only: space, tab, carriage return.
;; Newline is deliberately excluded (see lex-whitespace?); nil compares
;; unequal to every literal, so nil yields false.
(define lex-space?
  (fn (ch)
    (or (= ch " ")
        (= ch "\t")
        (= ch "\r"))))
;; lex-whitespace? — any whitespace including newline: extends lex-space?
;; with "\n". nil-safe for the same reasons as lex-space?.
(define lex-whitespace?
  (fn (ch)
    (or (lex-space? ch)
        (= ch "\n"))))
;; lex-make-token — build a token record {:type :value :pos} for a token
;; whose end position the caller does not track.
(define lex-make-token
  (fn (ty val p)
    {:pos p :value val :type ty}))
;; lex-make-token-spanning — like lex-make-token but also records the
;; end position, producing {:type :value :pos :end}.
(define lex-make-token-spanning
  (fn (ty val p e)
    {:pos p :end e :value val :type ty}))
;; lex-token-type — read the :type field of a token record.
(define lex-token-type
  (fn (t) (get t :type)))
;; lex-token-value — read the :value field of a token record.
(define lex-token-value
  (fn (t) (get t :value)))
;; lex-token-pos — read the :pos field of a token record.
(define lex-token-pos
  (fn (t) (get t :pos)))
@@ -28,6 +28,8 @@ trap "rm -f $TMPFILE" EXIT
|
|||||||
|
|
||||||
cat > "$TMPFILE" << 'EPOCHS'
|
cat > "$TMPFILE" << 'EPOCHS'
|
||||||
(epoch 1)
|
(epoch 1)
|
||||||
|
(load "lib/guest/lex.sx")
|
||||||
|
(load "lib/guest/prefix.sx")
|
||||||
(load "lib/lua/tokenizer.sx")
|
(load "lib/lua/tokenizer.sx")
|
||||||
(epoch 2)
|
(epoch 2)
|
||||||
(load "lib/lua/parser.sx")
|
(load "lib/lua/parser.sx")
|
||||||
|
|||||||
@@ -1,31 +1,12 @@
|
|||||||
(define lua-make-token (fn (type value pos) {:pos pos :value value :type type}))
|
(prefix-rename "lua-"
|
||||||
|
'((make-token lex-make-token)
|
||||||
|
(digit? lex-digit?)
|
||||||
|
(hex-digit? lex-hex-digit?)
|
||||||
|
(letter? lex-alpha?)
|
||||||
|
(ident-start? lex-ident-start?)
|
||||||
|
(ident-char? lex-ident-char?)
|
||||||
|
(ws? lex-whitespace?)))
|
||||||
|
|
||||||
(define lua-digit? (fn (c) (and (not (= c nil)) (>= c "0") (<= c "9"))))
|
|
||||||
|
|
||||||
(define
|
|
||||||
lua-hex-digit?
|
|
||||||
(fn
|
|
||||||
(c)
|
|
||||||
(and
|
|
||||||
(not (= c nil))
|
|
||||||
(or
|
|
||||||
(lua-digit? c)
|
|
||||||
(and (>= c "a") (<= c "f"))
|
|
||||||
(and (>= c "A") (<= c "F"))))))
|
|
||||||
|
|
||||||
(define
|
|
||||||
lua-letter?
|
|
||||||
(fn
|
|
||||||
(c)
|
|
||||||
(and
|
|
||||||
(not (= c nil))
|
|
||||||
(or (and (>= c "a") (<= c "z")) (and (>= c "A") (<= c "Z"))))))
|
|
||||||
|
|
||||||
(define lua-ident-start? (fn (c) (or (lua-letter? c) (= c "_"))))
|
|
||||||
|
|
||||||
(define lua-ident-char? (fn (c) (or (lua-ident-start? c) (lua-digit? c))))
|
|
||||||
|
|
||||||
(define lua-ws? (fn (c) (or (= c " ") (= c "\t") (= c "\n") (= c "\r"))))
|
|
||||||
|
|
||||||
(define
|
(define
|
||||||
lua-keywords
|
lua-keywords
|
||||||
|
|||||||
@@ -63,6 +63,8 @@ for tcl_file in "${TCL_FILES[@]}"; do
|
|||||||
# Build epoch input using quoted heredoc for static parts; helper path via variable
|
# Build epoch input using quoted heredoc for static parts; helper path via variable
|
||||||
cat > "$tmpfile" << EPOCHS
|
cat > "$tmpfile" << EPOCHS
|
||||||
(epoch 1)
|
(epoch 1)
|
||||||
|
(load "lib/guest/lex.sx")
|
||||||
|
(load "lib/guest/prefix.sx")
|
||||||
(load "lib/tcl/tokenizer.sx")
|
(load "lib/tcl/tokenizer.sx")
|
||||||
(epoch 2)
|
(epoch 2)
|
||||||
(load "lib/tcl/parser.sx")
|
(load "lib/tcl/parser.sx")
|
||||||
|
|||||||
@@ -33,6 +33,8 @@ HELPER_EOF
|
|||||||
|
|
||||||
cat > "$TMPFILE" << EPOCHS
|
cat > "$TMPFILE" << EPOCHS
|
||||||
(epoch 1)
|
(epoch 1)
|
||||||
|
(load "lib/guest/lex.sx")
|
||||||
|
(load "lib/guest/prefix.sx")
|
||||||
(load "lib/tcl/tokenizer.sx")
|
(load "lib/tcl/tokenizer.sx")
|
||||||
(epoch 2)
|
(epoch 2)
|
||||||
(load "lib/tcl/parser.sx")
|
(load "lib/tcl/parser.sx")
|
||||||
|
|||||||
@@ -1,19 +1,10 @@
|
|||||||
(define tcl-ws? (fn (c) (or (= c " ") (= c "\t") (= c "\r"))))
|
(prefix-rename "tcl-"
|
||||||
|
'((ws? lex-space?)
|
||||||
|
(alpha? lex-alpha?)
|
||||||
|
(digit? lex-digit?)
|
||||||
|
(ident-start? lex-ident-start?)
|
||||||
|
(ident-char? lex-ident-char?)))
|
||||||
|
|
||||||
(define tcl-alpha?
|
|
||||||
(fn (c)
|
|
||||||
(and
|
|
||||||
(not (= c nil))
|
|
||||||
(or (and (>= c "a") (<= c "z")) (and (>= c "A") (<= c "Z"))))))
|
|
||||||
|
|
||||||
(define tcl-digit?
|
|
||||||
(fn (c) (and (not (= c nil)) (>= c "0") (<= c "9"))))
|
|
||||||
|
|
||||||
(define tcl-ident-start?
|
|
||||||
(fn (c) (or (tcl-alpha? c) (= c "_"))))
|
|
||||||
|
|
||||||
(define tcl-ident-char?
|
|
||||||
(fn (c) (or (tcl-ident-start? c) (tcl-digit? c))))
|
|
||||||
|
|
||||||
(define tcl-tokenize
|
(define tcl-tokenize
|
||||||
(fn (src)
|
(fn (src)
|
||||||
|
|||||||
Reference in New Issue
Block a user