From c73b696494ef6ae8941568fb16c1995fc5acdee4 Mon Sep 17 00:00:00 2001 From: giles Date: Sat, 25 Apr 2026 18:22:30 +0000 Subject: [PATCH] apl: tokenizer + 46 tests (Phase 1, step 1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unicode-aware byte scanner using starts-with?/consume! for multi-byte APL glyphs. Handles numbers (¯-negative), string literals, identifiers (⎕ system names), all APL function/operator glyphs, :Keywords, comments ⍝, diamond ⋄, assignment ←. Co-Authored-By: Claude Sonnet 4.6 --- lib/apl/tests/parse.sx | 83 ++++++++++++++++++++ lib/apl/tokenizer.sx | 168 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 251 insertions(+) create mode 100644 lib/apl/tests/parse.sx create mode 100644 lib/apl/tokenizer.sx diff --git a/lib/apl/tests/parse.sx b/lib/apl/tests/parse.sx new file mode 100644 index 00000000..120de0e5 --- /dev/null +++ b/lib/apl/tests/parse.sx @@ -0,0 +1,83 @@ +(define apl-test-count 0) +(define apl-test-pass 0) +(define apl-test-fails (list)) + +(define apl-test + (fn (name actual expected) + (begin + (set! apl-test-count (+ apl-test-count 1)) + (if (= actual expected) + (set! apl-test-pass (+ apl-test-pass 1)) + (append! apl-test-fails {:name name :actual actual :expected expected}))))) + +(define tok-types + (fn (src) + (map (fn (t) (get t :type)) (apl-tokenize src)))) + +(define tok-values + (fn (src) + (map (fn (t) (get t :value)) (apl-tokenize src)))) + +(define tok-count + (fn (src) + (len (apl-tokenize src)))) + +(define tok-type-at + (fn (src i) + (get (nth (apl-tokenize src) i) :type))) + +(define tok-value-at + (fn (src i) + (get (nth (apl-tokenize src) i) :value))) + +(apl-test "empty: no tokens" (tok-count "") 0) +(apl-test "empty: whitespace only" (tok-count " ") 0) +(apl-test "num: zero" (tok-values "0") (list 0)) +(apl-test "num: positive" (tok-values "42") (list 42)) +(apl-test "num: large" (tok-values "12345") (list 12345)) +(apl-test "num: negative" (tok-values "¯5") (list -5)) +(apl-test "num: negative zero" (tok-values "¯0") (list 0)) +(apl-test "num: strand count" (tok-count "1 2 3") 3) +(apl-test "num: strand types" (tok-types "1 2 3") (list :num :num :num)) +(apl-test "num: strand values" (tok-values "1 2 3") (list 1 2 3)) +(apl-test "num: neg in strand" (tok-values "1 ¯2 3") (list 1 -2 3)) +(apl-test "str: empty" (tok-values "''") (list "")) +(apl-test "str: single char" (tok-values "'a'") (list "a")) +(apl-test "str: word" (tok-values "'hello'") (list "hello")) +(apl-test "str: escaped quote" (tok-values "''''") (list "'")) +(apl-test "str: type" (tok-types "'abc'") (list :str)) +(apl-test "name: simple" (tok-values "foo") (list "foo")) +(apl-test "name: type" (tok-types "foo") (list :name)) +(apl-test "name: mixed case" (tok-values "MyVar") (list "MyVar")) +(apl-test "name: with digits" (tok-values "x1") (list "x1")) +(apl-test "name: system var" (tok-values "⎕IO") (list "⎕IO")) +(apl-test "name: system var type" (tok-types "⎕IO") (list :name)) +(apl-test "glyph: plus" (tok-types "+") (list :glyph)) +(apl-test "glyph: plus value" (tok-values "+") (list "+")) +(apl-test "glyph: iota" (tok-values "⍳") (list "⍳")) +(apl-test "glyph: reduce" (tok-values "+/") (list "+" "/")) +(apl-test "glyph: floor" (tok-values "⌊") (list "⌊")) +(apl-test "glyph: rho" (tok-values "⍴") (list "⍴")) +(apl-test "glyph: alpha omega" (tok-types "⍺ ⍵") (list :glyph :glyph)) +(apl-test "punct: lparen" (tok-types "(") (list :lparen)) +(apl-test "punct: rparen" (tok-types ")") (list :rparen)) +(apl-test "punct: brackets" (tok-types "[42]") (list :lbracket :num :rbracket)) +(apl-test "punct: braces" (tok-types "{}") (list :lbrace :rbrace)) +(apl-test "punct: semi" (tok-types ";") (list :semi)) +(apl-test "assign: arrow" (tok-types "x←1") (list :name :assign :num)) +(apl-test "diamond: separator" (tok-types "1⋄2") (list :num :diamond :num)) +(apl-test "newline: emitted" (tok-types "1\n2") (list :num :newline :num)) +(apl-test "comment: skipped" (tok-count "⍝ ignore me") 0) +(apl-test "comment: rest ignored" (tok-count "1 ⍝ note") 1) +(apl-test "colon: bare" (tok-types ":") (list :colon)) +(apl-test "keyword: If" (tok-values ":If") (list ":If")) +(apl-test "keyword: type" (tok-types ":While") (list :keyword)) +(apl-test "keyword: EndFor" (tok-values ":EndFor") (list ":EndFor")) +(apl-test "expr: +/ ⍳ 5" (tok-types "+/ ⍳ 5") (list :glyph :glyph :glyph :num)) +(apl-test "expr: x←42" (tok-count "x←42") 3) +(apl-test "expr: dfn body" (tok-types "{⍺+⍵}") + (list :lbrace :glyph :glyph :glyph :rbrace)) + +(define apl-tokenize-test-summary + (str "tokenizer " apl-test-pass "/" apl-test-count + (if (= (len apl-test-fails) 0) "" (str " FAILS: " apl-test-fails)))) diff --git a/lib/apl/tokenizer.sx b/lib/apl/tokenizer.sx new file mode 100644 index 00000000..f3ff4a0e --- /dev/null +++ b/lib/apl/tokenizer.sx @@ -0,0 +1,168 @@ +(define apl-glyph-set + (list "+" "-" "×" "÷" "*" "⍟" "⌈" "⌊" "|" "!" "?" "○" "~" "<" "≤" "=" "≥" ">" "≠" + "∊" "∧" "∨" "⍱" "⍲" "," "⍪" "⍴" "⌽" "⊖" "⍉" "↑" "↓" "⊂" "⊃" "⊆" + "∪" "∩" "⍳" "⍸" "⌷" "⍋" "⍒" "⊥" "⊤" "⊣" "⊢" "⍎" "⍕" + "⍺" "⍵" "∇" "/" "\\" "¨" "⍨" "∘" "." "⍣" "⍤" "⍥" "@" "¯")) + +(define apl-glyph? + (fn (ch) + (some (fn (g) (= g ch)) apl-glyph-set))) + +(define apl-digit? + (fn (ch) + (and (string? ch) (>= ch "0") (<= ch "9")))) + +(define apl-alpha? + (fn (ch) + (and (string? ch) + (or (and (>= ch "a") (<= ch "z")) + (and (>= ch "A") (<= ch "Z")) + (= ch "_"))))) + +(define apl-tokenize + (fn (source) + (let ((pos 0) + (src-len (len source)) + (tokens (list))) + + (define tok-push! + (fn (type value) + (append! tokens {:type type :value value}))) + + (define cur-sw? + (fn (ch) + (and (< pos src-len) (starts-with? (slice source pos) ch)))) + + (define cur-byte + (fn () + (if (< pos src-len) (nth source pos) nil))) + + (define advance! + (fn () + (set! pos (+ pos 1)))) + + (define consume! + (fn (ch) + (set! pos (+ pos (len ch))))) + + (define find-glyph + (fn () + (let ((rem (slice source pos))) + (let ((matches (filter (fn (g) (starts-with? rem g)) apl-glyph-set))) + (if (> (len matches) 0) (first matches) nil))))) + + (define read-digits! + (fn (acc) + (if (and (< pos src-len) (apl-digit? (cur-byte))) + (let ((ch (cur-byte))) + (begin + (advance!) + (read-digits! (str acc ch)))) + acc))) + + (define read-ident-cont! + (fn () + (when (and (< pos src-len) + (let ((ch (cur-byte))) + (or (apl-alpha? ch) (apl-digit? ch)))) + (begin + (advance!) + (read-ident-cont!))))) + + (define read-string! + (fn (acc) + (cond + ((>= pos src-len) acc) + ((cur-sw? "'") + (if (and (< (+ pos 1) src-len) (cur-sw? "'")) + (begin + (advance!) + (advance!) + (read-string! (str acc "'"))) + (begin (advance!) acc))) + (true + (let ((ch (cur-byte))) + (begin + (advance!) + (read-string! (str acc ch)))))))) + + (define skip-line! + (fn () + (when (and (< pos src-len) (not (cur-sw? "\n"))) + (begin + (advance!) + (skip-line!))))) + + (define scan! + (fn () + (when (< pos src-len) + (let ((ch (cur-byte))) + (cond + ((or (= ch " ") (= ch "\t") (= ch "\r")) + (begin (advance!) (scan!))) + ((= ch "\n") + (begin (advance!) (tok-push! :newline nil) (scan!))) + ((cur-sw? "⍝") + (begin (skip-line!) (scan!))) + ((cur-sw? "⋄") + (begin (consume! "⋄") (tok-push! :diamond nil) (scan!))) + ((= ch "(") + (begin (advance!) (tok-push! :lparen nil) (scan!))) + ((= ch ")") + (begin (advance!) (tok-push! :rparen nil) (scan!))) + ((= ch "[") + (begin (advance!) (tok-push! :lbracket nil) (scan!))) + ((= ch "]") + (begin (advance!) (tok-push! :rbracket nil) (scan!))) + ((= ch "{") + (begin (advance!) (tok-push! :lbrace nil) (scan!))) + ((= ch "}") + (begin (advance!) (tok-push! :rbrace nil) (scan!))) + ((= ch ";") + (begin (advance!) (tok-push! :semi nil) (scan!))) + ((cur-sw? "←") + (begin (consume! "←") (tok-push! :assign nil) (scan!))) + ((= ch ":") + (let ((start pos)) + (begin + (advance!) + (if (and (< pos src-len) (apl-alpha? (cur-byte))) + (begin + (read-ident-cont!) + (tok-push! :keyword (slice source start pos))) + (tok-push! :colon nil)) + (scan!)))) + ((and (cur-sw? "¯") + (< (+ pos (len "¯")) src-len) + (apl-digit? (nth source (+ pos (len "¯"))))) + (begin + (consume! "¯") + (let ((digits (read-digits! ""))) + (tok-push! :num (- 0 (parse-int digits 0)))) + (scan!))) + ((apl-digit? ch) + (begin + (let ((digits (read-digits! ""))) + (tok-push! :num (parse-int digits 0))) + (scan!))) + ((= ch "'") + (begin + (advance!) + (let ((s (read-string! ""))) + (tok-push! :str s)) + (scan!))) + ((or (apl-alpha? ch) (cur-sw? "⎕")) + (let ((start pos)) + (begin + (if (cur-sw? "⎕") (consume! "⎕") (advance!)) + (read-ident-cont!) + (tok-push! :name (slice source start pos)) + (scan!)))) + (true + (let ((g (find-glyph))) + (if g + (begin (consume! g) (tok-push! :glyph g) (scan!)) + (begin (advance!) (scan!)))))))))) + + (scan!) + tokens)))