apl: tokenizer + 46 tests (Phase 1, step 1)
Unicode-aware byte scanner using starts-with?/consume! for multi-byte APL glyphs. Handles numbers (¯-negative), string literals, identifiers (⎕ system names), all APL function/operator glyphs, :Keywords, comments ⍝, diamond ⋄, assignment ←. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
168
lib/apl/tokenizer.sx
Normal file
168
lib/apl/tokenizer.sx
Normal file
@@ -0,0 +1,168 @@
|
||||
(define apl-glyph-set
|
||||
(list "+" "-" "×" "÷" "*" "⍟" "⌈" "⌊" "|" "!" "?" "○" "~" "<" "≤" "=" "≥" ">" "≠"
|
||||
"∊" "∧" "∨" "⍱" "⍲" "," "⍪" "⍴" "⌽" "⊖" "⍉" "↑" "↓" "⊂" "⊃" "⊆"
|
||||
"∪" "∩" "⍳" "⍸" "⌷" "⍋" "⍒" "⊥" "⊤" "⊣" "⊢" "⍎" "⍕"
|
||||
"⍺" "⍵" "∇" "/" "\\" "¨" "⍨" "∘" "." "⍣" "⍤" "⍥" "@" "¯"))
|
||||
|
||||
(define apl-glyph?
|
||||
(fn (ch)
|
||||
(some (fn (g) (= g ch)) apl-glyph-set)))
|
||||
|
||||
(define apl-digit?
|
||||
(fn (ch)
|
||||
(and (string? ch) (>= ch "0") (<= ch "9"))))
|
||||
|
||||
(define apl-alpha?
|
||||
(fn (ch)
|
||||
(and (string? ch)
|
||||
(or (and (>= ch "a") (<= ch "z"))
|
||||
(and (>= ch "A") (<= ch "Z"))
|
||||
(= ch "_")))))
|
||||
|
||||
(define apl-tokenize
|
||||
(fn (source)
|
||||
(let ((pos 0)
|
||||
(src-len (len source))
|
||||
(tokens (list)))
|
||||
|
||||
(define tok-push!
|
||||
(fn (type value)
|
||||
(append! tokens {:type type :value value})))
|
||||
|
||||
(define cur-sw?
|
||||
(fn (ch)
|
||||
(and (< pos src-len) (starts-with? (slice source pos) ch))))
|
||||
|
||||
(define cur-byte
|
||||
(fn ()
|
||||
(if (< pos src-len) (nth source pos) nil)))
|
||||
|
||||
(define advance!
|
||||
(fn ()
|
||||
(set! pos (+ pos 1))))
|
||||
|
||||
(define consume!
|
||||
(fn (ch)
|
||||
(set! pos (+ pos (len ch)))))
|
||||
|
||||
(define find-glyph
|
||||
(fn ()
|
||||
(let ((rem (slice source pos)))
|
||||
(let ((matches (filter (fn (g) (starts-with? rem g)) apl-glyph-set)))
|
||||
(if (> (len matches) 0) (first matches) nil)))))
|
||||
|
||||
(define read-digits!
|
||||
(fn (acc)
|
||||
(if (and (< pos src-len) (apl-digit? (cur-byte)))
|
||||
(let ((ch (cur-byte)))
|
||||
(begin
|
||||
(advance!)
|
||||
(read-digits! (str acc ch))))
|
||||
acc)))
|
||||
|
||||
(define read-ident-cont!
|
||||
(fn ()
|
||||
(when (and (< pos src-len)
|
||||
(let ((ch (cur-byte)))
|
||||
(or (apl-alpha? ch) (apl-digit? ch))))
|
||||
(begin
|
||||
(advance!)
|
||||
(read-ident-cont!)))))
|
||||
|
||||
(define read-string!
|
||||
(fn (acc)
|
||||
(cond
|
||||
((>= pos src-len) acc)
|
||||
((cur-sw? "'")
|
||||
(if (and (< (+ pos 1) src-len) (cur-sw? "'"))
|
||||
(begin
|
||||
(advance!)
|
||||
(advance!)
|
||||
(read-string! (str acc "'")))
|
||||
(begin (advance!) acc)))
|
||||
(true
|
||||
(let ((ch (cur-byte)))
|
||||
(begin
|
||||
(advance!)
|
||||
(read-string! (str acc ch))))))))
|
||||
|
||||
(define skip-line!
|
||||
(fn ()
|
||||
(when (and (< pos src-len) (not (cur-sw? "\n")))
|
||||
(begin
|
||||
(advance!)
|
||||
(skip-line!)))))
|
||||
|
||||
(define scan!
|
||||
(fn ()
|
||||
(when (< pos src-len)
|
||||
(let ((ch (cur-byte)))
|
||||
(cond
|
||||
((or (= ch " ") (= ch "\t") (= ch "\r"))
|
||||
(begin (advance!) (scan!)))
|
||||
((= ch "\n")
|
||||
(begin (advance!) (tok-push! :newline nil) (scan!)))
|
||||
((cur-sw? "⍝")
|
||||
(begin (skip-line!) (scan!)))
|
||||
((cur-sw? "⋄")
|
||||
(begin (consume! "⋄") (tok-push! :diamond nil) (scan!)))
|
||||
((= ch "(")
|
||||
(begin (advance!) (tok-push! :lparen nil) (scan!)))
|
||||
((= ch ")")
|
||||
(begin (advance!) (tok-push! :rparen nil) (scan!)))
|
||||
((= ch "[")
|
||||
(begin (advance!) (tok-push! :lbracket nil) (scan!)))
|
||||
((= ch "]")
|
||||
(begin (advance!) (tok-push! :rbracket nil) (scan!)))
|
||||
((= ch "{")
|
||||
(begin (advance!) (tok-push! :lbrace nil) (scan!)))
|
||||
((= ch "}")
|
||||
(begin (advance!) (tok-push! :rbrace nil) (scan!)))
|
||||
((= ch ";")
|
||||
(begin (advance!) (tok-push! :semi nil) (scan!)))
|
||||
((cur-sw? "←")
|
||||
(begin (consume! "←") (tok-push! :assign nil) (scan!)))
|
||||
((= ch ":")
|
||||
(let ((start pos))
|
||||
(begin
|
||||
(advance!)
|
||||
(if (and (< pos src-len) (apl-alpha? (cur-byte)))
|
||||
(begin
|
||||
(read-ident-cont!)
|
||||
(tok-push! :keyword (slice source start pos)))
|
||||
(tok-push! :colon nil))
|
||||
(scan!))))
|
||||
((and (cur-sw? "¯")
|
||||
(< (+ pos (len "¯")) src-len)
|
||||
(apl-digit? (nth source (+ pos (len "¯")))))
|
||||
(begin
|
||||
(consume! "¯")
|
||||
(let ((digits (read-digits! "")))
|
||||
(tok-push! :num (- 0 (parse-int digits 0))))
|
||||
(scan!)))
|
||||
((apl-digit? ch)
|
||||
(begin
|
||||
(let ((digits (read-digits! "")))
|
||||
(tok-push! :num (parse-int digits 0)))
|
||||
(scan!)))
|
||||
((= ch "'")
|
||||
(begin
|
||||
(advance!)
|
||||
(let ((s (read-string! "")))
|
||||
(tok-push! :str s))
|
||||
(scan!)))
|
||||
((or (apl-alpha? ch) (cur-sw? "⎕"))
|
||||
(let ((start pos))
|
||||
(begin
|
||||
(if (cur-sw? "⎕") (consume! "⎕") (advance!))
|
||||
(read-ident-cont!)
|
||||
(tok-push! :name (slice source start pos))
|
||||
(scan!))))
|
||||
(true
|
||||
(let ((g (find-glyph)))
|
||||
(if g
|
||||
(begin (consume! g) (tok-push! :glyph g) (scan!))
|
||||
(begin (advance!) (scan!))))))))))
|
||||
|
||||
(scan!)
|
||||
tokens)))
|
||||
Reference in New Issue
Block a user