Files
rose-ash/lib/apl/tokenizer.sx
giles c73b696494 apl: tokenizer + 46 tests (Phase 1, step 1)
Unicode-aware byte scanner using starts-with?/consume! for multi-byte
APL glyphs. Handles numbers (¯-negative), string literals, identifiers
(⎕ system names), all APL function/operator glyphs, :Keywords,
comments ⍝, diamond ⋄, assignment ←.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-25 18:22:30 +00:00

169 lines
5.6 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
(define apl-glyph-set
(list "+" "-" "×" "÷" "*" "⍟" "⌈" "⌊" "|" "!" "?" "○" "~" "<" "≤" "=" "≥" ">" "≠"
"∊" "∧" "" "⍱" "⍲" "," "⍪" "" "⌽" "⊖" "⍉" "↑" "↓" "⊂" "⊃" "⊆"
"" "∩" "" "⍸" "⌷" "⍋" "⍒" "⊥" "" "⊣" "⊢" "⍎" "⍕"
"" "⍵" "∇" "/" "\\" "¨" "⍨" "∘" "." "⍣" "⍤" "⍥" "@" "¯"))
(define apl-glyph?
(fn (ch)
(some (fn (g) (= g ch)) apl-glyph-set)))
(define apl-digit?
(fn (ch)
(and (string? ch) (>= ch "0") (<= ch "9"))))
(define apl-alpha?
(fn (ch)
(and (string? ch)
(or (and (>= ch "a") (<= ch "z"))
(and (>= ch "A") (<= ch "Z"))
(= ch "_")))))
(define apl-tokenize
(fn (source)
(let ((pos 0)
(src-len (len source))
(tokens (list)))
(define tok-push!
(fn (type value)
(append! tokens {:type type :value value})))
(define cur-sw?
(fn (ch)
(and (< pos src-len) (starts-with? (slice source pos) ch))))
(define cur-byte
(fn ()
(if (< pos src-len) (nth source pos) nil)))
(define advance!
(fn ()
(set! pos (+ pos 1))))
(define consume!
(fn (ch)
(set! pos (+ pos (len ch)))))
(define find-glyph
(fn ()
(let ((rem (slice source pos)))
(let ((matches (filter (fn (g) (starts-with? rem g)) apl-glyph-set)))
(if (> (len matches) 0) (first matches) nil)))))
(define read-digits!
(fn (acc)
(if (and (< pos src-len) (apl-digit? (cur-byte)))
(let ((ch (cur-byte)))
(begin
(advance!)
(read-digits! (str acc ch))))
acc)))
(define read-ident-cont!
(fn ()
(when (and (< pos src-len)
(let ((ch (cur-byte)))
(or (apl-alpha? ch) (apl-digit? ch))))
(begin
(advance!)
(read-ident-cont!)))))
(define read-string!
(fn (acc)
(cond
((>= pos src-len) acc)
((cur-sw? "'")
(if (and (< (+ pos 1) src-len) (cur-sw? "'"))
(begin
(advance!)
(advance!)
(read-string! (str acc "'")))
(begin (advance!) acc)))
(true
(let ((ch (cur-byte)))
(begin
(advance!)
(read-string! (str acc ch))))))))
(define skip-line!
(fn ()
(when (and (< pos src-len) (not (cur-sw? "\n")))
(begin
(advance!)
(skip-line!)))))
(define scan!
(fn ()
(when (< pos src-len)
(let ((ch (cur-byte)))
(cond
((or (= ch " ") (= ch "\t") (= ch "\r"))
(begin (advance!) (scan!)))
((= ch "\n")
(begin (advance!) (tok-push! :newline nil) (scan!)))
((cur-sw? "⍝")
(begin (skip-line!) (scan!)))
((cur-sw? "⋄")
(begin (consume! "⋄") (tok-push! :diamond nil) (scan!)))
((= ch "(")
(begin (advance!) (tok-push! :lparen nil) (scan!)))
((= ch ")")
(begin (advance!) (tok-push! :rparen nil) (scan!)))
((= ch "[")
(begin (advance!) (tok-push! :lbracket nil) (scan!)))
((= ch "]")
(begin (advance!) (tok-push! :rbracket nil) (scan!)))
((= ch "{")
(begin (advance!) (tok-push! :lbrace nil) (scan!)))
((= ch "}")
(begin (advance!) (tok-push! :rbrace nil) (scan!)))
((= ch ";")
(begin (advance!) (tok-push! :semi nil) (scan!)))
((cur-sw? "←")
(begin (consume! "←") (tok-push! :assign nil) (scan!)))
((= ch ":")
(let ((start pos))
(begin
(advance!)
(if (and (< pos src-len) (apl-alpha? (cur-byte)))
(begin
(read-ident-cont!)
(tok-push! :keyword (slice source start pos)))
(tok-push! :colon nil))
(scan!))))
((and (cur-sw? "¯")
(< (+ pos (len "¯")) src-len)
(apl-digit? (nth source (+ pos (len "¯")))))
(begin
(consume! "¯")
(let ((digits (read-digits! "")))
(tok-push! :num (- 0 (parse-int digits 0))))
(scan!)))
((apl-digit? ch)
(begin
(let ((digits (read-digits! "")))
(tok-push! :num (parse-int digits 0)))
(scan!)))
((= ch "'")
(begin
(advance!)
(let ((s (read-string! "")))
(tok-push! :str s))
(scan!)))
((or (apl-alpha? ch) (cur-sw? "⎕"))
(let ((start pos))
(begin
(if (cur-sw? "⎕") (consume! "⎕") (advance!))
(read-ident-cont!)
(tok-push! :name (slice source start pos))
(scan!))))
(true
(let ((g (find-glyph)))
(if g
(begin (consume! g) (tok-push! :glyph g) (scan!))
(begin (advance!) (scan!))))))))))
(scan!)
tokens)))