Unicode-aware byte scanner using starts-with?/consume! for multi-byte APL glyphs. Handles numbers (¯-negative), string literals, identifiers (⎕ system names), all APL function/operator glyphs, :Keywords, comments ⍝, diamond ⋄, assignment ←. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
169 lines
5.6 KiB
Plaintext
169 lines
5.6 KiB
Plaintext
(define apl-glyph-set
|
||
(list "+" "-" "×" "÷" "*" "⍟" "⌈" "⌊" "|" "!" "?" "○" "~" "<" "≤" "=" "≥" ">" "≠"
|
||
"∊" "∧" "∨" "⍱" "⍲" "," "⍪" "⍴" "⌽" "⊖" "⍉" "↑" "↓" "⊂" "⊃" "⊆"
|
||
"∪" "∩" "⍳" "⍸" "⌷" "⍋" "⍒" "⊥" "⊤" "⊣" "⊢" "⍎" "⍕"
|
||
"⍺" "⍵" "∇" "/" "\\" "¨" "⍨" "∘" "." "⍣" "⍤" "⍥" "@" "¯"))
|
||
|
||
(define apl-glyph?
|
||
(fn (ch)
|
||
(some (fn (g) (= g ch)) apl-glyph-set)))
|
||
|
||
(define apl-digit?
|
||
(fn (ch)
|
||
(and (string? ch) (>= ch "0") (<= ch "9"))))
|
||
|
||
(define apl-alpha?
|
||
(fn (ch)
|
||
(and (string? ch)
|
||
(or (and (>= ch "a") (<= ch "z"))
|
||
(and (>= ch "A") (<= ch "Z"))
|
||
(= ch "_")))))
|
||
|
||
(define apl-tokenize
|
||
(fn (source)
|
||
(let ((pos 0)
|
||
(src-len (len source))
|
||
(tokens (list)))
|
||
|
||
(define tok-push!
|
||
(fn (type value)
|
||
(append! tokens {:type type :value value})))
|
||
|
||
(define cur-sw?
|
||
(fn (ch)
|
||
(and (< pos src-len) (starts-with? (slice source pos) ch))))
|
||
|
||
(define cur-byte
|
||
(fn ()
|
||
(if (< pos src-len) (nth source pos) nil)))
|
||
|
||
(define advance!
|
||
(fn ()
|
||
(set! pos (+ pos 1))))
|
||
|
||
(define consume!
|
||
(fn (ch)
|
||
(set! pos (+ pos (len ch)))))
|
||
|
||
(define find-glyph
|
||
(fn ()
|
||
(let ((rem (slice source pos)))
|
||
(let ((matches (filter (fn (g) (starts-with? rem g)) apl-glyph-set)))
|
||
(if (> (len matches) 0) (first matches) nil)))))
|
||
|
||
(define read-digits!
|
||
(fn (acc)
|
||
(if (and (< pos src-len) (apl-digit? (cur-byte)))
|
||
(let ((ch (cur-byte)))
|
||
(begin
|
||
(advance!)
|
||
(read-digits! (str acc ch))))
|
||
acc)))
|
||
|
||
(define read-ident-cont!
|
||
(fn ()
|
||
(when (and (< pos src-len)
|
||
(let ((ch (cur-byte)))
|
||
(or (apl-alpha? ch) (apl-digit? ch))))
|
||
(begin
|
||
(advance!)
|
||
(read-ident-cont!)))))
|
||
|
||
(define read-string!
|
||
(fn (acc)
|
||
(cond
|
||
((>= pos src-len) acc)
|
||
((cur-sw? "'")
|
||
(if (and (< (+ pos 1) src-len) (cur-sw? "'"))
|
||
(begin
|
||
(advance!)
|
||
(advance!)
|
||
(read-string! (str acc "'")))
|
||
(begin (advance!) acc)))
|
||
(true
|
||
(let ((ch (cur-byte)))
|
||
(begin
|
||
(advance!)
|
||
(read-string! (str acc ch))))))))
|
||
|
||
(define skip-line!
|
||
(fn ()
|
||
(when (and (< pos src-len) (not (cur-sw? "\n")))
|
||
(begin
|
||
(advance!)
|
||
(skip-line!)))))
|
||
|
||
(define scan!
|
||
(fn ()
|
||
(when (< pos src-len)
|
||
(let ((ch (cur-byte)))
|
||
(cond
|
||
((or (= ch " ") (= ch "\t") (= ch "\r"))
|
||
(begin (advance!) (scan!)))
|
||
((= ch "\n")
|
||
(begin (advance!) (tok-push! :newline nil) (scan!)))
|
||
((cur-sw? "⍝")
|
||
(begin (skip-line!) (scan!)))
|
||
((cur-sw? "⋄")
|
||
(begin (consume! "⋄") (tok-push! :diamond nil) (scan!)))
|
||
((= ch "(")
|
||
(begin (advance!) (tok-push! :lparen nil) (scan!)))
|
||
((= ch ")")
|
||
(begin (advance!) (tok-push! :rparen nil) (scan!)))
|
||
((= ch "[")
|
||
(begin (advance!) (tok-push! :lbracket nil) (scan!)))
|
||
((= ch "]")
|
||
(begin (advance!) (tok-push! :rbracket nil) (scan!)))
|
||
((= ch "{")
|
||
(begin (advance!) (tok-push! :lbrace nil) (scan!)))
|
||
((= ch "}")
|
||
(begin (advance!) (tok-push! :rbrace nil) (scan!)))
|
||
((= ch ";")
|
||
(begin (advance!) (tok-push! :semi nil) (scan!)))
|
||
((cur-sw? "←")
|
||
(begin (consume! "←") (tok-push! :assign nil) (scan!)))
|
||
((= ch ":")
|
||
(let ((start pos))
|
||
(begin
|
||
(advance!)
|
||
(if (and (< pos src-len) (apl-alpha? (cur-byte)))
|
||
(begin
|
||
(read-ident-cont!)
|
||
(tok-push! :keyword (slice source start pos)))
|
||
(tok-push! :colon nil))
|
||
(scan!))))
|
||
((and (cur-sw? "¯")
|
||
(< (+ pos (len "¯")) src-len)
|
||
(apl-digit? (nth source (+ pos (len "¯")))))
|
||
(begin
|
||
(consume! "¯")
|
||
(let ((digits (read-digits! "")))
|
||
(tok-push! :num (- 0 (parse-int digits 0))))
|
||
(scan!)))
|
||
((apl-digit? ch)
|
||
(begin
|
||
(let ((digits (read-digits! "")))
|
||
(tok-push! :num (parse-int digits 0)))
|
||
(scan!)))
|
||
((= ch "'")
|
||
(begin
|
||
(advance!)
|
||
(let ((s (read-string! "")))
|
||
(tok-push! :str s))
|
||
(scan!)))
|
||
((or (apl-alpha? ch) (cur-sw? "⎕"))
|
||
(let ((start pos))
|
||
(begin
|
||
(if (cur-sw? "⎕") (consume! "⎕") (advance!))
|
||
(read-ident-cont!)
|
||
(tok-push! :name (slice source start pos))
|
||
(scan!))))
|
||
(true
|
||
(let ((g (find-glyph)))
|
||
(if g
|
||
(begin (consume! g) (tok-push! :glyph g) (scan!))
|
||
(begin (advance!) (scan!))))))))))
|
||
|
||
(scan!)
|
||
tokens)))
|