;; lib/prolog/tokenizer.sx — Prolog source → token stream ;; ;; Tokens: {:type T :value V :pos P} ;; Types: ;; "atom" — lowercase-start, quoted, or symbolic atom (=, \=, +, etc.) ;; "var" — uppercase-start or _-start variable (value is the name) ;; "number" — numeric literal (decoded to number) ;; "string" — "..." string literal ;; "punct" — ( ) , . [ ] | ;; "op" — :- ! (phase 1 only has these two "operators") ;; "eof" ;; ;; NOTE: phase 1 parser does NOT handle operator precedence (no X is Y+1). ;; All compound terms are written as f(arg1, arg2, ...) — including ;; =(X, Y), is(X, +(1,2)), and so on, using symbolic atoms as functors. (define pl-make-token (fn (type value pos) {:pos pos :value value :type type})) ;; ── Character predicates ────────────────────────────────────────── (define pl-digit? (fn (c) (and (>= c "0") (<= c "9")))) (define pl-lower? (fn (c) (and (>= c "a") (<= c "z")))) (define pl-upper? (fn (c) (and (>= c "A") (<= c "Z")))) (define pl-ident-char? (fn (c) (or (pl-lower? c) (pl-upper? c) (pl-digit? c) (= c "_")))) (define pl-ws? (fn (c) (or (= c " ") (= c "\t") (= c "\n") (= c "\r")))) ;; Characters that form "symbolic atoms" (operator-shaped atoms like ;; =, \=, +, -, *, /, <, >, etc.). A run of these becomes a single atom ;; token. In phase 1 this lets users write =(X, Y) or is(X, +(1,2)) as ;; regular compound terms without needing an operator parser. (define pl-sym? (fn (c) (or (= c "=") (= c "\\") (= c "+") (= c "-") (= c "*") (= c "/") (= c "<") (= c ">") (= c "@") (= c "#") (= c "$") (= c "&") (= c "?") (= c "^") (= c "~") (= c ";")))) ;; ── Main tokenizer ──────────────────────────────────────────────── (define pl-tokenize (fn (src) (let ((tokens (list)) (pos 0) (src-len (len src))) (define pl-peek (fn (offset) (if (< (+ pos offset) src-len) (nth src (+ pos offset)) nil))) (define cur (fn () (pl-peek 0))) (define advance! (fn (n) (set! pos (+ pos n)))) (define at? (fn (s) (let ((sl (len s))) (and (<= (+ pos sl) src-len) (= (slice src pos (+ pos sl)) s))))) (define pl-emit! (fn (type value start) (append! tokens (pl-make-token type value start)))) (define skip-line-comment! (fn () (when (and (< pos src-len) (not (= (cur) "\n"))) (do (advance! 1) (skip-line-comment!))))) (define skip-block-comment! (fn () (cond ((>= pos src-len) nil) ((and (= (cur) "*") (< (+ pos 1) src-len) (= (pl-peek 1) "/")) (advance! 2)) (else (do (advance! 1) (skip-block-comment!)))))) (define skip-ws! (fn () (cond ((>= pos src-len) nil) ((pl-ws? (cur)) (do (advance! 1) (skip-ws!))) ((= (cur) "%") (do (advance! 1) (skip-line-comment!) (skip-ws!))) ((and (= (cur) "/") (< (+ pos 1) src-len) (= (pl-peek 1) "*")) (do (advance! 2) (skip-block-comment!) (skip-ws!))) (else nil)))) (define read-ident (fn (start) (do (when (and (< pos src-len) (pl-ident-char? (cur))) (do (advance! 1) (read-ident start))) (slice src start pos)))) (define read-sym (fn (start) (do (when (and (< pos src-len) (pl-sym? (cur))) (do (advance! 1) (read-sym start))) (slice src start pos)))) (define read-decimal-digits! (fn () (when (and (< pos src-len) (pl-digit? (cur))) (do (advance! 1) (read-decimal-digits!))))) (define read-number (fn (start) (do (read-decimal-digits!) (when (and (< pos src-len) (= (cur) ".") (< (+ pos 1) src-len) (pl-digit? (pl-peek 1))) (do (advance! 1) (read-decimal-digits!))) (parse-number (slice src start pos))))) (define read-quoted (fn (quote-char) (let ((chars (list))) (advance! 1) (define loop (fn () (cond ((>= pos src-len) nil) ((= (cur) "\\") (do (advance! 1) (when (< pos src-len) (let ((ch (cur))) (do (cond ((= ch "n") (append! chars "\n")) ((= ch "t") (append! chars "\t")) ((= ch "r") (append! chars "\r")) ((= ch "\\") (append! chars "\\")) ((= ch "'") (append! chars "'")) ((= ch "\"") (append! chars "\"")) (else (append! chars ch))) (advance! 1)))) (loop))) ((= (cur) quote-char) (advance! 1)) (else (do (append! chars (cur)) (advance! 1) (loop)))))) (loop) (join "" chars)))) (define scan! (fn () (do (skip-ws!) (when (< pos src-len) (let ((ch (cur)) (start pos)) (cond ((at? ":-") (do (pl-emit! "op" ":-" start) (advance! 2) (scan!))) ((pl-digit? ch) (do (pl-emit! "number" (read-number start) start) (scan!))) ((= ch "'") (do (pl-emit! "atom" (read-quoted "'") start) (scan!))) ((= ch "\"") (do (pl-emit! "string" (read-quoted "\"") start) (scan!))) ((pl-lower? ch) (do (pl-emit! "atom" (read-ident start) start) (scan!))) ((or (pl-upper? ch) (= ch "_")) (do (pl-emit! "var" (read-ident start) start) (scan!))) ((= ch "(") (do (pl-emit! "punct" "(" start) (advance! 1) (scan!))) ((= ch ")") (do (pl-emit! "punct" ")" start) (advance! 1) (scan!))) ((= ch ",") (do (pl-emit! "punct" "," start) (advance! 1) (scan!))) ((= ch ".") (do (pl-emit! "punct" "." start) (advance! 1) (scan!))) ((= ch "[") (do (pl-emit! "punct" "[" start) (advance! 1) (scan!))) ((= ch "]") (do (pl-emit! "punct" "]" start) (advance! 1) (scan!))) ((= ch "|") (do (pl-emit! "punct" "|" start) (advance! 1) (scan!))) ((= ch "!") (do (pl-emit! "op" "!" start) (advance! 1) (scan!))) ((pl-sym? ch) (do (pl-emit! "atom" (read-sym start) start) (scan!))) (else (do (advance! 1) (scan!))))))))) (scan!) (pl-emit! "eof" nil pos) tokens)))