;; lib/datalog/tokenizer.sx — Datalog source → token stream ;; ;; Tokens: {:type T :value V :pos P} ;; Types: ;; "atom" — lowercase-start ident or quoted 'atom' ;; "var" — uppercase-start or _-start ident (value is the name) ;; "number" — numeric literal (decoded to number) ;; "string" — "..." string literal ;; "punct" — ( ) , . ;; "op" — :- ?- <= >= != < > = + - * / ;; "eof" ;; ;; Datalog has no function symbols in arg position; the parser still ;; accepts nested compounds for arithmetic ((is X (+ A B))) but safety ;; analysis rejects non-arithmetic nesting at rule-load time. (define dl-make-token (fn (type value pos) {:type type :value value :pos pos})) (define dl-digit? (fn (c) (and (>= c "0") (<= c "9")))) (define dl-lower? (fn (c) (and (>= c "a") (<= c "z")))) (define dl-upper? (fn (c) (and (>= c "A") (<= c "Z")))) (define dl-ident-char? (fn (c) (or (dl-lower? c) (dl-upper? c) (dl-digit? c) (= c "_")))) (define dl-ws? (fn (c) (or (= c " ") (= c "\t") (= c "\n") (= c "\r")))) (define dl-tokenize (fn (src) (let ((tokens (list)) (pos 0) (src-len (len src))) (define dl-peek (fn (offset) (if (< (+ pos offset) src-len) (nth src (+ pos offset)) nil))) (define cur (fn () (dl-peek 0))) (define advance! (fn (n) (set! pos (+ pos n)))) (define at? (fn (s) (let ((sl (len s))) (and (<= (+ pos sl) src-len) (= (slice src pos (+ pos sl)) s))))) (define dl-emit! (fn (type value start) (append! tokens (dl-make-token type value start)))) (define skip-line-comment! (fn () (when (and (< pos src-len) (not (= (cur) "\n"))) (do (advance! 1) (skip-line-comment!))))) (define skip-block-comment! (fn () (cond ((>= pos src-len) nil) ((and (= (cur) "*") (< (+ pos 1) src-len) (= (dl-peek 1) "/")) (advance! 2)) (else (do (advance! 1) (skip-block-comment!)))))) (define skip-ws! (fn () (cond ((>= pos src-len) nil) ((dl-ws? (cur)) (do (advance! 1) (skip-ws!))) ((= (cur) "%") (do (advance! 1) (skip-line-comment!) (skip-ws!))) ((and (= (cur) "/") (< (+ pos 1) src-len) (= (dl-peek 1) "*")) (do (advance! 2) (skip-block-comment!) (skip-ws!))) (else nil)))) (define read-ident (fn (start) (do (when (and (< pos src-len) (dl-ident-char? (cur))) (do (advance! 1) (read-ident start))) (slice src start pos)))) (define read-decimal-digits! (fn () (when (and (< pos src-len) (dl-digit? (cur))) (do (advance! 1) (read-decimal-digits!))))) (define read-number (fn (start) (do (read-decimal-digits!) (when (and (< pos src-len) (= (cur) ".") (< (+ pos 1) src-len) (dl-digit? (dl-peek 1))) (do (advance! 1) (read-decimal-digits!))) (parse-number (slice src start pos))))) (define read-quoted (fn (quote-char) (let ((chars (list))) (advance! 1) (define loop (fn () (cond ((>= pos src-len) nil) ((= (cur) "\\") (do (advance! 1) (when (< pos src-len) (let ((ch (cur))) (do (cond ((= ch "n") (append! chars "\n")) ((= ch "t") (append! chars "\t")) ((= ch "r") (append! chars "\r")) ((= ch "\\") (append! chars "\\")) ((= ch "'") (append! chars "'")) ((= ch "\"") (append! chars "\"")) (else (append! chars ch))) (advance! 1)))) (loop))) ((= (cur) quote-char) (advance! 1)) (else (do (append! chars (cur)) (advance! 1) (loop)))))) (loop) (join "" chars)))) (define scan! (fn () (do (skip-ws!) (when (< pos src-len) (let ((ch (cur)) (start pos)) (cond ((at? ":-") (do (dl-emit! "op" ":-" start) (advance! 2) (scan!))) ((at? "?-") (do (dl-emit! "op" "?-" start) (advance! 2) (scan!))) ((at? "<=") (do (dl-emit! "op" "<=" start) (advance! 2) (scan!))) ((at? ">=") (do (dl-emit! "op" ">=" start) (advance! 2) (scan!))) ((at? "!=") (do (dl-emit! "op" "!=" start) (advance! 2) (scan!))) ((dl-digit? ch) (do (dl-emit! "number" (read-number start) start) (scan!))) ((= ch "'") (do (dl-emit! "atom" (read-quoted "'") start) (scan!))) ((= ch "\"") (do (dl-emit! "string" (read-quoted "\"") start) (scan!))) ((dl-lower? ch) (do (dl-emit! "atom" (read-ident start) start) (scan!))) ((or (dl-upper? ch) (= ch "_")) (do (dl-emit! "var" (read-ident start) start) (scan!))) ((= ch "(") (do (dl-emit! "punct" "(" start) (advance! 1) (scan!))) ((= ch ")") (do (dl-emit! "punct" ")" start) (advance! 1) (scan!))) ((= ch ",") (do (dl-emit! "punct" "," start) (advance! 1) (scan!))) ((= ch ".") (do (dl-emit! "punct" "." start) (advance! 1) (scan!))) ((= ch "<") (do (dl-emit! "op" "<" start) (advance! 1) (scan!))) ((= ch ">") (do (dl-emit! "op" ">" start) (advance! 1) (scan!))) ((= ch "=") (do (dl-emit! "op" "=" start) (advance! 1) (scan!))) ((= ch "+") (do (dl-emit! "op" "+" start) (advance! 1) (scan!))) ((= ch "-") (do (dl-emit! "op" "-" start) (advance! 1) (scan!))) ((= ch "*") (do (dl-emit! "op" "*" start) (advance! 1) (scan!))) ((= ch "/") (do (dl-emit! "op" "/" start) (advance! 1) (scan!))) (else (do (advance! 1) (scan!))))))))) (scan!) (dl-emit! "eof" nil pos) tokens)))