diff --git a/lib/datalog/tests/tokenize.sx b/lib/datalog/tests/tokenize.sx new file mode 100644 index 00000000..920c7257 --- /dev/null +++ b/lib/datalog/tests/tokenize.sx @@ -0,0 +1,139 @@ +;; lib/datalog/tests/tokenize.sx — tokenizer unit tests +;; +;; Run via: bash lib/datalog/conformance.sh +;; Or: (load "lib/datalog/tokenizer.sx") (load "lib/datalog/tests/tokenize.sx") +;; (dl-tokenize-tests-run!) + +(define dl-tk-pass 0) +(define dl-tk-fail 0) +(define dl-tk-failures (list)) + +(define + dl-tk-test! + (fn + (name got expected) + (if + (= got expected) + (set! dl-tk-pass (+ dl-tk-pass 1)) + (do + (set! dl-tk-fail (+ dl-tk-fail 1)) + (append! + dl-tk-failures + (str name "\n expected: " expected "\n got: " got)))))) + +(define dl-tk-types (fn (toks) (map (fn (t) (get t :type)) toks))) +(define dl-tk-values (fn (toks) (map (fn (t) (get t :value)) toks))) + +(define + dl-tk-run-all! + (fn + () + (do + (dl-tk-test! "empty" (dl-tk-types (dl-tokenize "")) (list "eof")) + (dl-tk-test! + "atom dot" + (dl-tk-types (dl-tokenize "foo.")) + (list "atom" "punct" "eof")) + (dl-tk-test! + "atom dot value" + (dl-tk-values (dl-tokenize "foo.")) + (list "foo" "." nil)) + (dl-tk-test! + "var" + (dl-tk-types (dl-tokenize "X.")) + (list "var" "punct" "eof")) + (dl-tk-test! + "underscore var" + (dl-tk-types (dl-tokenize "_x.")) + (list "var" "punct" "eof")) + (dl-tk-test! + "integer" + (dl-tk-values (dl-tokenize "42")) + (list 42 nil)) + (dl-tk-test! + "decimal" + (dl-tk-values (dl-tokenize "3.14")) + (list 3.14 nil)) + (dl-tk-test! + "string" + (dl-tk-values (dl-tokenize "\"hello\"")) + (list "hello" nil)) + (dl-tk-test! + "quoted atom" + (dl-tk-types (dl-tokenize "'two words'")) + (list "atom" "eof")) + (dl-tk-test! + "quoted atom value" + (dl-tk-values (dl-tokenize "'two words'")) + (list "two words" nil)) + (dl-tk-test! ":-" (dl-tk-values (dl-tokenize ":-")) (list ":-" nil)) + (dl-tk-test! "?-" (dl-tk-values (dl-tokenize "?-")) (list "?-" nil)) + (dl-tk-test! "<=" (dl-tk-values (dl-tokenize "<=")) (list "<=" nil)) + (dl-tk-test! ">=" (dl-tk-values (dl-tokenize ">=")) (list ">=" nil)) + (dl-tk-test! "!=" (dl-tk-values (dl-tokenize "!=")) (list "!=" nil)) + (dl-tk-test! + "single op values" + (dl-tk-values (dl-tokenize "< > = + - * /")) + (list "<" ">" "=" "+" "-" "*" "/" nil)) + (dl-tk-test! + "single op types" + (dl-tk-types (dl-tokenize "< > = + - * /")) + (list "op" "op" "op" "op" "op" "op" "op" "eof")) + (dl-tk-test! + "punct" + (dl-tk-values (dl-tokenize "( ) , .")) + (list "(" ")" "," "." nil)) + (dl-tk-test! + "fact tokens" + (dl-tk-types (dl-tokenize "parent(tom, bob).")) + (list "atom" "punct" "atom" "punct" "atom" "punct" "punct" "eof")) + (dl-tk-test! + "rule shape" + (dl-tk-types (dl-tokenize "p(X) :- q(X).")) + (list + "atom" + "punct" + "var" + "punct" + "op" + "atom" + "punct" + "var" + "punct" + "punct" + "eof")) + (dl-tk-test! + "comparison literal" + (dl-tk-values (dl-tokenize "<(X, 5)")) + (list "<" "(" "X" "," 5 ")" nil)) + (dl-tk-test! + "is form" + (dl-tk-values (dl-tokenize "is(Y, +(X, 1))")) + (list "is" "(" "Y" "," "+" "(" "X" "," 1 ")" ")" nil)) + (dl-tk-test! + "line comment" + (dl-tk-types (dl-tokenize "% comment line\nfoo.")) + (list "atom" "punct" "eof")) + (dl-tk-test! + "block comment" + (dl-tk-types (dl-tokenize "/* a\nb */ x.")) + (list "atom" "punct" "eof")) + (dl-tk-test! + "whitespace" + (dl-tk-types (dl-tokenize " foo ,\t bar .")) + (list "atom" "punct" "atom" "punct" "eof")) + (dl-tk-test! + "positions" + (map (fn (t) (get t :pos)) (dl-tokenize "foo bar")) + (list 0 4 7))))) + +(define + dl-tokenize-tests-run! + (fn + () + (do + (set! dl-tk-pass 0) + (set! dl-tk-fail 0) + (set! dl-tk-failures (list)) + (dl-tk-run-all!) + {:failures dl-tk-failures :total (+ dl-tk-pass dl-tk-fail) :passed dl-tk-pass :failed dl-tk-fail}))) diff --git a/lib/datalog/tokenizer.sx b/lib/datalog/tokenizer.sx new file mode 100644 index 00000000..b169b64e --- /dev/null +++ b/lib/datalog/tokenizer.sx @@ -0,0 +1,254 @@ +;; lib/datalog/tokenizer.sx — Datalog source → token stream +;; +;; Tokens: {:type T :value V :pos P} +;; Types: +;; "atom" — lowercase-start ident or quoted 'atom' +;; "var" — uppercase-start or _-start ident (value is the name) +;; "number" — numeric literal (decoded to number) +;; "string" — "..." string literal +;; "punct" — ( ) , . +;; "op" — :- ?- <= >= != < > = + - * / +;; "eof" +;; +;; Datalog has no function symbols in arg position; the parser still +;; accepts nested compounds for arithmetic ((is X (+ A B))) but safety +;; analysis rejects non-arithmetic nesting at rule-load time. + +(define dl-make-token (fn (type value pos) {:type type :value value :pos pos})) + +(define dl-digit? (fn (c) (and (>= c "0") (<= c "9")))) +(define dl-lower? (fn (c) (and (>= c "a") (<= c "z")))) +(define dl-upper? (fn (c) (and (>= c "A") (<= c "Z")))) + +(define + dl-ident-char? + (fn (c) (or (dl-lower? c) (dl-upper? c) (dl-digit? c) (= c "_")))) + +(define dl-ws? (fn (c) (or (= c " ") (= c "\t") (= c "\n") (= c "\r")))) + +(define + dl-tokenize + (fn + (src) + (let + ((tokens (list)) (pos 0) (src-len (len src))) + (define + dl-peek + (fn + (offset) + (if (< (+ pos offset) src-len) (nth src (+ pos offset)) nil))) + (define cur (fn () (dl-peek 0))) + (define advance! (fn (n) (set! pos (+ pos n)))) + (define + at? + (fn + (s) + (let + ((sl (len s))) + (and (<= (+ pos sl) src-len) (= (slice src pos (+ pos sl)) s))))) + (define + dl-emit! + (fn + (type value start) + (append! tokens (dl-make-token type value start)))) + (define + skip-line-comment! + (fn + () + (when + (and (< pos src-len) (not (= (cur) "\n"))) + (do (advance! 1) (skip-line-comment!))))) + (define + skip-block-comment! + (fn + () + (cond + ((>= pos src-len) nil) + ((and (= (cur) "*") (< (+ pos 1) src-len) (= (dl-peek 1) "/")) + (advance! 2)) + (else (do (advance! 1) (skip-block-comment!)))))) + (define + skip-ws! + (fn + () + (cond + ((>= pos src-len) nil) + ((dl-ws? (cur)) (do (advance! 1) (skip-ws!))) + ((= (cur) "%") + (do (advance! 1) (skip-line-comment!) (skip-ws!))) + ((and (= (cur) "/") (< (+ pos 1) src-len) (= (dl-peek 1) "*")) + (do (advance! 2) (skip-block-comment!) (skip-ws!))) + (else nil)))) + (define + read-ident + (fn + (start) + (do + (when + (and (< pos src-len) (dl-ident-char? (cur))) + (do (advance! 1) (read-ident start))) + (slice src start pos)))) + (define + read-decimal-digits! + (fn + () + (when + (and (< pos src-len) (dl-digit? (cur))) + (do (advance! 1) (read-decimal-digits!))))) + (define + read-number + (fn + (start) + (do + (read-decimal-digits!) + (when + (and + (< pos src-len) + (= (cur) ".") + (< (+ pos 1) src-len) + (dl-digit? (dl-peek 1))) + (do (advance! 1) (read-decimal-digits!))) + (parse-number (slice src start pos))))) + (define + read-quoted + (fn + (quote-char) + (let + ((chars (list))) + (advance! 1) + (define + loop + (fn + () + (cond + ((>= pos src-len) nil) + ((= (cur) "\\") + (do + (advance! 1) + (when + (< pos src-len) + (let + ((ch (cur))) + (do + (cond + ((= ch "n") (append! chars "\n")) + ((= ch "t") (append! chars "\t")) + ((= ch "r") (append! chars "\r")) + ((= ch "\\") (append! chars "\\")) + ((= ch "'") (append! chars "'")) + ((= ch "\"") (append! chars "\"")) + (else (append! chars ch))) + (advance! 1)))) + (loop))) + ((= (cur) quote-char) (advance! 1)) + (else + (do (append! chars (cur)) (advance! 1) (loop)))))) + (loop) + (join "" chars)))) + (define + scan! + (fn + () + (do + (skip-ws!) + (when + (< pos src-len) + (let + ((ch (cur)) (start pos)) + (cond + ((at? ":-") + (do + (dl-emit! "op" ":-" start) + (advance! 2) + (scan!))) + ((at? "?-") + (do + (dl-emit! "op" "?-" start) + (advance! 2) + (scan!))) + ((at? "<=") + (do + (dl-emit! "op" "<=" start) + (advance! 2) + (scan!))) + ((at? ">=") + (do + (dl-emit! "op" ">=" start) + (advance! 2) + (scan!))) + ((at? "!=") + (do + (dl-emit! "op" "!=" start) + (advance! 2) + (scan!))) + ((dl-digit? ch) + (do + (dl-emit! "number" (read-number start) start) + (scan!))) + ((= ch "'") + (do (dl-emit! "atom" (read-quoted "'") start) (scan!))) + ((= ch "\"") + (do (dl-emit! "string" (read-quoted "\"") start) (scan!))) + ((dl-lower? ch) + (do (dl-emit! "atom" (read-ident start) start) (scan!))) + ((or (dl-upper? ch) (= ch "_")) + (do (dl-emit! "var" (read-ident start) start) (scan!))) + ((= ch "(") + (do + (dl-emit! "punct" "(" start) + (advance! 1) + (scan!))) + ((= ch ")") + (do + (dl-emit! "punct" ")" start) + (advance! 1) + (scan!))) + ((= ch ",") + (do + (dl-emit! "punct" "," start) + (advance! 1) + (scan!))) + ((= ch ".") + (do + (dl-emit! "punct" "." start) + (advance! 1) + (scan!))) + ((= ch "<") + (do + (dl-emit! "op" "<" start) + (advance! 1) + (scan!))) + ((= ch ">") + (do + (dl-emit! "op" ">" start) + (advance! 1) + (scan!))) + ((= ch "=") + (do + (dl-emit! "op" "=" start) + (advance! 1) + (scan!))) + ((= ch "+") + (do + (dl-emit! "op" "+" start) + (advance! 1) + (scan!))) + ((= ch "-") + (do + (dl-emit! "op" "-" start) + (advance! 1) + (scan!))) + ((= ch "*") + (do + (dl-emit! "op" "*" start) + (advance! 1) + (scan!))) + ((= ch "/") + (do + (dl-emit! "op" "/" start) + (advance! 1) + (scan!))) + (else (do (advance! 1) (scan!))))))))) + (scan!) + (dl-emit! "eof" nil pos) + tokens)))