;; lib/ocaml/tokenizer.sx — OCaml lexer. ;; ;; Tokens: ident, ctor (uppercase ident), keyword, number, string, char, op, eof. ;; Token shape: {:type :value :pos} via lex-make-token. ;; OCaml is not indentation-sensitive — no layout pass. ;; Block comments (* ... *) nest. There is no line-comment syntax. (prefix-rename "ocaml-" (quote ((make-token lex-make-token) (digit? lex-digit?) (hex-digit? lex-hex-digit?) (alpha? lex-alpha?) (alnum? lex-alnum?) (ident-start? lex-ident-start?) (ident-char? lex-ident-char?) (ws? lex-whitespace?)))) (define ocaml-keywords (list "and" "as" "assert" "begin" "class" "constraint" "do" "done" "downto" "else" "end" "exception" "external" "false" "for" "fun" "function" "functor" "if" "in" "include" "inherit" "initializer" "lazy" "let" "match" "method" "module" "mutable" "new" "nonrec" "object" "of" "open" "or" "private" "rec" "sig" "struct" "then" "to" "true" "try" "type" "val" "virtual" "when" "while" "with" "land" "lor" "lxor" "lsl" "lsr" "asr" "mod")) (define ocaml-keyword? (fn (word) (contains? ocaml-keywords word))) (define ocaml-upper? (fn (c) (and (not (= c nil)) (>= c "A") (<= c "Z")))) (define ocaml-tokenize (fn (src) (let ((tokens (list)) (pos 0) (src-len (len src))) (define ocaml-peek (fn (offset) (if (< (+ pos offset) src-len) (nth src (+ pos offset)) nil))) (define cur (fn () (ocaml-peek 0))) (define advance! (fn (n) (set! pos (+ pos n)))) (define push! (fn (type value start) (append! tokens (ocaml-make-token type value start)))) (define skip-block-comment! (fn (depth) (cond ((>= pos src-len) nil) ((and (= (cur) "*") (= (ocaml-peek 1) ")")) (begin (advance! 2) (when (> depth 1) (skip-block-comment! (- depth 1))))) ((and (= (cur) "(") (= (ocaml-peek 1) "*")) (begin (advance! 2) (skip-block-comment! (+ depth 1)))) (else (begin (advance! 1) (skip-block-comment! depth)))))) (define skip-ws! (fn () (cond ((>= pos src-len) nil) ((ocaml-ws? (cur)) (begin (advance! 1) (skip-ws!))) ((and (= (cur) "(") (= (ocaml-peek 1) "*")) (begin (advance! 2) (skip-block-comment! 1) (skip-ws!))) (else nil)))) (define read-ident (fn (start) (begin (when (and (< pos src-len) (ocaml-ident-char? (cur))) (begin (advance! 1) (read-ident start))) (when (and (< pos src-len) (= (cur) "'")) (begin (advance! 1) (read-ident start))) (slice src start pos)))) (define read-decimal-digits! (fn () (when (and (< pos src-len) (or (ocaml-digit? (cur)) (= (cur) "_"))) (begin (advance! 1) (read-decimal-digits!))))) (define read-hex-digits! (fn () (when (and (< pos src-len) (or (ocaml-hex-digit? (cur)) (= (cur) "_"))) (begin (advance! 1) (read-hex-digits!))))) (define read-exp-part! (fn () (when (and (< pos src-len) (or (= (cur) "e") (= (cur) "E"))) (let ((p1 (ocaml-peek 1))) (when (or (and (not (= p1 nil)) (ocaml-digit? p1)) (and (or (= p1 "+") (= p1 "-")) (< (+ pos 2) src-len) (ocaml-digit? (ocaml-peek 2)))) (begin (advance! 1) (when (and (< pos src-len) (or (= (cur) "+") (= (cur) "-"))) (advance! 1)) (read-decimal-digits!))))))) (define strip-underscores (fn (s) (let ((out (list)) (i 0) (n (len s))) (begin (define loop (fn () (when (< i n) (begin (when (not (= (nth s i) "_")) (append! out (nth s i))) (set! i (+ i 1)) (loop))))) (loop) (join "" out))))) (define read-number (fn (start) (cond ((and (= (cur) "0") (< (+ pos 1) src-len) (or (= (ocaml-peek 1) "x") (= (ocaml-peek 1) "X"))) (begin (advance! 2) (read-hex-digits!) (let ((raw (slice src (+ start 2) pos))) (parse-number (str "0x" (strip-underscores raw)))))) (else (begin (read-decimal-digits!) (when (and (< pos src-len) (= (cur) ".") (or (>= (+ pos 1) src-len) (not (= (ocaml-peek 1) ".")))) (begin (advance! 1) (read-decimal-digits!))) (read-exp-part!) (parse-number (strip-underscores (slice src start pos)))))))) (define read-string-literal (fn () (let ((chars (list))) (begin (advance! 1) (define loop (fn () (cond ((>= pos src-len) nil) ((= (cur) "\\") (begin (advance! 1) (when (< pos src-len) (let ((ch (cur))) (begin (cond ((= ch "n") (append! chars "\n")) ((= ch "t") (append! chars "\t")) ((= ch "r") (append! chars "\r")) ((= ch "b") (append! chars "\\b")) ((= ch "\\") (append! chars "\\")) ((= ch "'") (append! chars "'")) ((= ch "\"") (append! chars "\"")) ((= ch " ") nil) (else (append! chars ch))) (advance! 1)))) (loop))) ((= (cur) "\"") (advance! 1)) (else (begin (append! chars (cur)) (advance! 1) (loop)))))) (loop) (join "" chars))))) (define read-char-literal (fn () (begin (advance! 1) (let ((value (cond ((= (cur) "\\") (begin (advance! 1) (let ((ch (cur))) (begin (advance! 1) (cond ((= ch "n") "\n") ((= ch "t") "\t") ((= ch "r") "\r") ((= ch "b") "\\b") ((= ch "\\") "\\") ((= ch "'") "'") ((= ch "\"") "\"") (else ch)))))) (else (let ((ch (cur))) (begin (advance! 1) ch)))))) (begin (when (and (< pos src-len) (= (cur) "'")) (advance! 1)) value))))) (define try-punct (fn (start) (let ((c (cur)) (c1 (ocaml-peek 1)) (c2 (ocaml-peek 2))) (cond ((and (= c ";") (= c1 ";")) (begin (advance! 2) (push! "op" ";;" start) true)) ((and (= c "-") (= c1 ">")) (begin (advance! 2) (push! "op" "->" start) true)) ((and (= c "<") (= c1 "-")) (begin (advance! 2) (push! "op" "<-" start) true)) ((and (= c ":") (= c1 "=")) (begin (advance! 2) (push! "op" ":=" start) true)) ((and (= c ":") (= c1 ":")) (begin (advance! 2) (push! "op" "::" start) true)) ((and (= c "|") (= c1 "|")) (begin (advance! 2) (push! "op" "||" start) true)) ((and (= c "&") (= c1 "&")) (begin (advance! 2) (push! "op" "&&" start) true)) ((and (= c "<") (= c1 "=")) (begin (advance! 2) (push! "op" "<=" start) true)) ((and (= c ">") (= c1 "=")) (begin (advance! 2) (push! "op" ">=" start) true)) ((and (= c "<") (= c1 ">")) (begin (advance! 2) (push! "op" "<>" start) true)) ((and (= c "=") (= c1 "=")) (begin (advance! 2) (push! "op" "==" start) true)) ((and (= c "!") (= c1 "=")) (begin (advance! 2) (push! "op" "!=" start) true)) ((and (= c "|") (= c1 ">")) (begin (advance! 2) (push! "op" "|>" start) true)) ((and (= c "<") (= c1 "|")) (begin (advance! 2) (push! "op" "<|" start) true)) ((and (= c "@") (= c1 "@")) (begin (advance! 2) (push! "op" "@@" start) true)) ((and (= c "*") (= c1 "*")) (begin (advance! 2) (push! "op" "**" start) true)) ((or (= c "+") (= c "-") (= c "*") (= c "/") (= c "%") (= c "^") (= c "<") (= c ">") (= c "=") (= c "(") (= c ")") (= c "{") (= c "}") (= c "[") (= c "]") (= c ";") (= c ":") (= c ",") (= c ".") (= c "|") (= c "!") (= c "&") (= c "@") (= c "?") (= c "~") (= c "#")) (begin (advance! 1) (push! "op" c start) true)) (else false))))) (define step (fn () (begin (skip-ws!) (when (< pos src-len) (let ((start pos) (c (cur))) (cond ((ocaml-ident-start? c) (let ((word (read-ident start))) (begin (cond ((ocaml-keyword? word) (push! "keyword" word start)) ((ocaml-upper? c) (push! "ctor" word start)) (else (push! "ident" word start))) (step)))) ((ocaml-digit? c) (let ((v (read-number start))) (begin (push! "number" v start) (step)))) ((= c "\"") (let ((s (read-string-literal))) (begin (push! "string" s start) (step)))) ((and (= c "'") (< (+ pos 1) src-len) (or (and (= (ocaml-peek 1) "\\") (< (+ pos 3) src-len) (= (ocaml-peek 3) "'")) (and (not (= (ocaml-peek 1) "\\")) (< (+ pos 2) src-len) (= (ocaml-peek 2) "'")))) (let ((v (read-char-literal))) (begin (push! "char" v start) (step)))) ((= c "'") (begin (advance! 1) (when (and (< pos src-len) (ocaml-ident-start? (cur))) (begin (advance! 1) (read-ident (+ start 1)))) (push! "tyvar" (slice src (+ start 1) pos) start) (step))) ((try-punct start) (step)) (else (error (str "ocaml-tokenize: unexpected char " c " at " pos))))))))) (step) (push! "eof" nil pos) tokens)))