Skip to content

Instantly share code, notes, and snippets.

@muhuk
Last active August 29, 2015 14:00
Show Gist options
  • Save muhuk/7c4a2b8db63886e2a9cd to your computer and use it in GitHub Desktop.
Save muhuk/7c4a2b8db63886e2a9cd to your computer and use it in GitHub Desktop.
clojure: Idiomatic tokenizing and performance
(defn match-ident
  "Tries to read an identifier from the front of `cs`.

  An identifier is one element accepted by `ident-first-char?` followed by
  the longest run of elements accepted by `ident-tail-char?`.

  Returns `[remaining [:ident text]]`, where `text` is the matched string
  and `remaining` is `cs` with the matched elements dropped, or nil when
  the first element of `cs` is rejected by `ident-first-char?`."
  [cs]
  (let [head (first cs)]
    (when (ident-first-char? head)
      (let [text      (->> (rest cs)
                           (take-while ident-tail-char?)
                           (cons head)
                           (apply str))
            ;; Skip as many elements as the matched text is long.
            remaining (drop (count text) cs)]
        [remaining [:ident text]]))))
(defn match-num
  "Tries to read a run of digits from the front of `cs`.

  Returns `[remaining [:number text]]`, where `text` is the longest leading
  run of elements accepted by `digit?` and `remaining` is `cs` with that
  run dropped.

  Returns nil when the first element is not a digit, or when the element
  immediately after the digit run is `\\.`.
  NOTE(review): presumably this leaves decimal literals to another
  matcher -- confirm against the rest of the tokenizer."
  [cs]
  (when (digit? (first cs))
    (let [digits    (apply str (take-while digit? cs))
          remaining (drop (count digits) cs)]
      (when (not= \. (first remaining))
        [remaining [:number digits]]))))
(defn match-ws
  "Tries to read a run of whitespace from the front of `cs`.

  Returns `[remaining [:ws text]]`, where `text` is the longest leading run
  of elements accepted by `whitespace-char?` and `remaining` is `cs` with
  that run dropped, or nil when the first element of `cs` is not
  whitespace."
  [cs]
  (when (whitespace-char? (first cs))
    (let [blanks    (apply str (take-while whitespace-char? cs))
          remaining (drop (count blanks) cs)]
      [remaining [:ws blanks]])))
;;; ...
(defn next-token
  "Runs the matchers against `cs` in a fixed order and returns the first
  truthy result.

  The order is: identifier, whitespace, punctuation, number, end of input,
  unknown. If no matcher produces a truthy value, the value returned by
  the last matcher (`match-unknown`) is returned as-is."
  [cs]
  (loop [[matcher & others] [match-ident
                             match-ws
                             match-punct
                             match-num
                             match-eof
                             match-unknown]]
    (let [result (matcher cs)]
      ;; Keep going only while nothing matched and matchers remain.
      (if (and (not result) others)
        (recur others)
        result))))
;; Here I build the lazy seq of tokens.
(defn token-seq
  "Returns a lazy sequence of the tokens read from `cs`.

  Each step calls `next-token` on the remaining input, emits the token it
  returns, and continues with the remaining input it returns.

  The whole step sits inside `lazy-seq`, so no tokenizing happens until the
  sequence is consumed, and realizing one token does not compute the next.

  This function never ends the sequence itself: every step conses a token
  and recurses, so callers must stop consuming on their own (for example
  with `take-while`)."
  [cs]
  (lazy-seq
    (let [[remaining token] (next-token cs)]
      (cons token (token-seq remaining)))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment