Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
The rest of our utility functions
(defn tokenizer-seq
  "Lazily walk `tokenizer`, yielding the current term read from `term-att`
  after each successful `.incrementToken`. The sequence ends (nil) the first
  time `.incrementToken` returns false. The stream is only advanced as the
  sequence is realized."
  [^TokenStream tokenizer ^TermAttribute term-att]
  (lazy-seq
    (if (.incrementToken tokenizer)
      ;; Read the term before building the (still unrealized) tail.
      (let [current-term (.term term-att)]
        (cons current-term (tokenizer-seq tokenizer term-att)))
      nil)))
(defn load-analyzer
  "Construct a StandardAnalyzer for Version/LUCENE_CURRENT, passing the
  given `stopwords` set to its constructor."
  [^java.util.Set stopwords]
  (new StandardAnalyzer Version/LUCENE_CURRENT stopwords))
(defn tokenize-text
  "Run `page-text` through `analyzer` and return its terms as a lazy-seq.
  The text is wrapped in a StringReader, a token stream is requested from the
  analyzer (nil is passed as the first argument), and a TermAttribute is
  registered on that stream so each term can be read back by `tokenizer-seq`."
  [^StandardAnalyzer analyzer page-text]
  (let [stream   (.tokenStream analyzer nil (java.io.StringReader. page-text))
        term-ref (.addAttribute stream TermAttribute)]
    (tokenizer-seq stream term-ref)))
(defn emit-tokens
  "Compute n-grams of a seq of tokens.

  Returns a lazy seq of sliding windows (step 1) over `tokens-seq`, each
  window being a seq of `n` consecutive tokens. `n` defaults to 1, so the
  single-argument call yields one single-element group per token. Trailing
  tokens that cannot fill a complete window are dropped; an empty input
  yields an empty seq."
  ;; The docstring must precede the parameter vector(s); placed after them it
  ;; is just a discarded body expression and never reaches the var's metadata.
  ([tokens-seq]
   (emit-tokens tokens-seq 1))
  ([tokens-seq n]
   (partition n 1 tokens-seq)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment