Created
November 4, 2011 22:32
-
-
Save sorenmacbeth/1340665 to your computer and use it in GitHub Desktop.
The rest of our utility functions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(defn tokenizer-seq
  "Lazily walk `tokenizer`, yielding the value of `(.term term-att)` after
  each successful `.incrementToken` call. The sequence ends (nil) as soon as
  `.incrementToken` returns false. This function does not close or reset the
  stream itself."
  [^TokenStream tokenizer ^TermAttribute term-att]
  (lazy-seq
    (if (.incrementToken tokenizer)
      ;; Read the term right after advancing; the recursive call is itself
      ;; lazy, so the stream is only advanced again when the tail is realized.
      (let [current-term (.term term-att)]
        (cons current-term (tokenizer-seq tokenizer term-att)))
      nil)))
(defn load-analyzer
  "Construct a StandardAnalyzer for Version/LUCENE_CURRENT, passing
  `stopwords` (a java.util.Set) to its constructor."
  [^java.util.Set stopwords]
  (new StandardAnalyzer Version/LUCENE_CURRENT stopwords))
(defn tokenize-text
  "Run `page-text` through `analyzer` and return the resulting terms as a
  lazy seq (built by `tokenizer-seq`)."
  [^StandardAnalyzer analyzer page-text]
  (let [;; No field name is supplied to the analyzer (nil).
        stream (.tokenStream analyzer nil (java.io.StringReader. page-text))
        ;; Register the attribute the terms will be read from on each step.
        attr   (.addAttribute stream TermAttribute)]
    (tokenizer-seq stream attr)))
(defn emit-tokens
  "Compute n-grams of a seq of tokens.

  Returns a lazy seq of sliding windows (each a seq of `n` consecutive
  tokens, advancing one token at a time) over `tokens-seq`. `n` defaults
  to 1, in which case every token is wrapped in its own single-element seq.
  Trailing windows with fewer than `n` tokens are dropped, so an input
  shorter than `n` yields an empty seq."
  ([tokens-seq]
   (emit-tokens tokens-seq 1))
  ([tokens-seq n]
   ;; Step of 1 gives overlapping windows, i.e. classic n-grams.
   (partition n 1 tokens-seq)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment