Created
May 7, 2010 21:43
-
-
Save dakrone/394035 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Protocol shared by every searcher implementation. Both operations take the
;; search term first and the text to examine second.
(defprotocol Searcher
  "Operations for searching a body of text for a term."
  (score
    [this term text]
    "Score the given text for its similarity to the term.")
  (rank
    [this term text]
    "Rank the sentences of the given text with respect to the term."))
;; Record holding the three components a context search needs.
;; Fields are set positionally by the constructor and are readable by keyword
;; under exactly these names: :get-sentences, :tokenize and :pos-tag.
(defrecord ContextSearcher
  [get-sentences
   tokenize
   pos-tag])
; ... lots of implementation details here ...
;; Searcher implementation for ContextSearcher records.
;;
;; The record's fields are named get-sentences, tokenize and pos-tag, so the
;; destructuring below must look up :tokenize and :pos-tag. Asking for
;; :tokenizer / :pos-tagger (as {:keys [...]} with those names would) yields
;; nil for both, because a record has no such keys.
;;
;; get-scored-terms, score-text and score-sentences are helpers defined
;; elsewhere in this namespace.
(extend-protocol Searcher
  ContextSearcher

  ;; Score `text` against `term`: derive the scored terms for the text, then
  ;; hand them to score-text. Returns whatever score-text returns.
  (score
    [{get-sentences :get-sentences
      tokenizer     :tokenize
      pos-tagger    :pos-tag}
     term text]
    (let [words (get-scored-terms text term get-sentences tokenizer pos-tagger)]
      (score-text text words get-sentences tokenizer)))

  ;; Rank the sentences of `text` against `term`: score each sentence with
  ;; score-sentences, then order the results by their second element, highest
  ;; first (ascending sort followed by reverse).
  (rank
    [{get-sentences :get-sentences
      tokenizer     :tokenize
      pos-tagger    :pos-tag}
     term text]
    (let [words (get-scored-terms text term get-sentences tokenizer pos-tagger)]
      (reverse
        (sort-by second
                 (score-sentences text words get-sentences tokenizer))))))
(defn make-context-searcher
  "Build a ContextSearcher from three models, given in this order: a
  sentence-detector model (smodel), a tokenizing model (tmodel) and a
  pos-tagging model (pmodel)."
  [smodel tmodel pmodel]
  ;; Arguments are evaluated left to right and passed positionally, matching
  ;; the record's field order: sentence detector, tokenizer, pos tagger.
  (ContextSearcher. (nlp/make-sentence-detector smodel)
                    (nlp/make-tokenizer tmodel)
                    (nlp/make-pos-tagger pmodel)))
;user=> (def cs (make-context-searcher "models/EnglishSD.bin.gz" "models/EnglishTok.bin.gz" "models/tag.bin.gz")) | |
;#'user/cs | |
;user=> (score cs "foo" "This is some foo text which has some foo things in it.") | |
;java.lang.Exception: Unable to resolve symbol: score in this context (NO_SOURCE_FILE:3) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment