Created
August 30, 2013 22:45
-
-
Save jackschultz/6395017 to your computer and use it in GitHub Desktop.
Clojure implementation of a semi-naive article summarizer. Takes the url supplied and attempts to find the num-sentences most "valuable" sentences ranked by most words in common with other sentences. To run, throw into Leiningen and download the opennlp binaries.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns classify.core | |
(:use [boilerpipe-clj.core] | |
[opennlp.nlp] | |
[opennlp.treebank] | |
[clojure.pprint :only [pprint]] | |
[opennlp.tools.filters] | |
[clojure.set] | |
[stemmer.snowball]) | |
(:gen-class)) | |
;; Target article and how many sentences the summary should keep.
(def url "http://www.jsonline.com/business/harley-cultivates-a-new-generation-of-riders-b9986025z1-221852321.html")
(def num-sentences 10)

;; OpenNLP pipeline functions, each backed by a pre-trained binary model
;; that must be downloaded into models/ (see the gist description).
(def get-sentences (make-sentence-detector "models/en-sent.bin"))
(def tokenize (make-tokenizer "models/en-token.bin"))
;; NOTE(review): pos-tag, name-find and chunker are defined but never used
;; by the summarizer below — kept for experimentation.
(def pos-tag (make-pos-tagger "models/en-pos-maxent.bin"))
(def name-find (make-name-finder "models/en-ner-person.bin"))
(def chunker (make-treebank-chunker "models/en-chunker.bin"))

;; Snowball stemmer for English (from stemmer.snowball).
(def eng-stemmer (stemmer "english"))
(defn get-article
  "Grabs the page at `url` and strips the article text from it
  using boilerpipe. Returns the extracted text as a string."
  ;; Bug fix: the original placed this string AFTER the arg vector,
  ;; where it is an evaluated-and-discarded expression, not a docstring.
  [url]
  (get-text (slurp url)))
(defn gen-sentences
  "Fetches the article at `url` and splits it into a sequence of
  sentences with the OpenNLP sentence detector."
  ;; Bug fix: the original placed this string AFTER the arg vector,
  ;; where it is an evaluated-and-discarded expression, not a docstring.
  [url]
  (get-sentences (get-article url)))
(defn gen-tokens
  "Splits a single sentence string into word tokens using the
  OpenNLP tokenizer."
  [sen]
  (tokenize sen))
(defn stem
  "Given a tokenized sentence, returns a lazy seq with each word
  replaced by its English Snowball stem."
  [sen]
  (for [word sen]
    (eng-stemmer word)))
(defn sentence-intersection
  "Scores the similarity of two tokenized sentences: the number of
  distinct words they share, normalized by their average length.
  `sen1` and `sen2` are collections of words (e.g. from gen-tokens).
  Returns 0 when both sentences are empty (avoids 0/0 -> NaN)."
  [sen1 sen2]
  (let [shared  (count (intersection (set sen1) (set sen2)))
        avg-len (/ (+ (count sen1) (count sen2)) 2.0)]
    (if (zero? avg-len)
      0
      (/ shared avg-len))))
(defn all-pairs
  "Sums the sentence-intersection scores between `lead` (one tokenized
  sentence) and every tokenized sentence in `sens`. Returns 0 when
  `sens` is empty."
  ;; Fixes: truncated docstring ("Runs the "), parameter named `test`
  ;; shadowing clojure.core/test, and a manual loop/recur where the
  ;; idiomatic reduce expresses the same accumulation.
  [lead sens]
  (reduce (fn [tot other] (+ tot (sentence-intersection lead other)))
          0
          sens))
(defn score-sentences
  "Scores every tokenized sentence in `sens` (a vector) against all of
  the others and returns [index score] pairs sorted from highest to
  lowest score."
  [sens]
  (let [n        (count sens)
        score-at (fn [i]
                   [i (all-pairs (get sens i)
                                 (concat (subvec sens 0 i)
                                         (subvec sens (min n (inc i)))))])]
    (sort-by second > (mapv score-at (range n)))))
(defn sentence-order
  "Takes `val-map`, a seq of [index score] pairs already sorted
  best-first, keeps the indices of the top `num` entries, and returns
  them sorted ascending so the chosen sentences keep their original
  article order."
  [num val-map]
  (->> val-map
       (map first)
       (take num)
       sort))
(defn gather-sentences
  "Takes `val-map` ([index score] pairs sorted best-first, as produced
  by score-sentences), the sentences `sens`, and the number `num` of
  sentences wanted; returns the `num` best sentences in their original
  article order."
  ;; Fix: the original body was a stub that ignored val-map and sens
  ;; and simply returned num. (Unused by -main, so this is safe.)
  [val-map sens num]
  (map #(nth sens %)
       (sort (take num (map first val-map)))))
(defn -main
  "Entry point: prints the num-sentences best-scoring sentences of the
  article at `url` — or at a URL passed as the first command-line
  argument — in their original order."
  [& args]
  ;; Fix: the original used `def` inside defn, creating a namespace-wide
  ;; var instead of a local binding; `let` is the correct scoping form.
  ;; `vec` makes the sentence collection callable as a function below.
  (let [target    (or (first args) url)
        sentences (vec (gen-sentences target))]
    (pprint
     (map sentences
          (sentence-order num-sentences
                          (score-sentences
                           (into []
                                 (map (comp stem gen-tokens) sentences))))))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment