Created
August 30, 2013 22:45
-
-
Save jackschultz/6395017 to your computer and use it in GitHub Desktop.
Clojure implementation of a semi-naive article summarizer. Takes the url supplied and attempts to find the num-sentences most "valuable" sentences ranked by most words in common with other sentences. To run, throw into Leiningen and download the opennlp binaries.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns classify.core | |
(:use [boilerpipe-clj.core] | |
[opennlp.nlp] | |
[opennlp.treebank] | |
[clojure.pprint :only [pprint]] | |
[opennlp.tools.filters] | |
[clojure.set] | |
[stemmer.snowball]) | |
(:gen-class)) | |
;; Target article and how many sentences the summary should keep.
(def url "http://www.jsonline.com/business/harley-cultivates-a-new-generation-of-riders-b9986025z1-221852321.html")
(def num-sentences 10)

;; OpenNLP pipeline functions, each backed by a pre-trained binary model
;; that must be downloaded into models/ (see the gist description).
(def get-sentences (make-sentence-detector "models/en-sent.bin"))
(def tokenize (make-tokenizer "models/en-token.bin"))
;; NOTE(review): pos-tag, name-find and chunker are defined but never used
;; by the summarizer below — kept for experimentation.
(def pos-tag (make-pos-tagger "models/en-pos-maxent.bin"))
(def name-find (make-name-finder "models/en-ner-person.bin"))
(def chunker (make-treebank-chunker "models/en-chunker.bin"))

;; Snowball stemmer for English (from stemmer.snowball).
(def eng-stemmer (stemmer "english"))
(defn get-article
  "Grabs the page at `url` and strips the article text from it
  using boilerpipe. Returns the extracted text as a string."
  ;; Bug fix: the original placed this string AFTER the arg vector,
  ;; where it is an evaluated-and-discarded expression, not a docstring.
  [url]
  (get-text (slurp url)))
(defn gen-sentences
  "Fetches the article at `url` and splits it into a sequence of
  sentences with the OpenNLP sentence detector."
  ;; Bug fix: the original placed this string AFTER the arg vector,
  ;; where it is an evaluated-and-discarded expression, not a docstring.
  [url]
  (get-sentences (get-article url)))
(defn gen-tokens
  "Splits a single sentence string into word tokens using the
  OpenNLP tokenizer."
  [sen]
  (tokenize sen))
(defn stem
  "Given a tokenized sentence, returns a lazy seq with each word
  replaced by its English Snowball stem."
  [sen]
  (for [word sen]
    (eng-stemmer word)))
(defn sentence-intersection
  "Scores the similarity of two tokenized sentences: the number of
  distinct words they share, normalized by their average length.
  `sen1` and `sen2` are collections of words (e.g. from gen-tokens).
  Returns 0 when both sentences are empty (avoids 0/0 -> NaN)."
  [sen1 sen2]
  (let [shared  (count (intersection (set sen1) (set sen2)))
        avg-len (/ (+ (count sen1) (count sen2)) 2.0)]
    (if (zero? avg-len)
      0
      (/ shared avg-len))))
(defn all-pairs
  "Sums the sentence-intersection scores between `lead` (one tokenized
  sentence) and every tokenized sentence in `sens`. Returns 0 when
  `sens` is empty."
  ;; Fixes: truncated docstring ("Runs the "), parameter named `test`
  ;; shadowing clojure.core/test, and a manual loop/recur where the
  ;; idiomatic reduce expresses the same accumulation.
  [lead sens]
  (reduce (fn [tot other] (+ tot (sentence-intersection lead other)))
          0
          sens))
(defn score-sentences
  "Scores every tokenized sentence in `sens` (a vector) against all of
  the others and returns [index score] pairs sorted from highest to
  lowest score."
  [sens]
  (let [n        (count sens)
        score-at (fn [i]
                   [i (all-pairs (get sens i)
                                 (concat (subvec sens 0 i)
                                         (subvec sens (min n (inc i)))))])]
    (sort-by second > (mapv score-at (range n)))))
(defn sentence-order
  "Takes `val-map`, a seq of [index score] pairs already sorted
  best-first, keeps the indices of the top `num` entries, and returns
  them sorted ascending so the chosen sentences keep their original
  article order."
  [num val-map]
  (->> val-map
       (map first)
       (take num)
       sort))
(defn gather-sentences
  "Takes `val-map` ([index score] pairs sorted best-first, as produced
  by score-sentences), the sentences `sens`, and the number `num` of
  sentences wanted; returns the `num` best sentences in their original
  article order."
  ;; Fix: the original body was a stub that ignored val-map and sens
  ;; and simply returned num. (Unused by -main, so this is safe.)
  [val-map sens num]
  (map #(nth sens %)
       (sort (take num (map first val-map)))))
(defn -main
  "Entry point: prints the num-sentences best-scoring sentences of the
  article at `url` — or at a URL passed as the first command-line
  argument — in their original order."
  [& args]
  ;; Fix: the original used `def` inside defn, creating a namespace-wide
  ;; var instead of a local binding; `let` is the correct scoping form.
  ;; `vec` makes the sentence collection callable as a function below.
  (let [target    (or (first args) url)
        sentences (vec (gen-sentences target))]
    (pprint
     (map sentences
          (sentence-order num-sentences
                          (score-sentences
                           (into []
                                 (map (comp stem gen-tokens) sentences))))))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment