Created
September 1, 2013 21:09
-
-
Save jackschultz/6407330 to your computer and use it in GitHub Desktop.
Simple classification algorithm that uses URLs and the text in the articles.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns gb-or-syria.core | |
(:use [boilerpipe-clj.core] | |
[opennlp.nlp] | |
[opennlp.treebank] | |
[clojure.pprint :only [pprint]] | |
[opennlp.tools.filters] | |
[clojure.set] | |
[clojure.string :only [split-lines]] | |
[stemmer.snowball]) | |
(:gen-class)) | |
;; OpenNLP models, loaded once at namespace load time.
;; Paths are relative to the process working directory.
(def get-sentences (make-sentence-detector "models/en-sent.bin"))
(def tokenize (make-tokenizer "models/en-token.bin"))
(def pos-tag (make-pos-tagger "models/en-pos-maxent.bin"))
;; NOTE(review): name-find, chunker, and eng-stemmer are defined but
;; never referenced in the visible code — possibly left over from
;; experimentation; confirm before removing.
(def name-find (make-name-finder "models/en-ner-person.bin"))
(def chunker (make-treebank-chunker "models/en-chunker.bin"))
(def eng-stemmer (stemmer "english"))
;; English stop words, one per line in the models file.
(def stop-words
  (set (split-lines (slurp "models/english-stopwords"))))
;; Punctuation tokens to filter out.
;; NOTE(review): var name is misspelled ("puntctuation") and the set is
;; not referenced in the visible code; kept as-is since renaming a
;; public var could break external callers.
(def puntctuation-marks
  #{"+" "-" "*" "^" "." ";" "%" "\\" "," "..." "!" "?" ":" "\""})
(defn read-file-newlines
  "Slurps the file at `path` and returns its lines as a vector,
  in file order."
  [path]
  (vec (split-lines (slurp path))))
(defn get-article
  "Fetches the page at `url` and strips the boilerplate with
  boilerpipe's `get-text`, returning just the article text.
  Fix: the original placed this docstring AFTER the argument vector,
  where Clojure treats it as a discarded body expression rather than
  documentation; it must precede the params to become :doc metadata."
  [url]
  (get-text
    (slurp url)))
(defn get-sentence-tokens
  "Splits `article` into sentences, tokenizes each sentence, and runs
  the POS tagger over the tokens. Returns a vector with one tagged
  token sequence per sentence."
  [article]
  (mapv (comp pos-tag tokenize)
        (get-sentences article)))
(defn filter-nouns-verbs
  "From the tagged sentences `sens`, keeps every token tagged as a
  noun or a verb and returns just the words (POS tags dropped)."
  [sens]
  (let [noun-tokens (mapcat nouns sens)
        verb-tokens (mapcat verbs sens)]
    (map first (concat noun-tokens verb-tokens))))
(defn freqs-from-urls
  "Builds one combined word-frequency map over the nouns and verbs
  found in all the articles behind `urls`."
  [urls]
  (->> urls
       (map get-article)
       (map get-sentence-tokens)
       (mapcat filter-nouns-verbs)
       frequencies))
(defn num-total-words
  "Total number of word occurrences in `freq-map` — the sum of all
  frequency counts. Used to normalize similarity scores."
  [freq-map]
  (apply + (map second freq-map)))
(defn score-freqs | |
[poss freq] | |
(float | |
(/ | |
(reduce + (remove nil? (map freq (map first poss)))) | |
(num-total-words freq)))) | |
(defn get-freq-scores
  "Scores the frequency map `poss` against each training frequency
  distribution in `freqs`, returning one similarity score per
  distribution, in order."
  [poss & freqs]
  (for [freq-dist freqs]
    (score-freqs poss freq-dist)))
;; Files listing one article URL per line, split into training and
;; test sets for each topic (Syria vs. Great Britain).
(def train-path-syria "urls/train/syria.txt")
(def train-path-gb "urls/train/gb.txt")
(def test-path-syria "urls/test/syria.txt")
(def test-path-gb "urls/test/gb.txt")
(defn -main
  "Entry point: builds one training frequency distribution per topic
  from the training URL lists, then pretty-prints each test article's
  scores against both distributions (GB first, then Syria).
  Fix: the original used `def` inside the function body, which
  creates global vars as a side effect; `let` keeps these values
  local to the function."
  [& args]
  (let [syria-train      (freqs-from-urls
                           (read-file-newlines train-path-syria))
        gb-train         (freqs-from-urls
                           (read-file-newlines train-path-gb))
        gb-test-freqs    (map #(freqs-from-urls (list %))
                              (read-file-newlines test-path-gb))
        syria-test-freqs (map #(freqs-from-urls (list %))
                              (read-file-newlines test-path-syria))]
    (pprint
      (map #(get-freq-scores % gb-train syria-train) gb-test-freqs))
    (pprint
      (map #(get-freq-scores % gb-train syria-train) syria-test-freqs))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment