Skip to content

Instantly share code, notes, and snippets.

@jackschultz
Created September 1, 2013 21:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jackschultz/6407330 to your computer and use it in GitHub Desktop.
Save jackschultz/6407330 to your computer and use it in GitHub Desktop.
Simple classification algorithm that uses urls and the text in the articles.
(ns gb-or-syria.core
(:use [boilerpipe-clj.core]
[opennlp.nlp]
[opennlp.treebank]
[clojure.pprint :only [pprint]]
[opennlp.tools.filters]
[clojure.set]
[clojure.string :only [split-lines]]
[stemmer.snowball])
(:gen-class))
;; OpenNLP pipeline components, each loaded from a pre-trained model file
;; under models/ at namespace load time (fails fast if a file is missing).
(def get-sentences (make-sentence-detector "models/en-sent.bin"))
(def tokenize (make-tokenizer "models/en-token.bin"))
(def pos-tag (make-pos-tagger "models/en-pos-maxent.bin"))
;; NOTE(review): name-find and chunker are defined but never used in this
;; file — presumably kept for experimentation.
(def name-find (make-name-finder "models/en-ner-person.bin"))
(def chunker (make-treebank-chunker "models/en-chunker.bin"))
;; Snowball stemmer for English (also unused below).
(def eng-stemmer (stemmer "english"))
;; English stopwords, one word per line in the model file, as a set
;; for fast membership tests.
(def stop-words
  (-> "models/english-stopwords" slurp split-lines set))
;; Punctuation tokens to be filtered out of token streams.
;; NOTE(review): the var name is misspelled ("puntctuation") — kept as-is
;; since other namespaces may refer to it; it is unused in this file.
(def puntctuation-marks
#{"+" "-" "*" "^" "." ";" "%" "\\" "," "..." "!" "?" ":" "\""})
(defn read-file-newlines
  "Reads the file at path and returns its lines, in order, as a vector."
  [path]
  (vec (split-lines (slurp path))))
(defn get-article
  "Fetches the page at url and strips the article text out of the HTML
  via boilerpipe's get-text.
  Fix: the original placed the docstring AFTER the argument vector, where
  it is an evaluated-and-discarded expression rather than a docstring;
  it now precedes the params as `defn` requires."
  [url]
  (get-text (slurp url)))
(defn get-sentence-tokens
  "Splits article into sentences, tokenizes each sentence, and POS-tags
  the tokens. Returns a vector with one tagged-token seq per sentence."
  [article]
  (->> article
       get-sentences
       (map tokenize)
       (map pos-tag)
       vec))
(defn filter-nouns-verbs
  "Given POS-tagged sentences, returns the word part of every token
  tagged as a noun or a verb (all nouns first, then all verbs)."
  [sens]
  (let [noun-tokens (mapcat nouns sens)
        verb-tokens (mapcat verbs sens)]
    (map first (concat noun-tokens verb-tokens))))
(defn freqs-from-urls
  "Downloads each article in urls and builds a single frequency map
  of its noun and verb words."
  [urls]
  (->> urls
       (map get-article)
       (map get-sentence-tokens)
       (mapcat filter-nouns-verbs)
       frequencies))
(defn num-total-words
  "Sums the counts in a word-frequency map, giving the total number of
  words; used to normalize similarity scores."
  [freq-map]
  (reduce + (vals freq-map)))
(defn score-freqs
  "Scores how well the tagged tokens in poss (a seq of [word & tags]
  vectors) are represented in the frequency map freq: the summed
  frequencies of the words found in freq, divided by freq's total
  word count. Returns a float.
  Fix: guard against an empty/zero-total freq map, which previously
  threw ArithmeticException (divide by zero); now returns 0.0."
  [poss freq]
  (let [total (num-total-words freq)]
    (if (zero? total)
      (float 0.0)
      (float
        (/ (reduce + (remove nil? (map freq (map first poss))))
           total)))))
(defn get-freq-scores
  "Scores poss against each supplied frequency map, yielding one
  similarity score per map, in argument order."
  [poss & freqs]
  (for [freq freqs]
    (score-freqs poss freq)))
;; Paths to text files listing article URLs, one URL per line, split
;; into training and test sets for the two classes (Syria / Green Bay).
(def train-path-syria "urls/train/syria.txt")
(def train-path-gb "urls/train/gb.txt")
(def test-path-syria "urls/test/syria.txt")
(def test-path-gb "urls/test/gb.txt")
(defn -main
  "Builds word-frequency models from the Green Bay and Syria training
  URL lists, then pretty-prints, for each test article, its similarity
  scores against both models (gb score first, syria score second) —
  first for the GB test set, then for the Syria test set.
  Fix: the original used `def` inside the function, which creates
  global vars at runtime (a Clojure antipattern); the values are only
  used locally, so they are now bound with `let`."
  [& args]
  (let [syria-train (freqs-from-urls
                      (read-file-newlines train-path-syria))
        gb-train (freqs-from-urls
                   (read-file-newlines train-path-gb))
        ;; Per-article frequency maps: each test URL gets its own model
        ;; so it can be scored independently.
        gb-test-freqs (map #(freqs-from-urls (list %))
                           (read-file-newlines test-path-gb))
        syria-test-freqs (map #(freqs-from-urls (list %))
                              (read-file-newlines test-path-syria))]
    (pprint
      (map #(get-freq-scores % gb-train syria-train) gb-test-freqs))
    (pprint
      (map #(get-freq-scores % gb-train syria-train) syria-test-freqs))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment