Created
September 1, 2013 21:09
-
-
Save jackschultz/6407330 to your computer and use it in GitHub Desktop.
Simple classification algorithm that uses URLs and the text in the articles.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns gb-or-syria.core | |
(:use [boilerpipe-clj.core] | |
[opennlp.nlp] | |
[opennlp.treebank] | |
[clojure.pprint :only [pprint]] | |
[opennlp.tools.filters] | |
[clojure.set] | |
[clojure.string :only [split-lines]] | |
[stemmer.snowball]) | |
(:gen-class)) | |
;; OpenNLP models, loaded once at namespace load time.
;; Paths are relative to the process working directory.
(def get-sentences (make-sentence-detector "models/en-sent.bin"))
(def tokenize (make-tokenizer "models/en-token.bin"))
(def pos-tag (make-pos-tagger "models/en-pos-maxent.bin"))
;; NOTE(review): name-find, chunker, and eng-stemmer are defined but
;; never referenced in the visible code — possibly left over from
;; experimentation; confirm before removing.
(def name-find (make-name-finder "models/en-ner-person.bin"))
(def chunker (make-treebank-chunker "models/en-chunker.bin"))
(def eng-stemmer (stemmer "english"))
;; English stop words, one per line in the models file.
(def stop-words
  (set (split-lines (slurp "models/english-stopwords"))))
;; Punctuation tokens to filter out.
;; NOTE(review): var name is misspelled ("puntctuation") and the set is
;; not referenced in the visible code; kept as-is since renaming a
;; public var could break external callers.
(def puntctuation-marks
  #{"+" "-" "*" "^" "." ";" "%" "\\" "," "..." "!" "?" ":" "\""})
(defn read-file-newlines
  "Slurps the file at `path` and returns its lines as a vector,
  in file order."
  [path]
  (vec (split-lines (slurp path))))
(defn get-article
  "Fetches the page at `url` and strips the boilerplate with
  boilerpipe's `get-text`, returning just the article text.
  Fix: the original placed this docstring AFTER the argument vector,
  where Clojure treats it as a discarded body expression rather than
  documentation; it must precede the params to become :doc metadata."
  [url]
  (get-text
    (slurp url)))
(defn get-sentence-tokens
  "Splits `article` into sentences, tokenizes each sentence, and runs
  the POS tagger over the tokens. Returns a vector with one tagged
  token sequence per sentence."
  [article]
  (mapv (comp pos-tag tokenize)
        (get-sentences article)))
(defn filter-nouns-verbs
  "From the tagged sentences `sens`, keeps every token tagged as a
  noun or a verb and returns just the words (POS tags dropped)."
  [sens]
  (let [noun-tokens (mapcat nouns sens)
        verb-tokens (mapcat verbs sens)]
    (map first (concat noun-tokens verb-tokens))))
(defn freqs-from-urls
  "Builds one combined word-frequency map over the nouns and verbs
  found in all the articles behind `urls`."
  [urls]
  (->> urls
       (map get-article)
       (map get-sentence-tokens)
       (mapcat filter-nouns-verbs)
       frequencies))
(defn num-total-words
  "Total number of word occurrences in `freq-map` — the sum of all
  frequency counts. Used to normalize similarity scores."
  [freq-map]
  (apply + (map second freq-map)))
(defn score-freqs | |
[poss freq] | |
(float | |
(/ | |
(reduce + (remove nil? (map freq (map first poss)))) | |
(num-total-words freq)))) | |
(defn get-freq-scores
  "Scores the frequency map `poss` against each training frequency
  distribution in `freqs`, returning one similarity score per
  distribution, in order."
  [poss & freqs]
  (for [freq-dist freqs]
    (score-freqs poss freq-dist)))
;; Files listing one article URL per line, split into training and
;; test sets for each topic (Syria vs. Great Britain).
(def train-path-syria "urls/train/syria.txt")
(def train-path-gb "urls/train/gb.txt")
(def test-path-syria "urls/test/syria.txt")
(def test-path-gb "urls/test/gb.txt")
(defn -main
  "Entry point: builds one training frequency distribution per topic
  from the training URL lists, then pretty-prints each test article's
  scores against both distributions (GB first, then Syria).
  Fix: the original used `def` inside the function body, which
  creates global vars as a side effect; `let` keeps these values
  local to the function."
  [& args]
  (let [syria-train      (freqs-from-urls
                           (read-file-newlines train-path-syria))
        gb-train         (freqs-from-urls
                           (read-file-newlines train-path-gb))
        gb-test-freqs    (map #(freqs-from-urls (list %))
                              (read-file-newlines test-path-gb))
        syria-test-freqs (map #(freqs-from-urls (list %))
                              (read-file-newlines test-path-syria))]
    (pprint
      (map #(get-freq-scores % gb-train syria-train) gb-test-freqs))
    (pprint
      (map #(get-freq-scores % gb-train syria-train) syria-test-freqs))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment