Created
February 2, 2015 14:47
-
-
Save iantruslove/f310f6346dfeb85976d0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns introspect.core | |
(:require [clojure.core.match :as match] | |
[clojure.java.io :as io] | |
[clojure-mail.core :as mail] | |
[clojure-mail.message :as message] | |
[opennlp.nlp :as onlp] | |
[opennlp.treebank :as treebank] | |
[opennlp.tools.filters :as filters] | |
[corenlp :as nlp]) | |
(:import (org.apache.tika Tika) | |
(org.apache.tika.parser Parser))) | |
;; Mail credentials are read from the environment so that no secret lives in
;; source control.
;; NOTE(review): the password that used to be hard-coded on this line has been
;; published and should be rotated.
(defn- mail-credential
  "Returns the value of the environment variable `var-name`, falling back to
  `default` when the variable is unset. Throws an ex-info naming the variable
  when neither is available."
  ([var-name] (mail-credential var-name nil))
  ([var-name default]
   (or (System/getenv var-name)
       default
       (throw (ex-info (str "Missing required environment variable " var-name)
                       {:variable var-name})))))

;; Mail store built from INTROSPECT_MAIL_USER (defaults to the original
;; account name) and INTROSPECT_MAIL_PASSWORD (required, no default).
(defonce sstore
  (mail/gen-store (mail-credential "INTROSPECT_MAIL_USER" "ian@brownsofa.org")
                  (mail-credential "INTROSPECT_MAIL_PASSWORD")))

;; Messages in the inbox of `sstore`, fetched once per REPL session (defonce).
(defonce inbox-messages (mail/inbox sstore))
;; To convert a JavaMail message into a Clojure map we call clojure-mail's
;; message/read-message and then pick out the plain-text body part.
(defn read-message
  "Returns a map of interesting data and metadata about the message,
  including the text of the message body in the `:body-text` key. The raw
  `:body` key is removed from the result.

  `:body-text` is taken from the first body part whose content type starts
  with text/plain (compared case-insensitively, ignoring parameters such as
  the charset); it is nil when no such part exists.

  Returns an empty map when reading the message throws an Exception."
  [^javax.mail.Message m]
  (letfn [(plain-text? [part]
            ;; MIME media types are case-insensitive and the charset
            ;; parameter varies between senders, so only the media-type
            ;; prefix is compared. Locale/ROOT avoids locale-specific
            ;; lower-casing surprises (e.g. the Turkish dotless i).
            (-> (str (:content-type part))
                (.toLowerCase java.util.Locale/ROOT)
                (.startsWith "text/plain")))]
    (try
      (let [msg   (message/read-message m)
            body  (:body msg)
            ;; NOTE(review): single-part messages appear to come back as one
            ;; part map rather than a seq of parts -- confirm against
            ;; clojure-mail. Wrapping lets both shapes share the same lookup.
            parts (if (map? body) [body] body)]
        (-> msg
            (assoc :body-text (:body (first (filter plain-text? parts))))
            (dissoc :body)))
      ;; Deliberately best-effort: an unreadable message yields {} instead of
      ;; aborting a batch. Only Exception is caught so that JVM Errors
      ;; (OutOfMemoryError and friends) still propagate.
      (catch Exception _
        {}))))
;; REPL scratch: pretty-print the parsed form of the first three inbox messages.
(comment
  (->> inbox-messages
       (take 3)
       (map read-message)
       clojure.pprint/pprint))
;; REPL scratch: OpenNLP experiments. Nothing in this form is evaluated when
;; the namespace loads; evaluate the inner forms one at a time from the REPL.
(comment
  ;; Model-backed helpers, each built from a file under models/.
  (defonce get-sentences (onlp/make-sentence-detector "models/en-sent.bin"))
  (defonce tokenize (onlp/make-tokenizer "models/en-token.bin"))
  ;;(def detokenize (onlp/make-detokenizer "models/english-detokenizer.xml"))
  (defonce pos-tag (onlp/make-pos-tagger "models/en-pos-maxent.bin"))
  (defonce name-find (onlp/make-name-finder "models/en-ner-person.bin"))
  (defonce chunker (treebank/make-treebank-chunker "models/en-chunker.bin"))

  (defn get-phrase-strings
    "Runs `s` through the tokenizer, the POS tagger and the chunker, then
    hands the chunks to `treebank/phrase-strings` and returns its result."
    [s]
    (let [tokens (tokenize s)
          tagged (pos-tag tokens)
          chunks (chunker tagged)]
      (treebank/phrase-strings chunks)))

  ;; Phrases for each of the first 3 inbox messages.
  (for [m (take 3 inbox-messages)]
    (get-phrase-strings (:body-text (read-message m))))

  ;; Entity Topic = (JJ)* (NN [P|S|PS]*)+
  ;; Event topic 1 = [Entity] (VB [D|G|N|P|Z])+
  ;; Event topic 2 = (VB [D|G|N|P|Z])+ [Entity]

  (defn pos-tags-filter
    "Intended to find POS patterns in the POS-tagged seq `s` and return a seq
    of the POS-tagged seqs that match `pattern`.
    Not implemented yet: the body is empty, so this currently returns nil."
    [s pattern]
    ))
;; REPL scratch: trying out core.match with the classic FizzBuzz exercise.
(comment
  ;; core.match is interesting, but I'm not sure it will easily help me.
  ;; In particular, I don't want to have to reimplement regex grammar...
  (letfn [(fizzbuzz [n]
            ;; Dispatch on the pair of remainders; `_` matches anything.
            (match/match [(mod n 3) (mod n 5)]
              [0 0] "FizzBuzz"
              [0 _] "Fizz"
              [_ 0] "Buzz"
              :else n))]
    (doseq [line (map fizzbuzz (range 1 101))]
      (println line))))
;; REPL scratch: tokenize a sample sentence with the corenlp wrapper and
;; POS-tag the resulting tokens.
(comment
  (nlp/pos-tag (nlp/tokenize "this is a sentence.")))
;; TODO: | |
;; ===== | |
;; * Use Stanford NLP and the TokensRegex lib to do POS tagging and pattern matching
;; * http://nlp.stanford.edu/software/tagger.shtml | |
;; * http://nlp.stanford.edu/software/tokensregex.shtml | |
;; * Usage demo: http://www.galalaly.me/index.php/2011/05/tagging-text-with-stanford-pos-tagger-in-java-applications/ | |
;; * Consider whether there's a clojure library to wrap around SNLP | |
;; * Literate Org mode | |
;; * Use Org + Babel to log one of these types of explorations - an engineering notebook for code. | |
;; * Export to Markdown and publish into Brownsofa | |
;; * Figure out syntax highlighting - Org as Org, Clojure as Clojure. | |
;; * See if there are any advanced REPL options available, to not need to keep tangling the source file. | |
;; * Links: | |
;; * Sweet example: http://limist.com/coding/an-example-of-literate-programming-in-clojure-using-emacsorg.html | |
;; * Babel: http://orgmode.org/worg/org-contrib/babel/intro.html |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment