Skip to content

Instantly share code, notes, and snippets.

@dakrone
Created February 18, 2010 22:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dakrone/308172 to your computer and use it in GitHub Desktop.
Save dakrone/308172 to your computer and use it in GitHub Desktop.
(use 'opennlp) ; http://github.com/dakrone/clojure-opennlp
(use 'clojure.contrib.pprint)
(use 'clojure.contrib.duck-streams)
(use 'clojure.contrib.seq-utils)
(defn strip-html-tags
  "Messily strip HTML tags from a web page.

  Removes whole script and style elements (tags and their contents),
  replaces every remaining tag with a single space, and finally collapses
  runs of spaces into one space. This is a regex heuristic, not a parser.

  string - the HTML text; nil is treated as the empty string.

  Returns the stripped text as a String."
  [string]
  ;; (str nil) is the empty string, so a missing page yields an empty
  ;; result instead of a NullPointerException.
  (-> (str string)
      ;; (?i) also matches <SCRIPT>; (?s) lets . span newlines; \b[^>]*
      ;; accepts the tag with or without attributes (the old pattern
      ;; required a space after the tag name, so a bare <script> leaked
      ;; its source into the output).
      (.replaceAll "(?is)<script\\b[^>]*>.*?</script>" " ")
      (.replaceAll "(?is)<style\\b[^>]*>.*?</style>" " ")
      ;; Any other tag, including ones that wrap across lines.
      (.replaceAll "<[^>]*>" " ")
      (.replaceAll "[ ]+" " ")))
(defn fetch-page
  "Fetch url and return the body element of its HTML as a single line.

  The document is read with slurp*, and every tab, newline and carriage
  return is replaced by a space so later regexes can treat the page as one
  line.

  url - anything slurp* accepts.

  Returns the text from the opening body tag through the closing body tag.
  When no body element is found (the tags may legally be omitted in HTML),
  the whole flattened document is returned rather than nil, so callers that
  expect a string keep working."
  [url]
  (let [html (.replaceAll (slurp* url) "[\t\n\r]" " ")]
    ;; (?i) accepts <BODY>; \b stops tag names that merely start with "body".
    (or (re-find #"(?i)<body\b.*?</body>" html)
        html)))
(defn fetch-plain-page
  "Fetch url with fetch-page and return its content with the HTML markup
  removed by strip-html-tags."
  [url]
  (-> url
      fetch-page
      strip-html-tags))
;; NLP helper functions, built once when this file is loaded. Each is created
;; by an opennlp constructor from a model file under models/ (relative path --
;; presumably resolved against the working directory; TODO confirm).

;; Used below to split the page text into a seq of sentences.
(def get-sentences (make-sentence-detector "models/EnglishSD.bin.gz"))
;; Used below to split one sentence into tokens.
(def tokenize (make-tokenizer "models/EnglishTok.bin.gz"))
;; Used below on a token seq; the sample output shows [token tag] pairs.
(def pos-tag (make-pos-tagger "models/tag.bin.gz"))
(defn- tag-sentences
  "Tokenize and part-of-speech tag every sentence in sent-seq.

  Returns a lazy seq with one pos-tag result per sentence, in order."
  [sent-seq]
  (for [sentence sent-seq]
    (pos-tag (tokenize sentence))))
(defn tag-page
  "Fetch url, strip its markup, split the text into sentences and
  part-of-speech tag them in parallel.

  Sentences are grouped into chunks of up to 10 and each chunk is tagged as
  one pmap task.

  url - the page to fetch (passed to fetch-plain-page).

  Returns a lazy seq with one element per chunk; each element is a seq of
  tagged sentences as produced by tag-sentences."
  [url]
  (let [text (fetch-plain-page url)
        sentences (get-sentences text)
        chunks (partition-all 10 sentences)]
    ;; tag-sentences returns a lazy seq, so without doall each pmap task would
    ;; finish instantly and the tagging would run serially on whichever thread
    ;; later walks the result. doall forces the work inside the pmap task.
    ;; NOTE(review): this shares tokenize and pos-tag across threads -- confirm
    ;; the functions returned by opennlp are safe for concurrent use.
    (pmap #(doall (tag-sentences %)) chunks)))
;; Example: fetch the writequit.org front page and POS-tag its sentences.
;; Needs network access and the model files referenced above.
(tag-page "http://writequit.org")
;; Output (the printed result of the call above; data, not code to evaluate):
(((["Security" "NNP"] ["," ","] ["scripting" "VBG"] ["and" "CC"] ["packet" "NN"] ["analysis" "NN"] [":wq" "VBN"] ["home" "NN"] ["blog" "NN"] ["projects" "NNS"] ["papers" "NNS"] ["misc" "VBD"] ["about" "IN"] ["contact" "NN"] ["Welcome" "NNP"] ["Welcome" "NNP"] ["to" "TO"] ["writequit" "VB"] [".org" "NNP"] ["," ","] ["the" "DT"] ["home" "NN"] ["of" "IN"] ["Matthew" "NNP"] ["Lee" "NNP"] ["Hinman" "NNP"] ["." "."]) (["Here" "RB"] ["you" "PRP"] ["'ll" "MD"] ["find" "VB"] ["projects" "NNS"] ["I" "PRP"] ["work" "VBP"] ["on" "IN"] ["," ","] ["paper" "NN"] ["'s" "POS"] ["I" "PRP"] ["'ve" "VBP"] ["written" "VBN"] ["," ","] ["my" "PRP$"] ["blog" "NN"] ["," ","] ["etc" "FW"] ["." "."] ["Hope" "VB"] ["you" "PRP"] ["find" "VBP"] ["something" "NN"] ["useful" "JJ"] [":" ":"] [")" "-RRB-"] ["03.12.2009" "CD"] ["-" ":"] ["Added" "VBN"] ["the" "DT"] ["Ricepaper" "NNP"] ["link" "NN"] ["to" "TO"] ["the" "DT"] ["miscellaneous" "JJ"] ["page" "NN"] ["," ","] ["check" "VB"] ["out" "RP"] ["this" "DT"] ["blog" "NN"] ["post" "NN"] ["to" "TO"] ["see" "VB"] ["how" "WRB"] ["it" "PRP"] ["'s" "VBZ"] ["used" "VBN"] ["." "."]) (["10.09.2008" "CD"] ["-" ":"] ["Updated" "VBN"] ["the" "DT"] ["NSM-Console" "JJ"] ["page" "NN"] ["to" "TO"] ["reflect" "VB"] ["the" "DT"] ["switch" "NN"] ["to" "TO"] ["git" "VB"] ["and" "CC"] ["other" "JJ"] ["new" "JJ"] ["things" "NNS"] ["." "."]) (["Updated" "VBN"] ["the" "DT"] ["HeX" "NNP"] ["page" "NN"] ["for" "IN"] ["the" "DT"] ["2.0" "CD"] ["release" "NN"] ["," ","] ["hurray" "NN"] ["!" "."]) (["09.14.2008" "CD"] ["-" ":"] ["Added" "VBN"] ["the" "DT"] ["lids" "NNS"] ["and" "CC"] ["the" "DT"] ["ruby" "NN"] ["datasuite" "NN"] ["pages" "NNS"] ["," ","] ["check" "VBP"] ["the" "DT"] ["projects" "NNS"] ["page" "NN"] ["and" "CC"] ["the" "DT"] ["miscellaneous" "JJ"] ["page" "NN"] ["for" "IN"] ["the" "DT"] ["the" "DT"] ["new" "JJ"] ["projects" "NNS"] ["." 
"."]) (["Also" "RB"] ["updated" "VBN"] ["the" "DT"] ["about" "RB"] ["page" "NN"] ["with" "IN"] ["a" "DT"] ["link" "NN"] ["to" "TO"] ["my" "PRP$"] ["github" "NN"] ["projects" "NNS"] ["." "."]) (["07.14.2008" "CD"] ["-" ":"] ["Added" "VBN"] ["the" "DT"] ["flowtag" "NN"] ["FreeBSD" "NNP"] ["port" "NN"] ["," ","] ["check" "VB"] ["the" "DT"] ["miscellaneous" "JJ"] ["page" "NN"] ["for" "IN"] ["the" "DT"] ["port" "NN"] ["tarball" "NN"] ["(until" "IN"] ["FreeBSD" "NNP"] ["accepts" "VBZ"] ["it" "PRP"] [")" "-RRB-"] ["." "."]) (["07.01.2008" "CD"] ["-" ":"] ["Added" "VBN"] ["a" "DT"] ["keygen" "NN"] ["for" "IN"] ["one" "CD"] ["of" "IN"] ["the" "DT"] ["crackmes" "NNS"] ["on" "IN"] ["crackmes" "NNS"] [".de" "NN"] ["," ","]) (["check" "VB"] ["the" "DT"] ["miscellaneous" "JJ"] ["page" "NN"] ["for" "IN"] ["the" "DT"] ["C" "NNP"] ["code" "NN"] ["." "."]) (["03.27.2008" "CD"] ["-" ":"] ["Just" "RB"] ["released" "VBN"] ["NSM-Console" "JJ"] ["version" "NN"] ["0.7" "CD"] ["for" "IN"] ["general" "JJ"] ["consumption" "NN"] ["," ","] ["lots" "NNS"] ["of" "IN"] ["new" "JJ"] ["features" "NNS"] ["in" "IN"] ["this" "DT"] ["release" "NN"] ["," ","] ["check" "VB"] ["out" "RP"] ["the" "DT"] ["blog" "NN"] ["post" "NN"] ["for" "IN"] ["release" "NN"] ["notes" "NNS"] ["." 
"."])) ((["02.04.2008" "CD"] ["-" ":"] ["Added" "VBN"] ["the" "DT"] ["yahsnarf" "NN"] ["project" "NN"] ["to" "TO"] ["the" "DT"] ["projects" "NNS"] ["page" "NN"] ["," ","] ["go" "VB"] ["forth" "RB"] ["and" "CC"] ["sniff" "VB"] ["Yahoo" "NNP"] ["IM" "NNP"] ["conversations" "NNS"] [":" ":"] [")" "-RRB-"] ["31.03.2008" "CD"] ["-" ":"] ["Added" "VBN"] ["the" "DT"] ["flowtime" "NN"] ["script" "NN"] ["to" "TO"] ["the" "DT"] ["miscellaneous" "JJ"] ["page" "NN"] ["," ","] ["now" "RB"] ["you" "PRP"] ["'ll" "MD"] ["be" "VB"] ["able" "JJ"] ["to" "TO"] ["graph" "VB"] ["packet" "NN"] ["timelines" "NNS"] [":" ":"] [")" "-RRB-"] ["19.03.2008" "CD"] ["-" ":"] ["Published" "VBN"] ["a" "DT"] ["newer" "JJR"] ["version" "NN"] ["of" "IN"] ["the" "DT"] ["Ruby" "NNP"] ["StreamBuilder" "NNP"] ["project" "NN"] ["," ","] ["this" "DT"] ["one" "CD"] ["includes" "VBZ"] ["the" "DT"] ["fuzzysort" "NN"] ["algorithm" "NN"] ["," ","] ["check" "VB"] ["out" "RP"] ["the" "DT"] ["project" "NN"] ["page" "NN"] ["to" "TO"] ["download" "VB"] ["the" "DT"] ["new" "JJ"] ["version" "NN"] ["," ","] ["or" "CC"] ["read" "VB"] ["the" "DT"] ["blog" "NN"] ["post" "NN"] ["." "."]) (["14.03.2008" "CD"] ["-" ":"] ["I" "PRP"] ["'ve" "VBP"] ["updated" "VBN"] ["the" "DT"] ["NSM-Console" "JJ"] ["project" "NN"] ["page" "NN"] ["with" "IN"] ["the" "DT"] ["latest" "JJS"] ["release" "NN"] ["," ","] ["0.6." "."]) (["Check" "VB"] ["out" "IN"] ["the" "DT"] ["project" "NN"] ["page" "NN"] ["or" "CC"] ["the" "DT"] ["release" "NN"] ["notes" "NNS"] ["for" "IN"] ["this" "DT"] ["release" "NN"] ["." 
"."]) (["11.03.2008" "CD"] ["-" ":"] ["I" "PRP"] ["'ve" "VBP"] ["add" "VB"] ["the" "DT"] ["RS" "NNP"] ["B" "NNP"] ["(" "-LRB-"] ["RubyStreamBuilder" "NN"] [")" "-RRB-"] ["project" "NN"] ["onto" "IN"] ["the" "DT"] ["projects" "NNS"] ["page" "NN"] ["," ","] ["RS" "NNP"] ["B" "NNP"] ["is" "VBZ"] ["some" "DT"] ["proof-of-concept" "NN"] ["code" "NN"] ["used" "VBN"] ["to" "TO"] ["rebuild" "VB"] ["TCP" "NNP"] ["streams" "NNS"] ["for" "IN"] ["data" "NNS"] ["access" "NN"] ["." "."]) (["Check" "VB"] ["out" "IN"] ["the" "DT"] ["first" "JJ"] ["blog" "NN"] ["post" "NN"] ["here" "RB"] ["." "."]) (["10.03.2008" "CD"] ["-" ":"] ["All" "DT"] ["pages" "NNS"] ["on" "IN"] ["the" "DT"] ["site" "NN"] ["should" "MD"] ["now" "RB"] ["work" "VB"] ["and" "CC"] ["should" "MD"] ["have" "VB"] ["content" "JJ"] ["," ","] ["archived" "JJ"] ["blog" "NN"] ["posts" "NNS"] ["are" "VBP"] ["available" "JJ"] ["on" "IN"] ["my" "PRP$"] ["current" "JJ"] ["blog" "NN"] ["," ","] ["or" "CC"] ["on" "IN"] ["the" "DT"] ["old" "JJ"] ["one" "CD"] ["." "."]) (["09.03.2008" "CD"] ["-" ":"] ["Papers" "NNS"] ["section" "NN"] ["now" "RB"] ["has" "VBZ"] ["the" "DT"] ["first" "JJ"] ["paper" "NN"] ["that" "IN"] ["I" "PRP"] ["'ve" "VBP"] ["written" "VBN"] ["-" ":"] ["An" "DT"] ["introduction" "NN"] ["to" "TO"] ["NSM-Console" "NNP"] ["." "."]) (["I" "PRP"] ["also" "RB"] ["finished" "VBD"] ["the" "DT"] ["About" "RB"] ["page" "NN"] ["," ","] ["misc" "NN"] ["section" "NN"] ["," ","] ["although" "IN"] ["I" "PRP"] ["'ll" "MD"] ["be" "VB"] ["adding" "VBG"] ["more" "JJR"] ["content" "NN"] ["to" "TO"] ["them" "PRP"] ["in" "IN"] ["the" "DT"] ["future" "NN"] ["." 
"."]) (["08.03.2008" "CD"] ["-" ":"] ["Site" "NNP"] ["launched" "VBD"] ["," ","] ["everything" "NN"] ["'s" "VBZ"] ["not" "RB"] ["all" "DT"] ["finished" "VBN"] ["and" "CC"] ["completely" "RB"] ["polished" "JJ"] ["yet" "RB"] ["," ","] ["but" "CC"] ["I" "PRP"] ["'m" "VBP"] ["working" "VBG"] ["on" "IN"] ["it" "PRP"] [":" ":"] [")" "-RRB-"] ["In" "IN"] ["the" "DT"] ["meantime" "NN"] ["," ","] ["check" "VB"] ["out" "RP"] ["the" "DT"] ["projects" "NNS"] ["page" "NN"] ["for" "IN"] ["a" "DT"] ["list" "NN"] ["of" "IN"] ["some" "DT"] ["projects" "NNS"] ["that" "IN"] ["I" "PRP"] ["work" "VBP"] ["on" "RB"] ["," ","] ["as" "RB"] ["well" "RB"] ["as" "IN"] ["the" "DT"] ["new" "JJ"] ["blog" "NN"] ["." "."]) (["Make" "VB"] ["sure" "JJ"] ["you" "PRP"] ["update" "VB"] ["any" "DT"] ["RS" "NNS"] ["S" "NNP"] ["feeds" "VBZ"] ["from" "IN"] ["my" "PRP$"] ["old" "JJ"] ["blog" "NN"] ["site" "NN"] ["at" "IN"] ["http" "JJ"] ["://thnetos.wordpress.com" "NN"])) ((["(" "-LRB-"] ["which" "WDT"] ["will" "MD"] ["stay" "VB"] ["up" "RP"] ["as" "IN"] ["an" "DT"] ["archive" "NN"] ["of" "IN"] ["all" "PDT"] ["the" "DT"] ["old" "JJ"] ["posts" "NNS"] [")" "-RRB-"] ["Copyright" "NNP"] ["&copy" "VBP"] [";" ":"] ["2005-2010" "CD"] ["Matthew" "NNP"] ["Lee" "NNP"] ["Hinman" "NNP"] ["." "."]) (["All" "DT"] ["rights" "NNS"] ["reserved" "VBN"] ["." "."]) (["Design" "NNP"] ["by" "IN"] ["Viere" "NNP"] ["Media" "NNP"] ["." "."])))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment