Created
November 9, 2012 09:56
-
-
Save damionjunk/4044936 to your computer and use it in GitHub Desktop.
Extract SemCor elements matching the criteria set in an NLP assignment.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns wsd.semfind | |
(:require [clojure.java.io :as io] | |
[clojure.string :as s]) | |
(:import [java.net URL] | |
[edu.mit.jsemcor.main IConcordance IConcordanceSet Semcor] | |
[edu.mit.jsemcor.element IWordform ISentence])) | |
(defn noun?
  "True when the wordform carries a semantic tag and the value of its
  part-of-speech tag is either \"NN\" or \"NNS\". Wordforms without a
  semantic tag are rejected because they cannot be used downstream."
  [^IWordform wf]
  (let [pos-value (.getValue (.getPOSTag wf))
        tagged?   (not (nil? (.getSemanticTag wf)))]
    (and tagged?
         (contains? #{"NN" "NNS"} pos-value))))
(defn text
  "Returns whatever the wordform's `getText` method reports."
  [^IWordform wf]
  (. wf getText))
(defn sentence-text
  "Lazy sequence holding the `text` of every wordform in the sentence's
  word list, in list order."
  [^ISentence s]
  (for [wordform (.getWordList s)]
    (text wordform)))
(defn sentence-noun-text
  "Lazy sequence holding the `text` of only those wordforms in the
  sentence's word list that satisfy `noun?`, in list order."
  [^ISentence s]
  (->> (.getWordList s)
       (filter noun?)
       (map text)))
(defn noun-frequencies
  "Counts how often each noun text occurs in one Semcor concordance.

  semcor-root  - path of the Semcor data directory; it is wrapped in a
                 `file` URL and handed to the Semcor constructor.
  concord-name - key of the concordance to look up in the opened Semcor.

  Returns a map from noun text (as produced by `sentence-noun-text`) to the
  number of times it occurs across all sentences of the concordance.
  The Semcor is closed before returning, even if reading throws."
  [semcor-root concord-name]
  (let [surl   (URL. "file" nil semcor-root)
        semcor (doto (Semcor. surl) (.open))]
    (try
      (let [contexts (iterator-seq (.. semcor
                                       (get concord-name)
                                       iterator))]
        ;; `frequencies` is eager, so every sentence has been consumed by the
        ;; time the `finally` clause below closes the Semcor.
        (->> contexts
             (mapcat #(.getSentences %))
             (mapcat sentence-noun-text)
             frequencies))
      (finally
        ;; Previously the Semcor was opened and never released.
        (.close semcor)))))
(defn has-any-word?
  "Returns true when at least one noun text of sentence `s` (as produced by
  `sentence-noun-text`) is contained in `words`, and nil otherwise.

  s     - the sentence to inspect.
  words - any collection of words (list, vector, lazy seq, set, or nil);
          it is converted to a set so each noun is checked by a hash lookup
          instead of a linear `.indexOf` scan."
  [s words]
  (let [wanted (set words)]
    ;; `some` stops at the first matching noun rather than building a
    ;; sequence of every match.
    (some #(contains? wanted %) (sentence-noun-text s))))
(defn sentences-containing
  "Selects, from the named concordance of the Semcor rooted at
  `semcor-root`, only the sentences for which `has-any-word?` reports a
  match against `words`. The result is a lazy sequence."
  [words semcor-root concord-name]
  ;; NOTE(review): the Semcor opened here is never closed. Because the result
  ;; is lazy, closing it in this function would presumably require realizing
  ;; the sequence first - confirm before changing.
  (let [root-url    (URL. "file" nil semcor-root)
        corpus      (doto (Semcor. root-url) (.open))
        concordance (.get corpus concord-name)
        contexts    (iterator-seq (.iterator concordance))
        matches?    (fn [sentence] (has-any-word? sentence words))]
    (->> contexts
         (mapcat #(.getSentences %))
         (filter matches?))))
(defn write-sentences
  "Writes the `getData` value of every sentence to `fout`, each followed by
  a newline character. `fout` is passed to `clojure.java.io/writer`, and
  the resulting writer is closed when the function returns."
  [sentences fout]
  (with-open [out (io/writer fout)]
    (doseq [sentence sentences
            :let [line (str (.getData sentence) "\n")]]
      (.write out line))))
;; | |
;; | |
;; | |
(defn start
  "Entry point intended to be called from main with the command line
  options collected in the `opts` map. Reads `:sroot` and `:concordance`
  from `opts` and returns the result of `noun-frequencies` for them."
  [opts]
  (let [[root concordance] ((juxt :sroot :concordance) opts)]
    (noun-frequencies root concordance)))
(comment
  ;; REPL scratch: compute the 25 most frequent nouns of a concordance, then
  ;; write every sentence containing one of them to an XML file.
  (let [opts        {:sroot       "/Users/djunk/projects/L645/hw09/semcor3.0/"
                     :concordance "brown1"}
        top25       (->> (start opts)
                         (sort-by val >)
                         (take 25))
        words25     (map first top25)
        sentences25 (sentences-containing words25
                                          (:sroot opts)
                                          (:concordance opts))]
    (write-sentences sentences25
                     "/Users/djunk/projects/L645/hw09/semcor-top-25.xml"))

  ;;
  ;; Recorded output of the form above.
  ;; # of Sentences containing the top 25 words: 2223
  ;;
  ;; [ ["man" 231]     ["time" 228]   ["years" 168]
  ;;   ["way" 150]     ["men" 131]    ["people" 119]
  ;;   ["day" 114]     ["life" 113]   ["night" 105]
  ;;   ["surface" 100] ["work" 98]    ["number" 97]
  ;;   ["world" 92]    ["head" 90]    ["eyes" 90]
  ;;   ["year" 84]     ["house" 79]   ["water" 78]
  ;;   ["God" 78]      ["body" 77]    ["nothing" 77]
  ;;   ["part" 77]     ["group" 77]   ["thing" 75]
  ;;   ["area" 73]]
  )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment