Skip to content

Instantly share code, notes, and snippets.

@damionjunk
Created November 9, 2012 09:56
Show Gist options
  • Save damionjunk/4044936 to your computer and use it in GitHub Desktop.
Save damionjunk/4044936 to your computer and use it in GitHub Desktop.
Extract SemCor elements that match the criteria set in an NLP assignment.
(ns wsd.semfind
(:require [clojure.java.io :as io]
[clojure.string :as s])
(:import [java.net URL]
[edu.mit.jsemcor.main IConcordance IConcordanceSet Semcor]
[edu.mit.jsemcor.element IWordform ISentence]))
(defn noun?
  "Returns true when the wordform both carries a semantic tag and has a
  POS tag value of NN or NNS; returns false otherwise. Wordforms without
  a semantic tag are rejected because the rest of this namespace cannot
  make use of them."
  [^IWordform wf]
  (let [pos-value (.getValue (.getPOSTag wf))]
    (if (nil? (.getSemanticTag wf))
      false
      ;; Set membership stands in for the pair of equality checks.
      (contains? #{"NN" "NNS"} pos-value))))
(defn text
  "Returns the text of the given wordform, as reported by its getText
  method."
  [^IWordform wf]
  (.getText wf))
(defn sentence-text
  "Returns a lazy sequence holding the text of every wordform in the
  sentence's word list, in word-list order."
  [^ISentence s]
  (for [wordform (.getWordList s)]
    (text wordform)))
(defn sentence-noun-text
  "Returns a lazy sequence holding the text of only those wordforms in
  the sentence's word list that satisfy `noun?`, in word-list order."
  [^ISentence s]
  (->> (.getWordList s)
       (filter noun?)
       (map text)))
(defn noun-frequencies
  "Runs through the semcor files and counts the frequencies of each NN, NNS
  word.

  Arguments:
    semcor-root  - filesystem path of the Semcor data; it is turned into a
                   `file` URL and handed to the Semcor constructor.
    concord-name - name of the concordance to look up in the opened Semcor.

  Returns a map from noun text (see `sentence-noun-text`) to the number of
  times that text occurs across all sentences reached through the
  concordance's iterator.

  The Semcor instance is opened for the duration of the count and closed
  again before returning, including when an exception is thrown."
  [semcor-root concord-name]
  (let [surl   (URL. "file" nil semcor-root)
        semcor (doto (Semcor. surl) (.open))]
    (try
      (let [entries   (iterator-seq (.. semcor (get concord-name) iterator))
            sentences (mapcat #(.getSentences %) entries)]
        ;; `frequencies` consumes the whole lazy sequence before returning,
        ;; so nothing is read from the corpus after it has been closed.
        (frequencies (mapcat sentence-noun-text sentences)))
      (finally
        (.close semcor)))))
(defn has-any-word?
  "Checks whether any noun text of sentence `s` (see `sentence-noun-text`)
  appears in `words`.

  Arguments:
    s     - the sentence to inspect.
    words - the words to look for. Any collection is accepted (vector,
            list, lazy seq, set, or nil); passing a set avoids rebuilding
            the lookup set on every call.

  Returns a non-empty sequence containing one `true` per matching noun
  when at least one noun matches, and nil when none do, so the result can
  be used directly as a predicate value."
  [s words]
  (let [wanted (if (set? words) words (set words))]
    (seq (keep (fn [noun]
                 ;; `when` yields nil for non-matches, which `keep` drops.
                 (when (contains? wanted noun) true))
               (sentence-noun-text s)))))
(defn sentences-containing
  "Selects, from the Semcor data rooted at `semcor-root`, the sentences of
  the concordance named `concord-name` for which `has-any-word?` reports a
  match against `words`.

  Returns a lazy sequence of sentences. Note that the Semcor instance
  opened here is not closed by this function."
  [words semcor-root concord-name]
  (let [root-url (URL. "file" nil semcor-root)
        corpus   (doto (Semcor. root-url) (.open))
        entries  (-> corpus
                     (.get concord-name)
                     (.iterator)
                     iterator-seq)
        matches? #(has-any-word? % words)]
    (filter matches?
            (mapcat (fn [entry] (.getSentences entry)) entries))))
(defn write-sentences
  "Writes the data of each sentence in `sentences` (the result of calling
  getData on it) to `fout`, one entry per line. `fout` is anything
  `clojure.java.io/writer` accepts; the writer is closed when writing
  finishes. Returns nil."
  [sentences fout]
  (with-open [out (io/writer fout)]
    (doseq [sentence sentences
            :let [line (str (.getData sentence) "\n")]]
      (.write out line))))
;;
;;
;;
(defn start
  "Entry point intended to be called from the main function. Reads the
  :sroot and :concordance entries of the command-line options map `opts`
  and returns the result of `noun-frequencies` for them."
  [opts]
  (apply noun-frequencies ((juxt :sroot :concordance) opts)))
(comment
  ;; REPL walkthrough: compute the noun frequencies, keep the 25 most
  ;; frequent nouns, collect every sentence containing one of them, and
  ;; write those sentences out to a file.
  (let [opts        {:sroot "/Users/djunk/projects/L645/hw09/semcor3.0/"
                     :concordance "brown1"}
        top25       (->> (start opts)
                         (sort-by val >)
                         (take 25))
        words25     (map first top25)
        sentences25 (sentences-containing words25
                                          (:sroot opts)
                                          (:concordance opts))]
    (write-sentences sentences25
                     "/Users/djunk/projects/L645/hw09/semcor-top-25.xml"))
  ;;
  ;; Recorded results from one run of the form above.
  ;;
  ;; # of sentences containing the top 25 words: 2223
  ;;
  ;; [ ["man" 231]     ["time" 228]  ["years" 168]
  ;;   ["way" 150]     ["men" 131]   ["people" 119]
  ;;   ["day" 114]     ["life" 113]  ["night" 105]
  ;;   ["surface" 100] ["work" 98]   ["number" 97]
  ;;   ["world" 92]    ["head" 90]   ["eyes" 90]
  ;;   ["year" 84]     ["house" 79]  ["water" 78]
  ;;   ["God" 78]      ["body" 77]   ["nothing" 77]
  ;;   ["part" 77]     ["group" 77]  ["thing" 75]
  ;;   ["area" 73]]
  )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment