Created
December 17, 2012 22:42
-
-
Save jirkamarsik/4323107 to your computer and use it in GitHub Desktop.
Normalizing Stanford constituency trees as part of my "Applications of NLP" project.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns grook.preprocess | |
(:refer-clojure :exclude [==]) | |
(:use clojure.core.logic | |
[clojure.pprint :only [pprint]]) | |
(:require [clojure.walk :as walk] | |
[clojure.java.io :as io])) | |
;; Running this cleans any previously defined rules from the namespace. | |
(dorun (for [[sym var] (ns-interns *ns*) | |
:when (::rule (meta var))] | |
(ns-unmap *ns* sym))) | |
(def tree-dir | |
"Directory containing the parse trees which should be processed." | |
"../data/cleanedSentences/parsed/") | |
(def COMMA | |
"The comma is not allowed as a symbol in Clojure code. Hence, | |
to refer to the symbol, we have to write it down indirectly." | |
(symbol ",")) | |
(defn read-tree | |
"Reads in the tree at tree-dir/name." | |
[name] | |
(->> (str tree-dir name) | |
slurp | |
read-string | |
; Since the Clojure reader treats commas as whitespace, we put | |
; them back by replacing any instance of () with (, ,). | |
(walk/postwalk #({() (list COMMA COMMA)} % %)))) | |
(def trees | |
"Our dataset, represented as a map from file names to the trees | |
contained within." | |
(into {} (for [file (file-seq (io/file tree-dir)) | |
:let [name (.getName file)] | |
:when (.endsWith name ".pst-heads")] | |
[name (read-tree name)]))) | |
(def ^:dynamic ^{:doc "Bind this Var to some value before calling | |
fix-tree and records of all the adjunctions performed will | |
be conjed onto it."} *fix-tree-adjunctions*) | |
(defn record-adjunct | |
"Records an adjunction performed within fix-tree." | |
[phrase adjunct] | |
(if (thread-bound? #'*fix-tree-adjunctions*) | |
(set! *fix-tree-adjunctions* (conj *fix-tree-adjunctions* | |
{:phrase phrase | |
:adjunct adjunct})))) | |
(defn has-more-thano | |
"A goal satisfied iff l is a list with more than n elements; n is | |
not a relational argument." | |
[n l] | |
(fresh [a d] | |
(conso a d l) | |
(if (pos? n) | |
(has-more-thano (dec n) d) | |
succeed))) | |
(defn poly-appendo | |
"Like clojure.core.logic/appendo, but works with arbitrary number of | |
args, like clojure.core/concat." | |
([x y] | |
(== x y)) | |
([x y & more] | |
(fresh [z] | |
(apply poly-appendo z more) | |
(appendo x y z)))) | |
(defn adjuncto | |
"A goal satisfied iff `adjunct' is identified as an adjunct in the | |
phrase `phrase' (for the more general case when the adjunct is a | |
child of the constituent it modifies)." | |
[phrase adjunct] | |
(conde ; We treat all PPs as adjuncts. | |
[(firsto adjunct 'PP)] | |
[(firsto adjunct 'S) | |
; We don't want sentential complements such as "[to do Y]" | |
; in "The function of X is [to do Y]" to be recognized as | |
; adjuncts. Example file: ex03a.46 | |
(fresh [to _] | |
(conda [(== adjunct ['S ['VP=H to _]]) | |
(!= to ['TO=H 'to])] | |
[succeed]))] | |
; Commas are viewed as adjuncts. Usually, they introduce a | |
; sentential complement or a PP. Treating commas as just | |
; other adjuncts lets us extract both the comma and the | |
; adjunct it introduces as a two-level auxiliary tree. | |
[(firsto adjunct COMMA)] | |
[(firsto adjunct 'SBAR) | |
; We only want SBARS which correspond to appositive | |
; relative clauses introduced by the word "which". | |
(fresh [_] | |
(== adjunct ['SBAR ['WHNP=H ['WDT=H 'which]] _]))])) | |
(defn ^::rule fix-adjunctiono | |
"A rule which takes a left-most or right-most child A, which is an | |
adjunct, and puts the A on a separate level from the other children, | |
i.e. (R A ...) -> (R A (R ...)) or (R ... A) -> (R (R ...) A). | |
This solves the general case when modifiers and adjuncts end up as | |
children of the consituent they modify (e.g. in VPs)." | |
[in-tree out-tree] | |
(fresh [root children other-children adjunct head-child] | |
(conso root children in-tree) | |
(has-more-thano 2 children) | |
(conde [(conso adjunct other-children children) | |
(== out-tree (list root adjunct head-child))] | |
[(appendo other-children (list adjunct) children) | |
(== out-tree (list root head-child adjunct))]) | |
(conso root other-children head-child) | |
(adjuncto in-tree adjunct) | |
; Record the phrase-adjunct pair we have found. | |
(project [in-tree adjunct] | |
(do (record-adjunct in-tree adjunct) | |
succeed)))) | |
(defn ^::rule fix-flat-npo | |
"A rule which takes an NP whose head is preceded by a modifier and | |
puts the modifier and the head under a new binary head node. The | |
head noun can be any noun or noun phrase which is either the last | |
child of the NP or is followed by a comma or conjunction. | |
I.e., (NP ... MOD (N ...) (COMMA|CONJ) ...) -> | |
(NP ... (N MOD (N ...)) (COMMA|CONJ) ...) | |
and (NP ... MOD (N ...)) -> (NP ... (N MOD (N ...))). | |
This solves the case when modifiers are on the same level of the | |
tree as the constituent they modify (only NPs in our data)." | |
[in-np out-np] | |
(fresh [root-tag prefix modifier modifier-tag head-noun head-noun-tag | |
suffix affixes new-head] | |
(membero root-tag '[NP NP=H]) | |
(poly-appendo [root-tag] prefix [modifier] [head-noun] suffix in-np) | |
(appendo prefix suffix affixes) | |
(!= affixes ()) | |
(firsto modifier modifier-tag) | |
(membero modifier-tag ['NN 'NNP 'JJ 'VBN 'ADJP]) | |
(firsto head-noun head-noun-tag) | |
(membero head-noun-tag ['NN=H 'NNP=H 'NNS=H 'NP=H]) | |
(conde [(emptyo suffix)] | |
[(fresh [conjunction conjunction-tag] | |
(firsto suffix conjunction) | |
(firsto conjunction conjunction-tag) | |
(membero conjunction-tag ['CC COMMA]))]) | |
(== new-head (list head-noun-tag modifier head-noun)) | |
(poly-appendo [root-tag] prefix [new-head] suffix out-np) | |
; Record the phrase-adjunct pair we have found. | |
(project [in-np modifier] | |
(do (record-adjunct in-np modifier) | |
succeed)))) | |
(defmacro transformo | |
"Is satisfied when out-tree is a normalized version of the in-tree | |
node. Tries to use any goal annotated with ::rule." | |
[in-tree out-tree] | |
`(conde ~@(for [[sym var] (ns-interns *ns*) | |
:when (::rule (meta var))] | |
[(list sym in-tree out-tree)]))) | |
(defn try-transform | |
"Repeatedly transforms the input phrase tree until no further | |
normalization can be found." | |
[in-tree] | |
(let [solutions (run 1 [out-tree] | |
(transformo in-tree out-tree))] | |
(if (empty? solutions) | |
in-tree | |
(recur (first solutions))))) | |
(defn fix-tree | |
"Walks the tree bottom-up and tries to normalize every phrase. | |
If you bind a value to *fix-tree-adjunctions*, a record of every | |
adjunction performed by the algorithm will be conjed onto it." | |
[tree] | |
(walk/prewalk try-transform tree)) | |
(def fixed-trees | |
"A map from tree filenames to their fixed versions." | |
(into {} (for [[name tree] trees] | |
[name (fix-tree tree)]))) | |
(def adjunctions | |
"A vector recording all the phrases which we recognized as | |
adjunctions. This is the place to check for `false positives', | |
occassions when we identified something as an adjunct even though it | |
could have been something else." | |
(apply concat (for [[name tree] trees] | |
(binding [*fix-tree-adjunctions* []] | |
(fix-tree tree) | |
(map #(assoc % :name name) *fix-tree-adjunctions*))))) | |
(defn interesting-phrases | |
"Selects phrases with more than two children from all the | |
relevant-trees. relevant-trees is expected to be a map from tree | |
names to trees. This is the place to check for false | |
negatives (places where the system should have done more | |
normalization)." | |
[relevant-trees] | |
(for [[name tree] relevant-trees | |
subtree (tree-seq seq? seq tree) | |
:when (and (seq? subtree) | |
(> (count subtree) 3) | |
(not= (last subtree) '(. .)))] | |
[name subtree])) | |
(def fixed-dir | |
"The place where to save all the processed trees." | |
"../data/cleanedSentences/fixed/") | |
(defn fix-all-the-trees! | |
"Reads in all the trees from tree-dir, processes them and saves the | |
results in fixed-dir." | |
[] | |
(dorun (for [file (file-seq (io/file tree-dir)) | |
:let [name (.getName file)] | |
:when (.endsWith name ".pst-heads")] | |
(with-open [fixed-file (io/writer (str fixed-dir name "-fixed"))] | |
(pprint (fix-tree (read-tree name)) fixed-file))))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment