Created
April 12, 2018 13:00
-
-
Save toVersus/3c3cf684c7402f2323e626aceacc9a37 to your computer and use it in GitHub Desktop.
[Language Processing 100 Essentials] #58: Extract tuples
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import (
	"encoding/xml"
	"flag"
	"fmt"
	"os"
	"sort"
)
// Root is the top-level element of a Stanford CoreNLP XML output file.
type Root struct {
	Document *Document `xml:"document"`
}

// Document holds the parsed sentences and coreference chains of one input text.
type Document struct {
	Sentences    Sentences    `xml:"sentences>sentence"`
	Coreferences Coreferences `xml:"coreference>coreference"`
}

// Sentences is the list of parsed sentences in document order.
type Sentences []*Sentence

// Coreferences is the list of coreference chains found in the document.
type Coreferences []*Coreference
// Sentence is one parsed sentence: its tokens, its phrase-structure parse,
// and one or more dependency sets (basic, collapsed, ...).
type Sentence struct {
	ID           int          `xml:"id,attr"`
	Dependencies Dependencies `xml:"dependencies"`
	Parse        string       `xml:"parse,omitempty"`
	Tokens       Tokes        `xml:"tokens>token,omitempty"`
}

// Dependencies is the list of dependency sets attached to a sentence.
type Dependencies []*Dependencie

// Tokes is the list of tokens of a sentence.
// NOTE(review): the name looks like a typo for "Tokens" — kept as-is because
// renaming an exported type would touch every use site.
type Tokes []*Token
// Token is one token of a sentence together with its CoreNLP annotations
// (lemma, character offsets, part of speech, named-entity tag, speaker,
// and an optional temporal expression).
type Token struct {
	ID                   string `xml:"id,attr"`
	Word                 string `xml:"word,omitempty"`
	Lemma                string `xml:"lemma,omitempty"`
	CharacterOffsetBegin int    `xml:"CharacterOffsetBegin,omitempty"`
	CharacterOffsetEnd   int    `xml:"CharacterOffsetEnd,omitempty"`
	POS                  string `xml:"POS,omitempty"`
	NER                  string `xml:"NER,omitempty"`
	NormalizedNER        string `xml:"NormalizedNER,omitempty"`
	Speaker              string `xml:"Speaker,omitempty"`
	Timex                *Timex `xml:"Timex,omitempty"`
}

// Timex is a normalized temporal expression attached to a token.
type Timex struct {
	Tid   string `xml:"tid,attr"`
	Type  string `xml:"type,attr"`
	Value string `xml:",chardata"`
}
// Governor is the head word of a dependency relation; Idx is the 1-based
// token index and Value the surface form.
type Governor struct {
	Copy  string `xml:"copy,attr"`
	Idx   int    `xml:"idx,attr"`
	Value string `xml:",chardata"`
}

// Dependent is the dependent word of a dependency relation; Idx is the
// 1-based token index and Value the surface form.
type Dependent struct {
	Copy  string `xml:"copy,attr"`
	Idx   int    `xml:"idx,attr"`
	Value string `xml:",chardata"`
}

// Dep is a single dependency relation (e.g. "nsubj", "dobj") between a
// governor and a dependent. Both ends are pointers and may be nil when
// absent from the XML.
type Dep struct {
	Extra     string     `xml:"extra,attr"`
	Type      string     `xml:"type,attr"`
	Dependent *Dependent `xml:"dependent,omitempty"`
	Governor  *Governor  `xml:"governor,omitempty"`
}

// Dependencie is one dependency set of a sentence; Type names the set
// (e.g. "collapsed-dependencies").
// NOTE(review): the name looks like a typo for "Dependency" — kept as-is
// because renaming an exported type would touch every use site.
type Dependencie struct {
	Type string `xml:"type,attr"`
	Deps Deps   `xml:"dep,omitempty"`
}
func (dep Dependencie) getVerb() ([]string, error) { | |
if dep.Type != "collapsed-dependencies" { | |
return nil, nil | |
} | |
nsubj := map[int]string{} | |
for _, d := range dep.Deps { | |
if d.Type != "nsubj" { | |
continue | |
} | |
nsubj[d.Governor.Idx] = d.Governor.Value | |
} | |
// To store the keys in slice in sorted order | |
var govIdxs []int | |
for _, d := range dep.Deps { | |
if d.Type != "dobj" { | |
continue | |
} | |
if _, ok := nsubj[d.Governor.Idx]; ok { | |
govIdxs = append(govIdxs, d.Governor.Idx) | |
} | |
} | |
// Store verb in ascending order of index | |
var verbs []string | |
for _, i := range govIdxs { | |
verbs = append(verbs, nsubj[i]) | |
} | |
return verbs, nil | |
} | |
func (dep Dependencie) getSubject(verbs []string) ([]string, error) { | |
var subjects []string | |
for _, verb := range verbs { | |
for _, d := range dep.Deps { | |
if d.Type != "nsubj" { | |
continue | |
} | |
if d.Governor.Value != verb { | |
continue | |
} | |
subjects = append(subjects, d.Dependent.Value) | |
} | |
} | |
return subjects, nil | |
} | |
func (dep Dependencie) getObject(verbs []string) ([]string, error) { | |
var objects []string | |
for _, verb := range verbs { | |
for _, d := range dep.Deps { | |
if d.Type != "dobj" { | |
continue | |
} | |
if d.Governor.Value != verb { | |
continue | |
} | |
objects = append(objects, d.Dependent.Value) | |
} | |
} | |
return objects, nil | |
} | |
// Deps is the list of dependency relations inside one dependency set.
type Deps []*Dep

// Coreference is one coreference chain: the set of mentions that refer to
// the same entity.
type Coreference struct {
	Mentions Mentions `xml:"mention,omitempty"`
}

// Mentions is the list of mentions in a coreference chain.
type Mentions []*Mention

// Mention is a single mention of an entity: its sentence number, token span
// [Start, End), head token index, and surface text. Representative is the
// "representative" XML attribute marking the chain's canonical mention.
type Mention struct {
	Representative string `xml:"representative,attr"`
	Sentence       int    `xml:"sentence,omitempty"`
	Start          int    `xml:"start,omitempty"`
	End            int    `xml:"end,omitempty"`
	Head           int    `xml:"head,omitempty"`
	Text           string `xml:"text,omitempty"`
}
func main() { | |
var filePath, destFilePath string | |
var sentenceNum int | |
flag.StringVar(&filePath, "file", "", "specify a file path") | |
flag.StringVar(&filePath, "f", "", "specify a file path") | |
flag.StringVar(&destFilePath, "dest", "", "specify a dest file path") | |
flag.StringVar(&destFilePath, "d", "", "specify a dest file path") | |
flag.IntVar(&sentenceNum, "n", 1, "specify number of sentence") | |
flag.Parse() | |
r, err := readXML(filePath) | |
if err != nil { | |
fmt.Println(err) | |
os.Exit(1) | |
} | |
var verbs, subjects, objects []string | |
for _, dep := range r.Document.Sentences[sentenceNum].Dependencies { | |
verbs, err = dep.getVerb() | |
if err != nil { | |
fmt.Print(err) | |
os.Exit(1) | |
} | |
if len(verbs) == 0 { | |
continue | |
} | |
subjects, err = dep.getSubject(verbs) | |
if err != nil { | |
fmt.Print(err) | |
os.Exit(1) | |
} | |
objects, err = dep.getObject(verbs) | |
if err != nil { | |
fmt.Print(err) | |
os.Exit(1) | |
} | |
for i := 0; i < len(verbs); i++ { | |
fmt.Println(subjects[i] + "\t" + verbs[i] + "\t" + objects[i]) | |
} | |
} | |
} | |
// readXML reads the result of Stanford Core NLP and initiate the Root struct | |
func readXML(path string) (*Root, error) { | |
f, err := os.Open(path) | |
if err != nil { | |
return nil, fmt.Errorf("could not open a file: %s\n %s", path, err) | |
} | |
defer f.Close() | |
r := &Root{} | |
dec := xml.NewDecoder(f) | |
err = dec.Decode(r) | |
if err != nil { | |
return nil, err | |
} | |
return r, nil | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"os" | |
"testing" | |
"github.com/go-test/deep" | |
) | |
var tokenizeTests = []struct { | |
name string | |
file string | |
text string | |
expect []string | |
}{ | |
{ | |
name: "should slice the simple sentence into words", | |
file: "simple-test.xml", | |
text: `<?xml version="1.0" encoding="UTF-8"?> | |
<?xml-stylesheet href="CoreNLP-to-HTML.xsl" type="text/xsl"?> | |
<root> | |
<document> | |
<docId>test.txt</docId> | |
<sentences> | |
<sentence id="1"> | |
<tokens> | |
<token id="1"> | |
<word>Many</word> | |
<lemma>many</lemma> | |
<CharacterOffsetBegin>0</CharacterOffsetBegin> | |
<CharacterOffsetEnd>4</CharacterOffsetEnd> | |
<POS>JJ</POS> | |
<NER>O</NER> | |
<Speaker>PER0</Speaker> | |
</token> | |
<token id="2"> | |
<word>challenges</word> | |
<lemma>challenge</lemma> | |
<CharacterOffsetBegin>5</CharacterOffsetBegin> | |
<CharacterOffsetEnd>15</CharacterOffsetEnd> | |
<POS>NNS</POS> | |
<NER>O</NER> | |
<Speaker>PER0</Speaker> | |
</token> | |
<token id="3"> | |
<word>in</word> | |
<lemma>in</lemma> | |
<CharacterOffsetBegin>16</CharacterOffsetBegin> | |
<CharacterOffsetEnd>18</CharacterOffsetEnd> | |
<POS>IN</POS> | |
<NER>O</NER> | |
<Speaker>PER0</Speaker> | |
</token> | |
<token id="4"> | |
<word>NLP</word> | |
<lemma>nlp</lemma> | |
<CharacterOffsetBegin>19</CharacterOffsetBegin> | |
<CharacterOffsetEnd>22</CharacterOffsetEnd> | |
<POS>NN</POS> | |
<NER>O</NER> | |
<Speaker>PER0</Speaker> | |
</token> | |
<token id="5"> | |
<word>involve</word> | |
<lemma>involve</lemma> | |
<CharacterOffsetBegin>23</CharacterOffsetBegin> | |
<CharacterOffsetEnd>30</CharacterOffsetEnd> | |
<POS>VBP</POS> | |
<NER>O</NER> | |
<Speaker>PER0</Speaker> | |
</token> | |
<token id="6"> | |
<word>natural</word> | |
<lemma>natural</lemma> | |
<CharacterOffsetBegin>31</CharacterOffsetBegin> | |
<CharacterOffsetEnd>38</CharacterOffsetEnd> | |
<POS>JJ</POS> | |
<NER>O</NER> | |
<Speaker>PER0</Speaker> | |
</token> | |
<token id="7"> | |
<word>language</word> | |
<lemma>language</lemma> | |
<CharacterOffsetBegin>39</CharacterOffsetBegin> | |
<CharacterOffsetEnd>47</CharacterOffsetEnd> | |
<POS>NN</POS> | |
<NER>O</NER> | |
<Speaker>PER0</Speaker> | |
</token> | |
<token id="8"> | |
<word>understanding</word> | |
<lemma>understanding</lemma> | |
<CharacterOffsetBegin>48</CharacterOffsetBegin> | |
<CharacterOffsetEnd>61</CharacterOffsetEnd> | |
<POS>NN</POS> | |
<NER>O</NER> | |
<Speaker>PER0</Speaker> | |
</token> | |
<token id="9"> | |
<word>,</word> | |
<lemma>,</lemma> | |
<CharacterOffsetBegin>61</CharacterOffsetBegin> | |
<CharacterOffsetEnd>62</CharacterOffsetEnd> | |
<POS>,</POS> | |
<NER>O</NER> | |
<Speaker>PER0</Speaker> | |
</token> | |
<token id="10"> | |
<word>that</word> | |
<lemma>that</lemma> | |
<CharacterOffsetBegin>63</CharacterOffsetBegin> | |
<CharacterOffsetEnd>67</CharacterOffsetEnd> | |
<POS>WDT</POS> | |
<NER>O</NER> | |
<Speaker>PER0</Speaker> | |
</token> | |
<token id="11"> | |
<word>is</word> | |
<lemma>be</lemma> | |
<CharacterOffsetBegin>68</CharacterOffsetBegin> | |
<CharacterOffsetEnd>70</CharacterOffsetEnd> | |
<POS>VBZ</POS> | |
<NER>O</NER> | |
<Speaker>PER0</Speaker> | |
</token> | |
<token id="12"> | |
<word>,</word> | |
<lemma>,</lemma> | |
<CharacterOffsetBegin>70</CharacterOffsetBegin> | |
<CharacterOffsetEnd>71</CharacterOffsetEnd> | |
<POS>,</POS> | |
<NER>O</NER> | |
<Speaker>PER0</Speaker> | |
</token> | |
<token id="13"> | |
<word>enabling</word> | |
<lemma>enable</lemma> | |
<CharacterOffsetBegin>72</CharacterOffsetBegin> | |
<CharacterOffsetEnd>80</CharacterOffsetEnd> | |
<POS>VBG</POS> | |
<NER>O</NER> | |
<Speaker>PER0</Speaker> | |
</token> | |
<token id="14"> | |
<word>computers</word> | |
<lemma>computer</lemma> | |
<CharacterOffsetBegin>81</CharacterOffsetBegin> | |
<CharacterOffsetEnd>90</CharacterOffsetEnd> | |
<POS>NNS</POS> | |
<NER>O</NER> | |
<Speaker>PER0</Speaker> | |
</token> | |
<token id="15"> | |
<word>to</word> | |
<lemma>to</lemma> | |
<CharacterOffsetBegin>91</CharacterOffsetBegin> | |
<CharacterOffsetEnd>93</CharacterOffsetEnd> | |
<POS>TO</POS> | |
<NER>O</NER> | |
<Speaker>PER0</Speaker> | |
</token> | |
<token id="16"> | |
<word>derive</word> | |
<lemma>derive</lemma> | |
<CharacterOffsetBegin>94</CharacterOffsetBegin> | |
<CharacterOffsetEnd>100</CharacterOffsetEnd> | |
<POS>VB</POS> | |
<NER>O</NER> | |
<Speaker>PER0</Speaker> | |
</token> | |
<token id="17"> | |
<word>meaning</word> | |
<lemma>meaning</lemma> | |
<CharacterOffsetBegin>101</CharacterOffsetBegin> | |
<CharacterOffsetEnd>108</CharacterOffsetEnd> | |
<POS>NN</POS> | |
<NER>O</NER> | |
<Speaker>PER0</Speaker> | |
</token> | |
<token id="18"> | |
<word>.</word> | |
<lemma>.</lemma> | |
<CharacterOffsetBegin>108</CharacterOffsetBegin> | |
<CharacterOffsetEnd>109</CharacterOffsetEnd> | |
<POS>.</POS> | |
<NER>O</NER> | |
<Speaker>PER0</Speaker> | |
</token> | |
</tokens> | |
<parse>(ROOT (S (NP (NP (JJ Many) (NNS challenges)) (PP (IN in) (NP (NN NLP)))) (VP (VBP involve) (S (NP (NP (JJ natural) (NN language) (NN understanding)) (, ,) (SBAR (WHNP (WDT that)) (S (VP (VBZ is)))) (, ,)) (VP (VBG enabling) (NP (NNS computers)) (S (VP (TO to) (VP (VB derive) (NP (NN meaning)))))))) (. .))) </parse> | |
<dependencies type="collapsed-dependencies"> | |
<dep type="root"> | |
<governor idx="0">ROOT</governor> | |
<dependent idx="5">involve</dependent> | |
</dep> | |
<dep type="amod"> | |
<governor idx="2">challenges</governor> | |
<dependent idx="1">Many</dependent> | |
</dep> | |
<dep type="nsubj"> | |
<governor idx="5">involve</governor> | |
<dependent idx="2">challenges</dependent> | |
</dep> | |
<dep type="case"> | |
<governor idx="4">NLP</governor> | |
<dependent idx="3">in</dependent> | |
</dep> | |
<dep type="nmod:in"> | |
<governor idx="2">challenges</governor> | |
<dependent idx="4">NLP</dependent> | |
</dep> | |
<dep type="amod"> | |
<governor idx="8">understanding</governor> | |
<dependent idx="6">natural</dependent> | |
</dep> | |
<dep type="compound"> | |
<governor idx="8">understanding</governor> | |
<dependent idx="7">language</dependent> | |
</dep> | |
<dep type="nsubj"> | |
<governor idx="13">enabling</governor> | |
<dependent idx="8">understanding</dependent> | |
</dep> | |
<dep type="punct"> | |
<governor idx="8">understanding</governor> | |
<dependent idx="9">,</dependent> | |
</dep> | |
<dep type="nsubj"> | |
<governor idx="11">is</governor> | |
<dependent idx="10">that</dependent> | |
</dep> | |
<dep type="acl:relcl"> | |
<governor idx="8">understanding</governor> | |
<dependent idx="11">is</dependent> | |
</dep> | |
<dep type="punct"> | |
<governor idx="8">understanding</governor> | |
<dependent idx="12">,</dependent> | |
</dep> | |
<dep type="dep"> | |
<governor idx="5">involve</governor> | |
<dependent idx="13">enabling</dependent> | |
</dep> | |
<dep type="dobj"> | |
<governor idx="13">enabling</governor> | |
<dependent idx="14">computers</dependent> | |
</dep> | |
<dep type="mark"> | |
<governor idx="16">derive</governor> | |
<dependent idx="15">to</dependent> | |
</dep> | |
<dep type="advcl"> | |
<governor idx="13">enabling</governor> | |
<dependent idx="16">derive</dependent> | |
</dep> | |
<dep type="dobj"> | |
<governor idx="16">derive</governor> | |
<dependent idx="17">meaning</dependent> | |
</dep> | |
<dep type="punct"> | |
<governor idx="5">involve</governor> | |
<dependent idx="18">.</dependent> | |
</dep> | |
</dependencies> | |
</sentence> | |
</sentences> | |
<coreference/> | |
</document> | |
</root>`, | |
expect: []string{"understanding enabling computers"}, | |
}, | |
} | |
func TestUpdateToRepresentiveText(t *testing.T) { | |
for _, testcase := range tokenizeTests { | |
t.Log(testcase.name) | |
f, err := os.Create(testcase.file) | |
if err != nil { | |
t.Errorf("could not create a file: %s\n %s\n", testcase.file, err) | |
} | |
f.WriteString(testcase.text) | |
f.Close() | |
root, err := readXML(testcase.file) | |
if err != nil { | |
t.Error(err) | |
} | |
var verbs, subjects, objects []string | |
for _, dep := range root.Document.Sentences[0].Dependencies { | |
verbs, err = dep.getVerb() | |
if err != nil { | |
fmt.Print(err) | |
os.Exit(1) | |
} | |
if len(verbs) == 0 { | |
continue | |
} | |
subjects, err = dep.getSubject(verbs) | |
if err != nil { | |
fmt.Print(err) | |
os.Exit(1) | |
} | |
objects, err = dep.getObject(verbs) | |
if err != nil { | |
fmt.Print(err) | |
os.Exit(1) | |
} | |
for i := 0; i < len(verbs); i++ { | |
result := subjects[i] + "\t" + verbs[i] + "\t" + objects[i] | |
if diff := deep.Equal(result, testcase.expect[i]); diff != nil { | |
t.Error(diff) | |
} | |
} | |
} | |
if err := os.Remove(testcase.file); err != nil { | |
t.Errorf("could not delete a file: %s\n %s\n", testcase.file, err) | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment