@toVersus
Created April 12, 2018 13:00
[Language Processing 100 Essentials] #58: Extract tuples
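Reads the XML emitted by Stanford CoreNLP and, for every verb that governs both an nsubj (nominal subject) and a dobj (direct object) relation in the collapsed dependencies, prints the subject, the predicate and the object as one tab-separated line. For the sentence parsed in the accompanying test, the output is:

understanding	enabling	computers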
package main
import (
"encoding/xml"
"flag"
"fmt"
"os"
"sort"
)
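// The types below mirror the XML document structure produced by Stanford CoreNLP
// (sentences, tokens, dependency relations and coreference chains).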
type Root struct {
Document *Document `xml:"document"`
}
type Document struct {
Sentences Sentences `xml:"sentences>sentence"`
Coreferences Coreferences `xml:"coreference>coreference"`
}
type Sentences []*Sentence
type Coreferences []*Coreference
type Sentence struct {
ID int `xml:"id,attr"`
Dependencies Dependencies `xml:"dependencies"`
Parse string `xml:"parse,omitempty"`
Tokens Tokens `xml:"tokens>token,omitempty"`
}
type Dependencies []*Dependencie
type Tokens []*Token
type Token struct {
ID string `xml:"id,attr"`
Word string `xml:"word,omitempty"`
Lemma string `xml:"lemma,omitempty"`
CharacterOffsetBegin int `xml:"CharacterOffsetBegin,omitempty"`
CharacterOffsetEnd int `xml:"CharacterOffsetEnd,omitempty"`
POS string `xml:"POS,omitempty"`
NER string `xml:"NER,omitempty"`
NormalizedNER string `xml:"NormalizedNER,omitempty"`
Speaker string `xml:"Speaker,omitempty"`
Timex *Timex `xml:"Timex,omitempty"`
}
type Timex struct {
Tid string `xml:"tid,attr"`
Type string `xml:"type,attr"`
Value string `xml:",chardata"`
}
type Governor struct {
Copy string `xml:"copy,attr"`
Idx int `xml:"idx,attr"`
Value string `xml:",chardata"`
}
type Dependent struct {
Copy string `xml:"copy,attr"`
Idx int `xml:"idx,attr"`
Value string `xml:",chardata"`
}
type Dep struct {
Extra string `xml:"extra,attr"`
Type string `xml:"type,attr"`
Dependent *Dependent `xml:"dependent,omitempty"`
Governor *Governor `xml:"governor,omitempty"`
}
type Dependencie struct {
Type string `xml:"type,attr"`
Deps Deps `xml:"dep,omitempty"`
}
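// getVerb returns the verbs that can head a (subject, predicate, object) triple:
// for a collapsed-dependencies parse, these are the governors that appear in both
// an nsubj and a dobj relation. For any other dependencies type it returns nil.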
func (dep Dependencie) getVerb() ([]string, error) {
if dep.Type != "collapsed-dependencies" {
return nil, nil
}
nsubj := map[int]string{}
for _, d := range dep.Deps {
if d.Type != "nsubj" {
continue
}
nsubj[d.Governor.Idx] = d.Governor.Value
}
// Collect the indices of governors that also take a direct object
var govIdxs []int
for _, d := range dep.Deps {
if d.Type != "dobj" {
continue
}
if _, ok := nsubj[d.Governor.Idx]; ok {
govIdxs = append(govIdxs, d.Governor.Idx)
}
}
// Store verbs in ascending order of index
sort.Ints(govIdxs)
var verbs []string
for _, i := range govIdxs {
verbs = append(verbs, nsubj[i])
}
return verbs, nil
}
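// getSubject returns, for each given verb, the nsubj dependents of that verb,
// i.e. the subjects of the extracted triples.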
func (dep Dependencie) getSubject(verbs []string) ([]string, error) {
var subjects []string
for _, verb := range verbs {
for _, d := range dep.Deps {
if d.Type != "nsubj" {
continue
}
if d.Governor.Value != verb {
continue
}
subjects = append(subjects, d.Dependent.Value)
}
}
return subjects, nil
}
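// getObject returns, for each given verb, the dobj dependents of that verb,
// i.e. the direct objects of the extracted triples.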
func (dep Dependencie) getObject(verbs []string) ([]string, error) {
var objects []string
for _, verb := range verbs {
for _, d := range dep.Deps {
if d.Type != "dobj" {
continue
}
if d.Governor.Value != verb {
continue
}
objects = append(objects, d.Dependent.Value)
}
}
return objects, nil
}
type Deps []*Dep
type Coreference struct {
Mentions Mentions `xml:"mention,omitempty"`
}
type Mentions []*Mention
type Mention struct {
Representative string `xml:"representative,attr"`
Sentence int `xml:"sentence,omitempty"`
Start int `xml:"start,omitempty"`
End int `xml:"end,omitempty"`
Head int `xml:"head,omitempty"`
Text string `xml:"text,omitempty"`
}
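// main parses the CoreNLP XML given by -file/-f and prints one tab-separated
// subject, predicate, object triple per line for the sentence selected by -n.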
func main() {
var filePath, destFilePath string
var sentenceNum int
flag.StringVar(&filePath, "file", "", "specify a file path")
flag.StringVar(&filePath, "f", "", "specify a file path")
flag.StringVar(&destFilePath, "dest", "", "specify a dest file path")
flag.StringVar(&destFilePath, "d", "", "specify a dest file path")
flag.IntVar(&sentenceNum, "n", 1, "specify number of sentence")
flag.Parse()
r, err := readXML(filePath)
if err != nil {
fmt.Println(err)
os.Exit(1)
}
var verbs, subjects, objects []string
if sentenceNum < 0 || sentenceNum >= len(r.Document.Sentences) {
fmt.Println("sentence index out of range:", sentenceNum)
os.Exit(1)
}
for _, dep := range r.Document.Sentences[sentenceNum].Dependencies {
verbs, err = dep.getVerb()
if err != nil {
fmt.Print(err)
os.Exit(1)
}
if len(verbs) == 0 {
continue
}
subjects, err = dep.getSubject(verbs)
if err != nil {
fmt.Print(err)
os.Exit(1)
}
objects, err = dep.getObject(verbs)
if err != nil {
fmt.Print(err)
os.Exit(1)
}
for i := 0; i < len(verbs); i++ {
fmt.Println(subjects[i] + "\t" + verbs[i] + "\t" + objects[i])
}
}
}
// readXML reads the XML output of Stanford CoreNLP and decodes it into the Root struct
func readXML(path string) (*Root, error) {
f, err := os.Open(path)
if err != nil {
return nil, fmt.Errorf("could not open a file: %s\n %s", path, err)
}
defer f.Close()
r := &Root{}
dec := xml.NewDecoder(f)
err = dec.Decode(r)
if err != nil {
return nil, err
}
return r, nil
}
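A typical invocation might look like the following (the input file name is only an example; the XML is expected to come from the Stanford CoreNLP pipeline with dependency parsing enabled):

go run main.go -f nlp.txt.xml

The test below writes a small CoreNLP XML document to disk, runs it through readXML and the extraction methods, and checks the resulting triple.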
package main
import (
"fmt"
"os"
"testing"
"github.com/go-test/deep"
)
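// extractTupleTests runs a hand-written CoreNLP XML document through readXML and
// checks the (subject, predicate, object) triples extracted from it.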
var extractTupleTests = []struct {
name string
file string
text string
expect []string
}{
{
name: "should slice the simple sentence into words",
file: "simple-test.xml",
text: `<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet href="CoreNLP-to-HTML.xsl" type="text/xsl"?>
<root>
<document>
<docId>test.txt</docId>
<sentences>
<sentence id="1">
<tokens>
<token id="1">
<word>Many</word>
<lemma>many</lemma>
<CharacterOffsetBegin>0</CharacterOffsetBegin>
<CharacterOffsetEnd>4</CharacterOffsetEnd>
<POS>JJ</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="2">
<word>challenges</word>
<lemma>challenge</lemma>
<CharacterOffsetBegin>5</CharacterOffsetBegin>
<CharacterOffsetEnd>15</CharacterOffsetEnd>
<POS>NNS</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="3">
<word>in</word>
<lemma>in</lemma>
<CharacterOffsetBegin>16</CharacterOffsetBegin>
<CharacterOffsetEnd>18</CharacterOffsetEnd>
<POS>IN</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="4">
<word>NLP</word>
<lemma>nlp</lemma>
<CharacterOffsetBegin>19</CharacterOffsetBegin>
<CharacterOffsetEnd>22</CharacterOffsetEnd>
<POS>NN</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="5">
<word>involve</word>
<lemma>involve</lemma>
<CharacterOffsetBegin>23</CharacterOffsetBegin>
<CharacterOffsetEnd>30</CharacterOffsetEnd>
<POS>VBP</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="6">
<word>natural</word>
<lemma>natural</lemma>
<CharacterOffsetBegin>31</CharacterOffsetBegin>
<CharacterOffsetEnd>38</CharacterOffsetEnd>
<POS>JJ</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="7">
<word>language</word>
<lemma>language</lemma>
<CharacterOffsetBegin>39</CharacterOffsetBegin>
<CharacterOffsetEnd>47</CharacterOffsetEnd>
<POS>NN</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="8">
<word>understanding</word>
<lemma>understanding</lemma>
<CharacterOffsetBegin>48</CharacterOffsetBegin>
<CharacterOffsetEnd>61</CharacterOffsetEnd>
<POS>NN</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="9">
<word>,</word>
<lemma>,</lemma>
<CharacterOffsetBegin>61</CharacterOffsetBegin>
<CharacterOffsetEnd>62</CharacterOffsetEnd>
<POS>,</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="10">
<word>that</word>
<lemma>that</lemma>
<CharacterOffsetBegin>63</CharacterOffsetBegin>
<CharacterOffsetEnd>67</CharacterOffsetEnd>
<POS>WDT</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="11">
<word>is</word>
<lemma>be</lemma>
<CharacterOffsetBegin>68</CharacterOffsetBegin>
<CharacterOffsetEnd>70</CharacterOffsetEnd>
<POS>VBZ</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="12">
<word>,</word>
<lemma>,</lemma>
<CharacterOffsetBegin>70</CharacterOffsetBegin>
<CharacterOffsetEnd>71</CharacterOffsetEnd>
<POS>,</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="13">
<word>enabling</word>
<lemma>enable</lemma>
<CharacterOffsetBegin>72</CharacterOffsetBegin>
<CharacterOffsetEnd>80</CharacterOffsetEnd>
<POS>VBG</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="14">
<word>computers</word>
<lemma>computer</lemma>
<CharacterOffsetBegin>81</CharacterOffsetBegin>
<CharacterOffsetEnd>90</CharacterOffsetEnd>
<POS>NNS</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="15">
<word>to</word>
<lemma>to</lemma>
<CharacterOffsetBegin>91</CharacterOffsetBegin>
<CharacterOffsetEnd>93</CharacterOffsetEnd>
<POS>TO</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="16">
<word>derive</word>
<lemma>derive</lemma>
<CharacterOffsetBegin>94</CharacterOffsetBegin>
<CharacterOffsetEnd>100</CharacterOffsetEnd>
<POS>VB</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="17">
<word>meaning</word>
<lemma>meaning</lemma>
<CharacterOffsetBegin>101</CharacterOffsetBegin>
<CharacterOffsetEnd>108</CharacterOffsetEnd>
<POS>NN</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="18">
<word>.</word>
<lemma>.</lemma>
<CharacterOffsetBegin>108</CharacterOffsetBegin>
<CharacterOffsetEnd>109</CharacterOffsetEnd>
<POS>.</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
</tokens>
<parse>(ROOT (S (NP (NP (JJ Many) (NNS challenges)) (PP (IN in) (NP (NN NLP)))) (VP (VBP involve) (S (NP (NP (JJ natural) (NN language) (NN understanding)) (, ,) (SBAR (WHNP (WDT that)) (S (VP (VBZ is)))) (, ,)) (VP (VBG enabling) (NP (NNS computers)) (S (VP (TO to) (VP (VB derive) (NP (NN meaning)))))))) (. .))) </parse>
<dependencies type="collapsed-dependencies">
<dep type="root">
<governor idx="0">ROOT</governor>
<dependent idx="5">involve</dependent>
</dep>
<dep type="amod">
<governor idx="2">challenges</governor>
<dependent idx="1">Many</dependent>
</dep>
<dep type="nsubj">
<governor idx="5">involve</governor>
<dependent idx="2">challenges</dependent>
</dep>
<dep type="case">
<governor idx="4">NLP</governor>
<dependent idx="3">in</dependent>
</dep>
<dep type="nmod:in">
<governor idx="2">challenges</governor>
<dependent idx="4">NLP</dependent>
</dep>
<dep type="amod">
<governor idx="8">understanding</governor>
<dependent idx="6">natural</dependent>
</dep>
<dep type="compound">
<governor idx="8">understanding</governor>
<dependent idx="7">language</dependent>
</dep>
<dep type="nsubj">
<governor idx="13">enabling</governor>
<dependent idx="8">understanding</dependent>
</dep>
<dep type="punct">
<governor idx="8">understanding</governor>
<dependent idx="9">,</dependent>
</dep>
<dep type="nsubj">
<governor idx="11">is</governor>
<dependent idx="10">that</dependent>
</dep>
<dep type="acl:relcl">
<governor idx="8">understanding</governor>
<dependent idx="11">is</dependent>
</dep>
<dep type="punct">
<governor idx="8">understanding</governor>
<dependent idx="12">,</dependent>
</dep>
<dep type="dep">
<governor idx="5">involve</governor>
<dependent idx="13">enabling</dependent>
</dep>
<dep type="dobj">
<governor idx="13">enabling</governor>
<dependent idx="14">computers</dependent>
</dep>
<dep type="mark">
<governor idx="16">derive</governor>
<dependent idx="15">to</dependent>
</dep>
<dep type="advcl">
<governor idx="13">enabling</governor>
<dependent idx="16">derive</dependent>
</dep>
<dep type="dobj">
<governor idx="16">derive</governor>
<dependent idx="17">meaning</dependent>
</dep>
<dep type="punct">
<governor idx="5">involve</governor>
<dependent idx="18">.</dependent>
</dep>
</dependencies>
</sentence>
</sentences>
<coreference/>
</document>
</root>`,
expect: []string{"understanding enabling computers"},
},
}
func TestExtractTuples(t *testing.T) {
for _, testcase := range extractTupleTests {
t.Log(testcase.name)
f, err := os.Create(testcase.file)
if err != nil {
t.Fatalf("could not create a file: %s\n %s\n", testcase.file, err)
}
f.WriteString(testcase.text)
f.Close()
root, err := readXML(testcase.file)
if err != nil {
t.Fatal(err)
}
var verbs, subjects, objects []string
for _, dep := range root.Document.Sentences[0].Dependencies {
verbs, err = dep.getVerb()
if err != nil {
t.Fatal(err)
}
if len(verbs) == 0 {
continue
}
subjects, err = dep.getSubject(verbs)
if err != nil {
t.Fatal(err)
}
objects, err = dep.getObject(verbs)
if err != nil {
t.Fatal(err)
}
for i := 0; i < len(verbs); i++ {
result := subjects[i] + "\t" + verbs[i] + "\t" + objects[i]
if diff := deep.Equal(result, testcase.expect[i]); diff != nil {
t.Error(diff)
}
}
}
if err := os.Remove(testcase.file); err != nil {
t.Errorf("could not delete a file: %s\n %s\n", testcase.file, err)
}
}
}