Created
April 11, 2018 14:18
-
-
Save toVersus/4084efea6446f0d8bc23e3782aaea951 to your computer and use it in GitHub Desktop.
[Language Processing 100 Essentials] #57: Dependency analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/xml" | |
"flag" | |
"fmt" | |
"os" | |
"strconv" | |
"github.com/awalterschulze/gographviz" | |
) | |
type Root struct { | |
Document *Document `xml:"document"` | |
} | |
type Document struct { | |
Sentences Sentences `xml:"sentences>sentence"` | |
Coreferences Coreferences `xml:"coreference>coreference"` | |
} | |
type Sentences []*Sentence | |
type Coreferences []*Coreference | |
type Sentence struct { | |
ID int `xml:"id,attr"` | |
Dependencies Dependencies `xml:"dependencies"` | |
Parse string `xml:"parse,omitempty"` | |
Tokens Tokes `xml:"tokens>token,omitempty"` | |
} | |
type Dependencies []*Dependencie | |
type Tokes []*Token | |
type Token struct { | |
ID string `xml:"id,attr"` | |
Word string `xml:"word,omitempty"` | |
Lemma string `xml:"lemma,omitempty"` | |
CharacterOffsetBegin int `xml:"CharacterOffsetBegin,omitempty"` | |
CharacterOffsetEnd int `xml:"CharacterOffsetEnd,omitempty"` | |
POS string `xml:"POS,omitempty"` | |
NER string `xml:"NER,omitempty"` | |
NormalizedNER string `xml:"NormalizedNER,omitempty"` | |
Speaker string `xml:"Speaker,omitempty"` | |
Timex *Timex `xml:"Timex,omitempty"` | |
} | |
// Timex is the decoded form of a <Timex> element: the tid and type
// attributes plus the element's character data.
type Timex struct {
	Tid   string `xml:"tid,attr"`
	Type  string `xml:"type,attr"`
	Value string `xml:",chardata"`
}
// Governor is the decoded form of a <governor> element: the copy and idx
// attributes plus the element's character data (the governing word).
type Governor struct {
	Copy  string `xml:"copy,attr"`
	Idx   int    `xml:"idx,attr"`
	Value string `xml:",chardata"`
}
// Dependent is the decoded form of a <dependent> element: the copy and idx
// attributes plus the element's character data (the dependent word).
type Dependent struct {
	Copy  string `xml:"copy,attr"`
	Idx   int    `xml:"idx,attr"`
	Value string `xml:",chardata"`
}
type Dep struct { | |
Extra string `xml:"extra,attr"` | |
Type string `xml:"type,attr"` | |
Dependent *Dependent `xml:"dependent,omitempty"` | |
Governor *Governor `xml:"governor,omitempty"` | |
} | |
type Dependencie struct { | |
Type string `xml:"type,attr"` | |
Deps Deps `xml:"dep,omitempty"` | |
} | |
func (dep Dependencie) createDiGraphDotFile(sentenceNum int) error { | |
g := gographviz.NewGraph() | |
if err := g.SetDir(true); err != nil { | |
panic(err) | |
} | |
nodeAttrs := make(map[string]string) | |
nodeAttrs["colorscheme"] = "rdylgn11" | |
nodeAttrs["style"] = "\"solid,filled\"" | |
nodeAttrs["fontcolor"] = "black" | |
nodeAttrs["fontname"] = "\"Migu 1M\"" | |
nodeAttrs["color"] = "10" | |
nodeAttrs["fillcolor"] = "7" | |
edgeAttrs := make(map[string]string) | |
edgeAttrs["color"] = "black" | |
if dep.Type != "collapsed-dependencies" { | |
return nil | |
} | |
for _, d := range dep.Deps { | |
if d.Type == "punct" { | |
continue | |
} | |
if err := g.AddNode("G", strconv.Quote(d.Dependent.Value), nodeAttrs); err != nil { | |
return fmt.Errorf("could not add dependent word into the node:\n %s", err) | |
} | |
if err := g.AddNode("G", strconv.Quote(d.Governor.Value), nodeAttrs); err != nil { | |
return fmt.Errorf("could not add governor word into the node:\n %s", err) | |
} | |
if err := g.AddEdge(strconv.Quote(d.Dependent.Value), strconv.Quote(d.Governor.Value), true, edgeAttrs); err != nil { | |
return fmt.Errorf("could not add dependent and governor word into the edge:\n %s", err) | |
} | |
} | |
file, err := os.Create("digraph.dot") | |
if err != nil { | |
return fmt.Errorf("could not create digraph file:\n %s", err) | |
} | |
defer file.Close() | |
file.WriteString(g.String()) | |
return nil | |
} | |
type Deps []*Dep | |
type Coreference struct { | |
Mentions Mentions `xml:"mention,omitempty"` | |
} | |
type Mentions []*Mention | |
// Mention is the decoded form of one <mention> element. Representative is
// the raw text of the representative attribute (empty when the attribute is
// absent); the other fields are read from the child elements named in their
// struct tags.
type Mention struct {
	Representative string `xml:"representative,attr"`
	Sentence       int    `xml:"sentence,omitempty"`
	Start          int    `xml:"start,omitempty"`
	End            int    `xml:"end,omitempty"`
	Head           int    `xml:"head,omitempty"`
	Text           string `xml:"text,omitempty"`
}
func main() { | |
var filePath, destFilePath string | |
var sentenceNum int | |
flag.StringVar(&filePath, "file", "", "specify a file path") | |
flag.StringVar(&filePath, "f", "", "specify a file path") | |
flag.StringVar(&destFilePath, "dest", "", "specify a dest file path") | |
flag.StringVar(&destFilePath, "d", "", "specify a dest file path") | |
flag.IntVar(&sentenceNum, "n", 1, "specify number of sentence") | |
flag.Parse() | |
r, err := readXML(filePath) | |
if err != nil { | |
fmt.Println(err) | |
os.Exit(1) | |
} | |
for _, dep := range r.Document.Sentences[sentenceNum].Dependencies { | |
if err := dep.createDiGraphDotFile(sentenceNum); err != nil { | |
fmt.Print(err) | |
os.Exit(1) | |
} | |
} | |
} | |
// readXML reads the result of Stanford Core NLP and initiate the Root struct | |
func readXML(path string) (*Root, error) { | |
f, err := os.Open(path) | |
if err != nil { | |
return nil, fmt.Errorf("could not open a file: %s\n %s", path, err) | |
} | |
defer f.Close() | |
r := &Root{} | |
dec := xml.NewDecoder(f) | |
err = dec.Decode(r) | |
if err != nil { | |
return nil, err | |
} | |
return r, nil | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"io/ioutil" | |
"os" | |
"testing" | |
"github.com/go-test/deep" | |
) | |
// tokenizeTests is the table of cases run by TestUpdateToRepresentiveText.
// For each case, name is logged, text is written to the path in file, that
// file is decoded with readXML, and expect is compared with the contents of
// "digraph.dot" after the first sentence's dependency sets are rendered.
var tokenizeTests = []struct { | |
name string | |
file string | |
text string | |
expect string | |
}{ | |
{ | |
name: "should slice the simple sentence into words", | |
file: "simple-test.xml", | |
text: `<?xml version="1.0" encoding="UTF-8"?> | |
<?xml-stylesheet href="CoreNLP-to-HTML.xsl" type="text/xsl"?> | |
<root> | |
<document> | |
<docId>test.txt</docId> | |
<sentences> | |
<sentence id="1"> | |
<tokens> | |
<token id="1"> | |
<word>I</word> | |
<lemma>I</lemma> | |
<CharacterOffsetBegin>0</CharacterOffsetBegin> | |
<CharacterOffsetEnd>1</CharacterOffsetEnd> | |
<POS>PRP</POS> | |
<NER>O</NER> | |
<Speaker>PER0</Speaker> | |
</token> | |
<token id="2"> | |
<word>am</word> | |
<lemma>be</lemma> | |
<CharacterOffsetBegin>2</CharacterOffsetBegin> | |
<CharacterOffsetEnd>4</CharacterOffsetEnd> | |
<POS>VBP</POS> | |
<NER>O</NER> | |
<Speaker>PER0</Speaker> | |
</token> | |
<token id="3"> | |
<word>Bob</word> | |
<lemma>Bob</lemma> | |
<CharacterOffsetBegin>5</CharacterOffsetBegin> | |
<CharacterOffsetEnd>8</CharacterOffsetEnd> | |
<POS>NNP</POS> | |
<NER>PERSON</NER> | |
<Speaker>PER0</Speaker> | |
</token> | |
<token id="4"> | |
<word>.</word> | |
<lemma>.</lemma> | |
<CharacterOffsetBegin>8</CharacterOffsetBegin> | |
<CharacterOffsetEnd>9</CharacterOffsetEnd> | |
<POS>.</POS> | |
<NER>O</NER> | |
<Speaker>PER0</Speaker> | |
</token> | |
</tokens> | |
<parse>(ROOT (S (NP (PRP I)) (VP (VBP am) (NP (NNP Bob))) (. .))) </parse> | |
<dependencies type="basic-dependencies"> | |
<dep type="root"> | |
<governor idx="0">ROOT</governor> | |
<dependent idx="3">Bob</dependent> | |
</dep> | |
<dep type="nsubj"> | |
<governor idx="3">Bob</governor> | |
<dependent idx="1">I</dependent> | |
</dep> | |
<dep type="cop"> | |
<governor idx="3">Bob</governor> | |
<dependent idx="2">am</dependent> | |
</dep> | |
<dep type="punct"> | |
<governor idx="3">Bob</governor> | |
<dependent idx="4">.</dependent> | |
</dep> | |
</dependencies> | |
<dependencies type="collapsed-dependencies"> | |
<dep type="root"> | |
<governor idx="0">ROOT</governor> | |
<dependent idx="3">Bob</dependent> | |
</dep> | |
<dep type="nsubj"> | |
<governor idx="3">Bob</governor> | |
<dependent idx="1">I</dependent> | |
</dep> | |
<dep type="cop"> | |
<governor idx="3">Bob</governor> | |
<dependent idx="2">am</dependent> | |
</dep> | |
<dep type="punct"> | |
<governor idx="3">Bob</governor> | |
<dependent idx="4">.</dependent> | |
</dep> | |
</dependencies> | |
<dependencies type="collapsed-ccprocessed-dependencies"> | |
<dep type="root"> | |
<governor idx="0">ROOT</governor> | |
<dependent idx="3">Bob</dependent> | |
</dep> | |
<dep type="nsubj"> | |
<governor idx="3">Bob</governor> | |
<dependent idx="1">I</dependent> | |
</dep> | |
<dep type="cop"> | |
<governor idx="3">Bob</governor> | |
<dependent idx="2">am</dependent> | |
</dep> | |
<dep type="punct"> | |
<governor idx="3">Bob</governor> | |
<dependent idx="4">.</dependent> | |
</dep> | |
</dependencies> | |
<dependencies type="enhanced-dependencies"> | |
<dep type="root"> | |
<governor idx="0">ROOT</governor> | |
<dependent idx="3">Bob</dependent> | |
</dep> | |
<dep type="nsubj"> | |
<governor idx="3">Bob</governor> | |
<dependent idx="1">I</dependent> | |
</dep> | |
<dep type="cop"> | |
<governor idx="3">Bob</governor> | |
<dependent idx="2">am</dependent> | |
</dep> | |
<dep type="punct"> | |
<governor idx="3">Bob</governor> | |
<dependent idx="4">.</dependent> | |
</dep> | |
</dependencies> | |
<dependencies type="enhanced-plus-plus-dependencies"> | |
<dep type="root"> | |
<governor idx="0">ROOT</governor> | |
<dependent idx="3">Bob</dependent> | |
</dep> | |
<dep type="nsubj"> | |
<governor idx="3">Bob</governor> | |
<dependent idx="1">I</dependent> | |
</dep> | |
<dep type="cop"> | |
<governor idx="3">Bob</governor> | |
<dependent idx="2">am</dependent> | |
</dep> | |
<dep type="punct"> | |
<governor idx="3">Bob</governor> | |
<dependent idx="4">.</dependent> | |
</dep> | |
</dependencies> | |
</sentence> | |
</sentences> | |
<coreference> | |
<coreference> | |
<mention representative="true"> | |
<sentence>1</sentence> | |
<start>3</start> | |
<end>4</end> | |
<head>3</head> | |
<text>Bob</text> | |
</mention> | |
<mention> | |
<sentence>1</sentence> | |
<start>1</start> | |
<end>2</end> | |
<head>1</head> | |
<text>I</text> | |
</mention> | |
</coreference> | |
</coreference> | |
</document> | |
</root>`, | |
expect: `digraph { | |
"Bob"->"ROOT"[ color=black ]; | |
"I"->"Bob"[ color=black ]; | |
"am"->"Bob"[ color=black ]; | |
"Bob" [ color=10, colorscheme=rdylgn11, fillcolor=7, fontcolor=black, fontname="Migu 1M", style="solid,filled" ]; | |
"I" [ color=10, colorscheme=rdylgn11, fillcolor=7, fontcolor=black, fontname="Migu 1M", style="solid,filled" ]; | |
"ROOT" [ color=10, colorscheme=rdylgn11, fillcolor=7, fontcolor=black, fontname="Migu 1M", style="solid,filled" ]; | |
"am" [ color=10, colorscheme=rdylgn11, fillcolor=7, fontcolor=black, fontname="Migu 1M", style="solid,filled" ]; | |
} | |
`, | |
}, | |
} | |
func TestUpdateToRepresentiveText(t *testing.T) { | |
for _, testcase := range tokenizeTests { | |
t.Log(testcase.name) | |
f, err := os.Create(testcase.file) | |
if err != nil { | |
t.Errorf("could not create a file: %s\n %s\n", testcase.file, err) | |
} | |
f.WriteString(testcase.text) | |
f.Close() | |
root, err := readXML(testcase.file) | |
if err != nil { | |
t.Error(err) | |
} | |
for _, dep := range root.Document.Sentences[0].Dependencies { | |
if err := dep.createDiGraphDotFile(0); err != nil { | |
fmt.Print(err) | |
os.Exit(1) | |
} | |
} | |
buf, err := ioutil.ReadFile("digraph.dot") | |
if diff := deep.Equal(string(buf), testcase.expect); diff != nil { | |
t.Error(diff) | |
} | |
if err := os.Remove(testcase.file); err != nil { | |
t.Errorf("could not delete a file: %s\n %s\n", testcase.file, err) | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment