Skip to content

Instantly share code, notes, and snippets.

@toVersus
Created April 11, 2018 14:18
Show Gist options
  • Save toVersus/4084efea6446f0d8bc23e3782aaea951 to your computer and use it in GitHub Desktop.
Save toVersus/4084efea6446f0d8bc23e3782aaea951 to your computer and use it in GitHub Desktop.
[Language Processing 100 Essentials] #57: Dependency analysis
package main
import (
"encoding/xml"
"flag"
"fmt"
"os"
"strconv"
"github.com/awalterschulze/gographviz"
)
type (
	// Root is the value readXML decodes a whole Stanford CoreNLP XML file
	// into. It only keeps the <document> child of the top-level element.
	Root struct {
		Document *Document `xml:"document"`
	}

	// Document gathers the sentences found under <sentences> and the
	// coreference chains found under the outer <coreference> element.
	Document struct {
		Sentences    Sentences    `xml:"sentences>sentence"`
		Coreferences Coreferences `xml:"coreference>coreference"`
	}

	// Sentences is the list of <sentence> elements of a document, in
	// document order.
	Sentences []*Sentence

	// Coreferences is the list of inner <coreference> elements of a
	// document, in document order.
	Coreferences []*Coreference
)
type (
	// Sentence is one <sentence> element: its id attribute, every
	// <dependencies> set attached to it, the text of its <parse> element
	// and its tokens.
	Sentence struct {
		ID           int          `xml:"id,attr"`
		Dependencies Dependencies `xml:"dependencies"`
		Parse        string       `xml:"parse,omitempty"`
		Tokens       Tokes        `xml:"tokens>token,omitempty"`
	}

	// Dependencies is the list of <dependencies> sets of one sentence.
	Dependencies []*Dependencie

	// Tokes is the list of <token> elements of one sentence.
	Tokes []*Token
)
type (
	// Token is one <token> element. ID comes from the id attribute; every
	// other field is filled from the child element of the same name.
	Token struct {
		ID                   string `xml:"id,attr"`
		Word                 string `xml:"word,omitempty"`
		Lemma                string `xml:"lemma,omitempty"`
		CharacterOffsetBegin int    `xml:"CharacterOffsetBegin,omitempty"`
		CharacterOffsetEnd   int    `xml:"CharacterOffsetEnd,omitempty"`
		POS                  string `xml:"POS,omitempty"`
		NER                  string `xml:"NER,omitempty"`
		NormalizedNER        string `xml:"NormalizedNER,omitempty"`
		Speaker              string `xml:"Speaker,omitempty"`
		// Timex stays nil when the token has no <Timex> child.
		Timex *Timex `xml:"Timex,omitempty"`
	}

	// Timex is the <Timex> child of a token: its tid and type attributes
	// plus the character data of the element.
	Timex struct {
		Tid   string `xml:"tid,attr"`
		Type  string `xml:"type,attr"`
		Value string `xml:",chardata"`
	}
)
type (
	// Governor is the <governor> child of a <dep> element: its copy and
	// idx attributes plus the character data of the element.
	Governor struct {
		Copy  string `xml:"copy,attr"`
		Idx   int    `xml:"idx,attr"`
		Value string `xml:",chardata"`
	}

	// Dependent is the <dependent> child of a <dep> element. It has the
	// same layout as Governor.
	Dependent struct {
		Copy  string `xml:"copy,attr"`
		Idx   int    `xml:"idx,attr"`
		Value string `xml:",chardata"`
	}
)
type (
	// Dep is one <dep> relation: its extra and type attributes and the two
	// words it links. Dependent and Governor stay nil when the matching
	// child element is absent.
	Dep struct {
		Extra     string     `xml:"extra,attr"`
		Type      string     `xml:"type,attr"`
		Dependent *Dependent `xml:"dependent,omitempty"`
		Governor  *Governor  `xml:"governor,omitempty"`
	}

	// Dependencie is one <dependencies> set of a sentence. Type holds the
	// set's type attribute (for example "collapsed-dependencies") and Deps
	// its <dep> relations.
	Dependencie struct {
		Type string `xml:"type,attr"`
		Deps Deps   `xml:"dep,omitempty"`
	}
)
// createDiGraphDotFile renders the relations of the receiver as a directed
// Graphviz graph and stores the DOT source in "digraph.dot" in the current
// working directory, replacing any previous content.
//
// Only a set whose Type is "collapsed-dependencies" is rendered; for every
// other set the method returns nil and leaves the file system untouched.
// Relations of type "punct" are skipped. Every remaining relation adds one
// node per word and one edge pointing from the dependent to its governor.
//
// sentenceNum is kept for call-site compatibility; it is not used.
//
// A non-nil error is returned when the graph cannot be built, when a relation
// lacks its governor or dependent, or when the file cannot be created,
// written or closed.
func (dep Dependencie) createDiGraphDotFile(sentenceNum int) error {
	// Filter first so that the other dependency sets cost nothing.
	if dep.Type != "collapsed-dependencies" {
		return nil
	}

	g := gographviz.NewGraph()
	if err := g.SetDir(true); err != nil {
		return fmt.Errorf("could not make the graph directed:\n %s", err)
	}

	// Values holding a space or a comma carry their own double quotes.
	// NOTE(review): presumably gographviz emits attribute values verbatim.
	nodeAttrs := map[string]string{
		"colorscheme": "rdylgn11",
		"style":       "\"solid,filled\"",
		"fontcolor":   "black",
		"fontname":    "\"Migu 1M\"",
		"color":       "10",
		"fillcolor":   "7",
	}
	edgeAttrs := map[string]string{"color": "black"}

	for _, d := range dep.Deps {
		if d == nil || d.Type == "punct" {
			continue
		}
		// Both ends are optional in the XML mapping, so guard against a
		// malformed relation instead of dereferencing a nil pointer.
		if d.Dependent == nil || d.Governor == nil {
			return fmt.Errorf("dependency of type %q lacks a governor or a dependent", d.Type)
		}

		dependent := strconv.Quote(d.Dependent.Value)
		governor := strconv.Quote(d.Governor.Value)

		if err := g.AddNode("G", dependent, nodeAttrs); err != nil {
			return fmt.Errorf("could not add dependent word into the node:\n %s", err)
		}
		if err := g.AddNode("G", governor, nodeAttrs); err != nil {
			return fmt.Errorf("could not add governor word into the node:\n %s", err)
		}
		if err := g.AddEdge(dependent, governor, true, edgeAttrs); err != nil {
			return fmt.Errorf("could not add dependent and governor word into the edge:\n %s", err)
		}
	}

	file, err := os.Create("digraph.dot")
	if err != nil {
		return fmt.Errorf("could not create digraph file:\n %s", err)
	}
	if _, err := file.WriteString(g.String()); err != nil {
		// The write failure is the error worth reporting; the close result
		// cannot add anything useful at this point.
		file.Close()
		return fmt.Errorf("could not write digraph file:\n %s", err)
	}
	// Close explicitly: a failed close can mean the data never reached disk.
	if err := file.Close(); err != nil {
		return fmt.Errorf("could not close digraph file:\n %s", err)
	}
	return nil
}
// Deps is the list of relations held by one dependency set; the Deps field of
// Dependencie fills it from the <dep> child elements.
type Deps []*Dep
type (
	// Coreference is one inner <coreference> element, i.e. the list of
	// <mention> elements it contains.
	Coreference struct {
		Mentions Mentions `xml:"mention,omitempty"`
	}

	// Mentions is the list of <mention> elements of one coreference.
	Mentions []*Mention

	// Mention is one <mention> element. Representative holds the raw
	// representative attribute (empty when the attribute is absent); the
	// remaining fields come from the child elements of the same name.
	Mention struct {
		Representative string `xml:"representative,attr"`
		Sentence       int    `xml:"sentence,omitempty"`
		Start          int    `xml:"start,omitempty"`
		End            int    `xml:"end,omitempty"`
		Head           int    `xml:"head,omitempty"`
		Text           string `xml:"text,omitempty"`
	}
)
// main parses the command-line flags, decodes the XML file named by -file/-f
// and asks every dependency set of the selected sentence to write its graph.
//
// -n is used as-is as a zero-based index into the decoded sentence list, so
// the default value 1 selects the second sentence of the file.
//
// NOTE(review): -dest/-d is parsed but its value is never read here; the
// output location is decided by createDiGraphDotFile.
//
// Any failure is reported on standard error and ends the process with exit
// status 1.
func main() {
	var filePath, destFilePath string
	var sentenceNum int
	flag.StringVar(&filePath, "file", "", "specify a file path")
	flag.StringVar(&filePath, "f", "", "specify a file path")
	flag.StringVar(&destFilePath, "dest", "", "specify a dest file path")
	flag.StringVar(&destFilePath, "d", "", "specify a dest file path")
	flag.IntVar(&sentenceNum, "n", 1, "specify number of sentence")
	flag.Parse()

	r, err := readXML(filePath)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}

	// Document is a pointer and stays nil when the XML has no <document>.
	if r.Document == nil {
		fmt.Fprintf(os.Stderr, "no document found in %s\n", filePath)
		os.Exit(1)
	}

	// Reject an index outside the decoded sentences instead of panicking.
	sentences := r.Document.Sentences
	if sentenceNum < 0 || sentenceNum >= len(sentences) {
		fmt.Fprintf(os.Stderr, "sentence index %d is out of range: %s holds %d sentence(s)\n",
			sentenceNum, filePath, len(sentences))
		os.Exit(1)
	}

	for _, dep := range sentences[sentenceNum].Dependencies {
		if err := dep.createDiGraphDotFile(sentenceNum); err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
	}
}
// readXML opens the Stanford CoreNLP XML output stored at path and decodes it
// into a newly allocated Root. When the file cannot be opened or the content
// cannot be decoded, it returns a nil Root together with the error.
func readXML(path string) (*Root, error) {
	src, openErr := os.Open(path)
	if openErr != nil {
		return nil, fmt.Errorf("could not open a file: %s\n %s", path, openErr)
	}
	defer src.Close()

	root := new(Root)
	if decodeErr := xml.NewDecoder(src).Decode(root); decodeErr != nil {
		return nil, decodeErr
	}
	return root, nil
}
package main
import (
"fmt"
"io/ioutil"
"os"
"testing"
"github.com/go-test/deep"
)
// tokenizeTests is the table of cases run by TestUpdateToRepresentiveText.
// Each case supplies an XML fixture and the DOT text expected in
// "digraph.dot" once the first sentence of that fixture has been rendered.
var tokenizeTests = []struct {
// name is logged at the start of the case.
name string
// file is the path the fixture is written to and later removed from.
file string
// text is the XML content of the fixture.
text string
// expect is the exact content "digraph.dot" is compared against.
expect string
}{
{
name: "should slice the simple sentence into words",
file: "simple-test.xml",
text: `<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet href="CoreNLP-to-HTML.xsl" type="text/xsl"?>
<root>
<document>
<docId>test.txt</docId>
<sentences>
<sentence id="1">
<tokens>
<token id="1">
<word>I</word>
<lemma>I</lemma>
<CharacterOffsetBegin>0</CharacterOffsetBegin>
<CharacterOffsetEnd>1</CharacterOffsetEnd>
<POS>PRP</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="2">
<word>am</word>
<lemma>be</lemma>
<CharacterOffsetBegin>2</CharacterOffsetBegin>
<CharacterOffsetEnd>4</CharacterOffsetEnd>
<POS>VBP</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="3">
<word>Bob</word>
<lemma>Bob</lemma>
<CharacterOffsetBegin>5</CharacterOffsetBegin>
<CharacterOffsetEnd>8</CharacterOffsetEnd>
<POS>NNP</POS>
<NER>PERSON</NER>
<Speaker>PER0</Speaker>
</token>
<token id="4">
<word>.</word>
<lemma>.</lemma>
<CharacterOffsetBegin>8</CharacterOffsetBegin>
<CharacterOffsetEnd>9</CharacterOffsetEnd>
<POS>.</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
</tokens>
<parse>(ROOT (S (NP (PRP I)) (VP (VBP am) (NP (NNP Bob))) (. .))) </parse>
<dependencies type="basic-dependencies">
<dep type="root">
<governor idx="0">ROOT</governor>
<dependent idx="3">Bob</dependent>
</dep>
<dep type="nsubj">
<governor idx="3">Bob</governor>
<dependent idx="1">I</dependent>
</dep>
<dep type="cop">
<governor idx="3">Bob</governor>
<dependent idx="2">am</dependent>
</dep>
<dep type="punct">
<governor idx="3">Bob</governor>
<dependent idx="4">.</dependent>
</dep>
</dependencies>
<dependencies type="collapsed-dependencies">
<dep type="root">
<governor idx="0">ROOT</governor>
<dependent idx="3">Bob</dependent>
</dep>
<dep type="nsubj">
<governor idx="3">Bob</governor>
<dependent idx="1">I</dependent>
</dep>
<dep type="cop">
<governor idx="3">Bob</governor>
<dependent idx="2">am</dependent>
</dep>
<dep type="punct">
<governor idx="3">Bob</governor>
<dependent idx="4">.</dependent>
</dep>
</dependencies>
<dependencies type="collapsed-ccprocessed-dependencies">
<dep type="root">
<governor idx="0">ROOT</governor>
<dependent idx="3">Bob</dependent>
</dep>
<dep type="nsubj">
<governor idx="3">Bob</governor>
<dependent idx="1">I</dependent>
</dep>
<dep type="cop">
<governor idx="3">Bob</governor>
<dependent idx="2">am</dependent>
</dep>
<dep type="punct">
<governor idx="3">Bob</governor>
<dependent idx="4">.</dependent>
</dep>
</dependencies>
<dependencies type="enhanced-dependencies">
<dep type="root">
<governor idx="0">ROOT</governor>
<dependent idx="3">Bob</dependent>
</dep>
<dep type="nsubj">
<governor idx="3">Bob</governor>
<dependent idx="1">I</dependent>
</dep>
<dep type="cop">
<governor idx="3">Bob</governor>
<dependent idx="2">am</dependent>
</dep>
<dep type="punct">
<governor idx="3">Bob</governor>
<dependent idx="4">.</dependent>
</dep>
</dependencies>
<dependencies type="enhanced-plus-plus-dependencies">
<dep type="root">
<governor idx="0">ROOT</governor>
<dependent idx="3">Bob</dependent>
</dep>
<dep type="nsubj">
<governor idx="3">Bob</governor>
<dependent idx="1">I</dependent>
</dep>
<dep type="cop">
<governor idx="3">Bob</governor>
<dependent idx="2">am</dependent>
</dep>
<dep type="punct">
<governor idx="3">Bob</governor>
<dependent idx="4">.</dependent>
</dep>
</dependencies>
</sentence>
</sentences>
<coreference>
<coreference>
<mention representative="true">
<sentence>1</sentence>
<start>3</start>
<end>4</end>
<head>3</head>
<text>Bob</text>
</mention>
<mention>
<sentence>1</sentence>
<start>1</start>
<end>2</end>
<head>1</head>
<text>I</text>
</mention>
</coreference>
</coreference>
</document>
</root>`,
expect: `digraph {
"Bob"->"ROOT"[ color=black ];
"I"->"Bob"[ color=black ];
"am"->"Bob"[ color=black ];
"Bob" [ color=10, colorscheme=rdylgn11, fillcolor=7, fontcolor=black, fontname="Migu 1M", style="solid,filled" ];
"I" [ color=10, colorscheme=rdylgn11, fillcolor=7, fontcolor=black, fontname="Migu 1M", style="solid,filled" ];
"ROOT" [ color=10, colorscheme=rdylgn11, fillcolor=7, fontcolor=black, fontname="Migu 1M", style="solid,filled" ];
"am" [ color=10, colorscheme=rdylgn11, fillcolor=7, fontcolor=black, fontname="Migu 1M", style="solid,filled" ];
}
`,
},
}
// TestUpdateToRepresentiveText runs every case of tokenizeTests as a subtest:
// it writes the XML fixture to disk, decodes it with readXML, renders the
// dependency sets of the first sentence with createDiGraphDotFile and compares
// the resulting "digraph.dot" with the expected DOT text.
//
// A failed set-up step aborts the subtest with t.Fatal, so later steps never
// run on a nil root or a missing file. The fixture and the generated graph
// are removed when the subtest ends.
func TestUpdateToRepresentiveText(t *testing.T) {
	const dotFile = "digraph.dot"

	for _, testcase := range tokenizeTests {
		t.Run(testcase.name, func(t *testing.T) {
			if err := writeFixture(testcase.file, testcase.text); err != nil {
				t.Fatal(err)
			}
			defer func() {
				if err := os.Remove(testcase.file); err != nil {
					t.Errorf("could not delete a file: %s\n %s\n", testcase.file, err)
				}
			}()

			// Drop any graph left by an earlier run so the comparison below
			// can only succeed on output produced by this subtest.
			if err := os.Remove(dotFile); err != nil && !os.IsNotExist(err) {
				t.Fatalf("could not delete a file: %s\n %s\n", dotFile, err)
			}
			defer func() {
				if err := os.Remove(dotFile); err != nil && !os.IsNotExist(err) {
					t.Errorf("could not delete a file: %s\n %s\n", dotFile, err)
				}
			}()

			root, err := readXML(testcase.file)
			if err != nil {
				t.Fatal(err)
			}
			if root.Document == nil || len(root.Document.Sentences) == 0 {
				t.Fatalf("no sentence decoded from %s", testcase.file)
			}

			for _, dep := range root.Document.Sentences[0].Dependencies {
				// Fail the subtest rather than exiting the whole test binary.
				if err := dep.createDiGraphDotFile(0); err != nil {
					t.Fatal(err)
				}
			}

			buf, err := ioutil.ReadFile(dotFile)
			if err != nil {
				t.Fatalf("could not read a file: %s\n %s\n", dotFile, err)
			}
			if diff := deep.Equal(string(buf), testcase.expect); diff != nil {
				t.Error(diff)
			}
		})
	}
}

// writeFixture creates (or truncates) the file at path and stores content in
// it, reporting the first create, write or close failure.
func writeFixture(path, content string) error {
	f, err := os.Create(path)
	if err != nil {
		return fmt.Errorf("could not create a file: %s\n %s", path, err)
	}
	if _, err := f.WriteString(content); err != nil {
		// The write failure is the error worth reporting; the close result
		// cannot add anything useful at this point.
		f.Close()
		return fmt.Errorf("could not write a file: %s\n %s", path, err)
	}
	if err := f.Close(); err != nil {
		return fmt.Errorf("could not close a file: %s\n %s", path, err)
	}
	return nil
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment