Skip to content

Instantly share code, notes, and snippets.

@toVersus
Created April 10, 2018 11:24
Show Gist options
  • Save toVersus/c5ffbab86c43055cc7c573f33698d0f2 to your computer and use it in GitHub Desktop.
Save toVersus/c5ffbab86c43055cc7c573f33698d0f2 to your computer and use it in GitHub Desktop.
[Language Processing 100 Essentials] #56: Co-reference analysis
package main
import (
"encoding/xml"
"flag"
"fmt"
"os"
"strings"
)
// Root is the decode target for the top-level element of the XML file read by
// readXML. It declares no XMLName field; only the <document> child is captured.
type Root struct {
// Document is filled from the <document> child and stays nil when that
// element is absent.
Document *Document `xml:"document"`
}
// Document holds the sentences and the coreference chains found inside one
// <document> element.
type Document struct {
// Sentences collects every <sentence> nested in <sentences>, in document order.
Sentences []*Sentence `xml:"sentences>sentence"`
// Coreferences collects every inner <coreference> nested in the outer
// <coreference> element; each entry groups the mentions of one chain.
Coreferences []*Coreference `xml:"coreference>coreference"`
}
// updateToRepresentiveText rewrites every non-representative mention of each
// coreference chain as "[representative text] (original mention)" and returns
// the document as text: the words of each sentence joined by single spaces,
// one sentence per line.
//
// Mention.Sentence and Mention.Start are used as 1-based indices and
// Mention.End as a 1-based exclusive bound, so the last token of a mention is
// Tokens[End-2]. Although the receiver is a value, the tokens are reached
// through pointers, so their Word fields are modified in place; calling the
// method twice on the same document wraps the mentions twice.
//
// Mentions whose indices do not address existing tokens are skipped instead of
// causing an index-out-of-range panic, and a chain without any representative
// mention is left untouched because there is nothing to substitute.
func (doc Document) updateToRepresentiveText() string {
	for _, coref := range doc.Coreferences {
		if coref == nil {
			continue
		}

		// Find the representative text up front so that mentions listed
		// before the representative are not annotated with an empty string.
		repl, found := "", false
		for _, mention := range coref.Mention {
			if mention != nil && mention.Representative == "true" {
				repl, found = mention.Text, true
				break
			}
		}
		if !found {
			continue
		}

		for _, mention := range coref.Mention {
			if mention == nil {
				continue
			}
			if mention.Representative == "true" {
				// Later mentions refer to the most recent representative.
				repl = mention.Text
				continue
			}

			sentIdx := mention.Sentence - 1
			if sentIdx < 0 || sentIdx >= len(doc.Sentences) || doc.Sentences[sentIdx] == nil {
				continue
			}
			tokens := doc.Sentences[sentIdx].Tokens

			// End is exclusive and 1-based, hence the -2 for the last token.
			first, last := mention.Start-1, mention.End-2
			if first < 0 || last < first || last >= len(tokens) {
				continue
			}
			if tokens[first] == nil || tokens[last] == nil {
				continue
			}

			tokens[first].Word = "[" + repl + "] (" + tokens[first].Word
			tokens[last].Word += ")"
		}
	}

	var text strings.Builder
	for _, s := range doc.Sentences {
		if s == nil {
			continue
		}
		text.WriteString(s.wordString())
		text.WriteByte('\n')
	}
	return text.String()
}
// Sentence is one <sentence> element of the document.
type Sentence struct {
// ID is the id attribute, decoded as an integer.
ID int `xml:"id,attr"`
// Dependencies holds one entry per <dependencies> child.
Dependencies []*Dependencies `xml:"dependencies"`
// Parse is the text content of the <parse> child.
Parse string `xml:"parse,omitempty"`
// Tokens holds the <token> elements nested in <tokens>, in document order;
// wordString joins their Word fields.
Tokens []*Token `xml:"tokens>token,omitempty"`
}
// wordString returns the Word of every token in the sentence, in order,
// separated by single spaces. A sentence without tokens yields "".
func (sent Sentence) wordString() string {
	var joined strings.Builder
	for i, tok := range sent.Tokens {
		if i > 0 {
			joined.WriteByte(' ')
		}
		joined.WriteString(tok.Word)
	}
	return joined.String()
}
// Token is one <token> element of a sentence. Apart from ID, every field is
// filled from the child element named in its struct tag.
type Token struct {
// ID is the id attribute, kept as a string (Sentence.ID is decoded as an int).
ID string `xml:"id,attr"`
// Word is the text that wordString and updateToRepresentiveText read and modify.
Word string `xml:"word,omitempty"`
Lemma string `xml:"lemma,omitempty"`
CharacterOffsetBegin int `xml:"CharacterOffsetBegin,omitempty"`
CharacterOffsetEnd int `xml:"CharacterOffsetEnd,omitempty"`
POS string `xml:"POS,omitempty"`
// NER is compared against "PERSON" by getPersonName.
NER string `xml:"NER,omitempty"`
NormalizedNER string `xml:"NormalizedNER,omitempty"`
Speaker string `xml:"Speaker,omitempty"`
// Timex stays nil when the token has no <Timex> child.
Timex *Timex `xml:"Timex,omitempty"`
}
// getWordTaggedByPos renders the token as one tab-separated record made of
// its word, its lemma and its POS value, in that order.
func (token *Token) getWordTaggedByPos() string {
	const layout = "%s\t%s\t%s"
	word, lemma, pos := token.Word, token.Lemma, token.POS
	return fmt.Sprintf(layout, word, lemma, pos)
}
// getPersonName returns the token's word when its NER value is exactly
// "PERSON"; for every other NER value it returns the empty string.
func (token *Token) getPersonName() string {
	if token.NER != "PERSON" {
		return ""
	}
	return token.Word
}
// Timex is the <Timex> child of a token.
type Timex struct {
// Tid is the tid attribute.
Tid string `xml:"tid,attr"`
// Type is the type attribute.
Type string `xml:"type,attr"`
// Value is the character data enclosed by the element.
Value string `xml:",chardata"`
}
// Governor is the <governor> child of a <dep> element.
type Governor struct {
// Copy is the copy attribute.
Copy string `xml:"copy,attr"`
// Idx is the idx attribute, decoded as an integer.
Idx int `xml:"idx,attr"`
// Value is the character data enclosed by the element.
Value string `xml:",chardata"`
}
// Dependent is the <dependent> child of a <dep> element; it has the same
// shape as Governor.
type Dependent struct {
// Copy is the copy attribute.
Copy string `xml:"copy,attr"`
// Idx is the idx attribute, decoded as an integer.
Idx int `xml:"idx,attr"`
// Value is the character data enclosed by the element.
Value string `xml:",chardata"`
}
// Dep is one <dep> element: a typed relation between a governor and a dependent.
type Dep struct {
// Extra is the extra attribute.
Extra string `xml:"extra,attr"`
// Type is the type attribute.
Type string `xml:"type,attr"`
// Dependent and Governor are filled from the children of the same name and
// stay nil when the child is absent.
Dependent *Dependent `xml:"dependent,omitempty"`
Governor *Governor `xml:"governor,omitempty"`
}
// Dependencies is one <dependencies> element of a sentence.
type Dependencies struct {
// Type is the type attribute of the element.
Type string `xml:"type,attr"`
// Dep holds one entry per <dep> child.
Dep []*Dep `xml:"dep,omitempty"`
}
// Coreference is one inner <coreference> element: a chain of mentions.
type Coreference struct {
// Mention holds one entry per <mention> child, in document order.
// updateToRepresentiveText walks this slice in order.
Mention []*Mention `xml:"mention,omitempty"`
}
// Mention is one <mention> element of a coreference chain.
type Mention struct {
// Representative is the representative attribute; updateToRepresentiveText
// treats the mention as the representative one when it equals "true".
Representative string `xml:"representative,attr"`
// Sentence and Start are used by updateToRepresentiveText as 1-based
// indices into Document.Sentences and Sentence.Tokens.
Sentence int `xml:"sentence,omitempty"`
Start int `xml:"start,omitempty"`
// End is used as a 1-based exclusive bound: the last token of the mention
// is Tokens[End-2].
End int `xml:"end,omitempty"`
// Head is decoded from <head>; nothing in this file reads it.
Head int `xml:"head,omitempty"`
// Text is the content of <text>; for the representative mention it becomes
// the replacement text.
Text string `xml:"text,omitempty"`
}
// main parses the command-line flags, decodes the XML file named by -file/-f
// and prints the text produced by updateToRepresentiveText to standard output.
//
// Failures are reported on standard error, so they cannot be mistaken for the
// generated text, and the process then exits with status 1.
func main() {
	var filePath, destFilePath string
	flag.StringVar(&filePath, "file", "", "specify a file path")
	flag.StringVar(&filePath, "f", "", "specify a file path")
	// NOTE(review): -dest/-d is accepted, but nothing below reads
	// destFilePath; the result always goes to standard output.
	flag.StringVar(&destFilePath, "dest", "", "specify a dest file path")
	flag.StringVar(&destFilePath, "d", "", "specify a dest file path")
	flag.Parse()

	r, err := readXML(filePath)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	// Document stays nil when the XML has no <document> element; calling the
	// value-receiver method through a nil pointer would panic.
	if r.Document == nil {
		fmt.Fprintf(os.Stderr, "no document element found in %s\n", filePath)
		os.Exit(1)
	}

	fmt.Print(r.Document.updateToRepresentiveText())
}
// readXML opens the file at path, which the program expects to be Stanford
// CoreNLP XML output, and decodes it into a newly allocated Root.
//
// On failure the returned *Root is nil. The error names the file and wraps
// the underlying cause with %w, so callers can still inspect it with
// errors.Is or errors.As.
func readXML(path string) (*Root, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("could not open a file %q: %w", path, err)
	}
	// The file is only read, so a Close error carries no information.
	defer f.Close()

	r := &Root{}
	if err := xml.NewDecoder(f).Decode(r); err != nil {
		return nil, fmt.Errorf("could not decode XML from %q: %w", path, err)
	}
	return r, nil
}
package main
import (
"os"
"testing"
"github.com/go-test/deep"
)
// tokenizeTests is the fixture table consumed by TestUpdateToRepresentiveText.
// Each case writes text to file, decodes it with readXML and compares the
// result of updateToRepresentiveText with expect.
//
// NOTE(review): the identifier and the case name talk about tokenizing and
// slicing into words, while the only visible consumer checks coreference
// substitution; presumably both were carried over from another exercise.
var tokenizeTests = []struct {
// name is logged at the start of the case.
name string
// file is the path the XML is written to and removed from during the case.
file string
// text is the XML document written to file.
text string
// expect is the exact text the method must return, trailing newline included.
expect string
}{
{
name: "should slice the simple sentence into words",
file: "simple-test.xml",
text: `<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet href="CoreNLP-to-HTML.xsl" type="text/xsl"?>
<root>
<document>
<docId>test.txt</docId>
<sentences>
<sentence id="1">
<tokens>
<token id="1">
<word>I</word>
<lemma>I</lemma>
<CharacterOffsetBegin>0</CharacterOffsetBegin>
<CharacterOffsetEnd>1</CharacterOffsetEnd>
<POS>PRP</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="2">
<word>am</word>
<lemma>be</lemma>
<CharacterOffsetBegin>2</CharacterOffsetBegin>
<CharacterOffsetEnd>4</CharacterOffsetEnd>
<POS>VBP</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="3">
<word>Bob</word>
<lemma>Bob</lemma>
<CharacterOffsetBegin>5</CharacterOffsetBegin>
<CharacterOffsetEnd>8</CharacterOffsetEnd>
<POS>NNP</POS>
<NER>PERSON</NER>
<Speaker>PER0</Speaker>
</token>
<token id="4">
<word>.</word>
<lemma>.</lemma>
<CharacterOffsetBegin>8</CharacterOffsetBegin>
<CharacterOffsetEnd>9</CharacterOffsetEnd>
<POS>.</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
</tokens>
<parse>(ROOT (S (NP (PRP I)) (VP (VBP am) (NP (NNP Bob))) (. .))) </parse>
<dependencies type="basic-dependencies">
<dep type="root">
<governor idx="0">ROOT</governor>
<dependent idx="3">Bob</dependent>
</dep>
<dep type="nsubj">
<governor idx="3">Bob</governor>
<dependent idx="1">I</dependent>
</dep>
<dep type="cop">
<governor idx="3">Bob</governor>
<dependent idx="2">am</dependent>
</dep>
<dep type="punct">
<governor idx="3">Bob</governor>
<dependent idx="4">.</dependent>
</dep>
</dependencies>
<dependencies type="collapsed-dependencies">
<dep type="root">
<governor idx="0">ROOT</governor>
<dependent idx="3">Bob</dependent>
</dep>
<dep type="nsubj">
<governor idx="3">Bob</governor>
<dependent idx="1">I</dependent>
</dep>
<dep type="cop">
<governor idx="3">Bob</governor>
<dependent idx="2">am</dependent>
</dep>
<dep type="punct">
<governor idx="3">Bob</governor>
<dependent idx="4">.</dependent>
</dep>
</dependencies>
<dependencies type="collapsed-ccprocessed-dependencies">
<dep type="root">
<governor idx="0">ROOT</governor>
<dependent idx="3">Bob</dependent>
</dep>
<dep type="nsubj">
<governor idx="3">Bob</governor>
<dependent idx="1">I</dependent>
</dep>
<dep type="cop">
<governor idx="3">Bob</governor>
<dependent idx="2">am</dependent>
</dep>
<dep type="punct">
<governor idx="3">Bob</governor>
<dependent idx="4">.</dependent>
</dep>
</dependencies>
<dependencies type="enhanced-dependencies">
<dep type="root">
<governor idx="0">ROOT</governor>
<dependent idx="3">Bob</dependent>
</dep>
<dep type="nsubj">
<governor idx="3">Bob</governor>
<dependent idx="1">I</dependent>
</dep>
<dep type="cop">
<governor idx="3">Bob</governor>
<dependent idx="2">am</dependent>
</dep>
<dep type="punct">
<governor idx="3">Bob</governor>
<dependent idx="4">.</dependent>
</dep>
</dependencies>
<dependencies type="enhanced-plus-plus-dependencies">
<dep type="root">
<governor idx="0">ROOT</governor>
<dependent idx="3">Bob</dependent>
</dep>
<dep type="nsubj">
<governor idx="3">Bob</governor>
<dependent idx="1">I</dependent>
</dep>
<dep type="cop">
<governor idx="3">Bob</governor>
<dependent idx="2">am</dependent>
</dep>
<dep type="punct">
<governor idx="3">Bob</governor>
<dependent idx="4">.</dependent>
</dep>
</dependencies>
</sentence>
</sentences>
<coreference>
<coreference>
<mention representative="true">
<sentence>1</sentence>
<start>3</start>
<end>4</end>
<head>3</head>
<text>Bob</text>
</mention>
<mention>
<sentence>1</sentence>
<start>1</start>
<end>2</end>
<head>1</head>
<text>I</text>
</mention>
</coreference>
</coreference>
</document>
</root>`,
expect: `[Bob] (I) am Bob .
`,
},
}
// TestUpdateToRepresentiveText runs every case of tokenizeTests as a subtest:
// it writes the case's XML to testcase.file, decodes it with readXML and
// compares the output of updateToRepresentiveText with the expected text.
//
// Setup failures abort the subtest with Fatal so that a nil file or nil root
// is never used afterwards, and the fixture file is removed by a deferred
// call even when the subtest stops early.
func TestUpdateToRepresentiveText(t *testing.T) {
	for _, testcase := range tokenizeTests {
		t.Run(testcase.name, func(t *testing.T) {
			f, err := os.Create(testcase.file)
			if err != nil {
				t.Fatalf("could not create a file: %s\n  %s\n", testcase.file, err)
			}
			// Registered right after creation so every exit path cleans up.
			defer func() {
				if err := os.Remove(testcase.file); err != nil {
					t.Errorf("could not delete a file: %s\n  %s\n", testcase.file, err)
				}
			}()

			if _, err := f.WriteString(testcase.text); err != nil {
				f.Close()
				t.Fatalf("could not write a file: %s\n  %s\n", testcase.file, err)
			}
			if err := f.Close(); err != nil {
				t.Fatalf("could not close a file: %s\n  %s\n", testcase.file, err)
			}

			root, err := readXML(testcase.file)
			if err != nil {
				t.Fatal(err)
			}
			if root.Document == nil {
				t.Fatal("decoded XML has no document element")
			}

			result := root.Document.updateToRepresentiveText()
			if diff := deep.Equal(result, testcase.expect); diff != nil {
				t.Error(diff)
			}
		})
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment