Created
April 10, 2018 11:24
-
-
Save toVersus/c5ffbab86c43055cc7c573f33698d0f2 to your computer and use it in GitHub Desktop.
[Language Processing 100 Essentials] #56: Co-reference analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/xml" | |
"flag" | |
"fmt" | |
"os" | |
"strings" | |
) | |
type Root struct { | |
Document *Document `xml:"document"` | |
} | |
type Document struct { | |
Sentences []*Sentence `xml:"sentences>sentence"` | |
Coreferences []*Coreference `xml:"coreference>coreference"` | |
} | |
func (doc Document) updateToRepresentiveText() string { | |
var text string | |
for _, coref := range doc.Coreferences { | |
repl := "" | |
for _, mention := range coref.Mention { | |
sendID, start, end := mention.Sentence, mention.Start, mention.End | |
if mention.Representative == "true" { | |
repl = mention.Text | |
continue | |
} | |
doc.Sentences[sendID-1].Tokens[start-1].Word = "[" + repl + "] (" + doc.Sentences[sendID-1].Tokens[start-1].Word | |
doc.Sentences[sendID-1].Tokens[end-2].Word += ")" | |
} | |
} | |
for _, s := range doc.Sentences { | |
text += s.wordString() + "\n" | |
} | |
return text | |
} | |
type Sentence struct { | |
ID int `xml:"id,attr"` | |
Dependencies []*Dependencies `xml:"dependencies"` | |
Parse string `xml:"parse,omitempty"` | |
Tokens []*Token `xml:"tokens>token,omitempty"` | |
} | |
func (sent Sentence) wordString() string { | |
var words []string | |
for _, token := range sent.Tokens { | |
words = append(words, token.Word) | |
} | |
return strings.Join(words, " ") | |
} | |
type Token struct { | |
ID string `xml:"id,attr"` | |
Word string `xml:"word,omitempty"` | |
Lemma string `xml:"lemma,omitempty"` | |
CharacterOffsetBegin int `xml:"CharacterOffsetBegin,omitempty"` | |
CharacterOffsetEnd int `xml:"CharacterOffsetEnd,omitempty"` | |
POS string `xml:"POS,omitempty"` | |
NER string `xml:"NER,omitempty"` | |
NormalizedNER string `xml:"NormalizedNER,omitempty"` | |
Speaker string `xml:"Speaker,omitempty"` | |
Timex *Timex `xml:"Timex,omitempty"` | |
} | |
func (token *Token) getWordTaggedByPos() string { | |
return fmt.Sprintf("%s\t%s\t%s", token.Word, token.Lemma, token.POS) | |
} | |
func (token *Token) getPersonName() string { | |
if token.NER == "PERSON" { | |
return token.Word | |
} | |
return "" | |
} | |
// Timex is the <Timex> child of a token: the tid and type attributes plus the
// element's character data.
type Timex struct {
	Tid   string `xml:"tid,attr"`
	Type  string `xml:"type,attr"`
	Value string `xml:",chardata"`
}
// Governor is the <governor> child of a dependency: the copy and idx
// attributes plus the element's character data.
type Governor struct {
	Copy  string `xml:"copy,attr"`
	Idx   int    `xml:"idx,attr"`
	Value string `xml:",chardata"`
}
// Dependent is the <dependent> child of a dependency: the copy and idx
// attributes plus the element's character data.
type Dependent struct {
	Copy  string `xml:"copy,attr"`
	Idx   int    `xml:"idx,attr"`
	Value string `xml:",chardata"`
}
type Dep struct { | |
Extra string `xml:"extra,attr"` | |
Type string `xml:"type,attr"` | |
Dependent *Dependent `xml:"dependent,omitempty"` | |
Governor *Governor `xml:"governor,omitempty"` | |
} | |
type Dependencies struct { | |
Type string `xml:"type,attr"` | |
Dep []*Dep `xml:"dep,omitempty"` | |
} | |
type Coreference struct { | |
Mention []*Mention `xml:"mention,omitempty"` | |
} | |
// Mention is one <mention> element of a coreference chain. Representative is
// the raw value of the representative attribute (empty when the attribute is
// absent); the remaining fields are decoded from the child elements named in
// their tags.
type Mention struct {
	Representative string `xml:"representative,attr"`
	Sentence       int    `xml:"sentence,omitempty"`
	Start          int    `xml:"start,omitempty"`
	End            int    `xml:"end,omitempty"`
	Head           int    `xml:"head,omitempty"`
	Text           string `xml:"text,omitempty"`
}
func main() { | |
var filePath, destFilePath string | |
flag.StringVar(&filePath, "file", "", "specify a file path") | |
flag.StringVar(&filePath, "f", "", "specify a file path") | |
flag.StringVar(&destFilePath, "dest", "", "specify a dest file path") | |
flag.StringVar(&destFilePath, "d", "", "specify a dest file path") | |
flag.Parse() | |
r, err := readXML(filePath) | |
if err != nil { | |
fmt.Println(err) | |
os.Exit(1) | |
} | |
text := r.Document.updateToRepresentiveText() | |
fmt.Print(text) | |
} | |
// readXML reads the result of Stanford Core NLP and initiate the Root struct | |
func readXML(path string) (*Root, error) { | |
f, err := os.Open(path) | |
if err != nil { | |
return nil, fmt.Errorf("could not open a file: %s\n %s", path, err) | |
} | |
defer f.Close() | |
r := &Root{} | |
dec := xml.NewDecoder(f) | |
err = dec.Decode(r) | |
if err != nil { | |
return nil, err | |
} | |
return r, nil | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"os" | |
"testing" | |
"github.com/go-test/deep" | |
) | |
// tokenizeTests is the fixture table for TestUpdateToRepresentiveText. Each
// case carries a display name, the file name the XML is written to, the XML
// document itself, and the text expected back from updateToRepresentiveText.
var tokenizeTests = []struct {
	name   string
	file   string
	text   string
	expect string
}{
	{
		name: "should slice the simple sentence into words",
		file: "simple-test.xml",
		text: `<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet href="CoreNLP-to-HTML.xsl" type="text/xsl"?>
<root>
<document>
<docId>test.txt</docId>
<sentences>
<sentence id="1">
<tokens>
<token id="1">
<word>I</word>
<lemma>I</lemma>
<CharacterOffsetBegin>0</CharacterOffsetBegin>
<CharacterOffsetEnd>1</CharacterOffsetEnd>
<POS>PRP</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="2">
<word>am</word>
<lemma>be</lemma>
<CharacterOffsetBegin>2</CharacterOffsetBegin>
<CharacterOffsetEnd>4</CharacterOffsetEnd>
<POS>VBP</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="3">
<word>Bob</word>
<lemma>Bob</lemma>
<CharacterOffsetBegin>5</CharacterOffsetBegin>
<CharacterOffsetEnd>8</CharacterOffsetEnd>
<POS>NNP</POS>
<NER>PERSON</NER>
<Speaker>PER0</Speaker>
</token>
<token id="4">
<word>.</word>
<lemma>.</lemma>
<CharacterOffsetBegin>8</CharacterOffsetBegin>
<CharacterOffsetEnd>9</CharacterOffsetEnd>
<POS>.</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
</tokens>
<parse>(ROOT (S (NP (PRP I)) (VP (VBP am) (NP (NNP Bob))) (. .))) </parse>
<dependencies type="basic-dependencies">
<dep type="root">
<governor idx="0">ROOT</governor>
<dependent idx="3">Bob</dependent>
</dep>
<dep type="nsubj">
<governor idx="3">Bob</governor>
<dependent idx="1">I</dependent>
</dep>
<dep type="cop">
<governor idx="3">Bob</governor>
<dependent idx="2">am</dependent>
</dep>
<dep type="punct">
<governor idx="3">Bob</governor>
<dependent idx="4">.</dependent>
</dep>
</dependencies>
<dependencies type="collapsed-dependencies">
<dep type="root">
<governor idx="0">ROOT</governor>
<dependent idx="3">Bob</dependent>
</dep>
<dep type="nsubj">
<governor idx="3">Bob</governor>
<dependent idx="1">I</dependent>
</dep>
<dep type="cop">
<governor idx="3">Bob</governor>
<dependent idx="2">am</dependent>
</dep>
<dep type="punct">
<governor idx="3">Bob</governor>
<dependent idx="4">.</dependent>
</dep>
</dependencies>
<dependencies type="collapsed-ccprocessed-dependencies">
<dep type="root">
<governor idx="0">ROOT</governor>
<dependent idx="3">Bob</dependent>
</dep>
<dep type="nsubj">
<governor idx="3">Bob</governor>
<dependent idx="1">I</dependent>
</dep>
<dep type="cop">
<governor idx="3">Bob</governor>
<dependent idx="2">am</dependent>
</dep>
<dep type="punct">
<governor idx="3">Bob</governor>
<dependent idx="4">.</dependent>
</dep>
</dependencies>
<dependencies type="enhanced-dependencies">
<dep type="root">
<governor idx="0">ROOT</governor>
<dependent idx="3">Bob</dependent>
</dep>
<dep type="nsubj">
<governor idx="3">Bob</governor>
<dependent idx="1">I</dependent>
</dep>
<dep type="cop">
<governor idx="3">Bob</governor>
<dependent idx="2">am</dependent>
</dep>
<dep type="punct">
<governor idx="3">Bob</governor>
<dependent idx="4">.</dependent>
</dep>
</dependencies>
<dependencies type="enhanced-plus-plus-dependencies">
<dep type="root">
<governor idx="0">ROOT</governor>
<dependent idx="3">Bob</dependent>
</dep>
<dep type="nsubj">
<governor idx="3">Bob</governor>
<dependent idx="1">I</dependent>
</dep>
<dep type="cop">
<governor idx="3">Bob</governor>
<dependent idx="2">am</dependent>
</dep>
<dep type="punct">
<governor idx="3">Bob</governor>
<dependent idx="4">.</dependent>
</dep>
</dependencies>
</sentence>
</sentences>
<coreference>
<coreference>
<mention representative="true">
<sentence>1</sentence>
<start>3</start>
<end>4</end>
<head>3</head>
<text>Bob</text>
</mention>
<mention>
<sentence>1</sentence>
<start>1</start>
<end>2</end>
<head>1</head>
<text>I</text>
</mention>
</coreference>
</coreference>
</document>
</root>`,
		expect: "[Bob] (I) am Bob .\n",
	},
}
func TestUpdateToRepresentiveText(t *testing.T) { | |
for _, testcase := range tokenizeTests { | |
t.Log(testcase.name) | |
f, err := os.Create(testcase.file) | |
if err != nil { | |
t.Errorf("could not create a file: %s\n %s\n", testcase.file, err) | |
} | |
f.WriteString(testcase.text) | |
f.Close() | |
root, err := readXML(testcase.file) | |
if err != nil { | |
t.Error(err) | |
} | |
result := root.Document.updateToRepresentiveText() | |
if diff := deep.Equal(result, testcase.expect); diff != nil { | |
t.Error(diff) | |
} | |
if err := os.Remove(testcase.file); err != nil { | |
t.Errorf("could not delete a file: %s\n %s\n", testcase.file, err) | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment