Created
April 14, 2018 13:58
-
-
Save toVersus/ed36c4f84918948b2b3d72ce0097b694 to your computer and use it in GitHub Desktop.
[Language Processing 100 Essentials] #59: Parse S-expression
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/xml" | |
"flag" | |
"fmt" | |
"os" | |
"strings" | |
) | |
// Root is the value readXML decodes the whole XML file into; it holds the
// <document> child element.
type Root struct { | |
// Document is filled from the <document> child element.
Document *Document `xml:"document"` | |
} | |
// Document mirrors the <document> element: its sentences and its coreference
// entries.
type Document struct { | |
// Sentences collects every <sentence> nested inside <sentences>.
Sentences Sentences `xml:"sentences>sentence"` | |
// Coreferences collects every <coreference> nested inside the outer
// <coreference> element.
Coreferences Coreferences `xml:"coreference>coreference"` | |
} | |
// Sentences is a list of pointers to Sentence.
type Sentences []*Sentence | |
// Coreferences is a list of pointers to Coreference.
type Coreferences []*Coreference | |
// Sentence mirrors one <sentence> element.
type Sentence struct { | |
// ID is read from the "id" attribute.
ID int `xml:"id,attr"` | |
// Dependencies collects the <dependencies> child elements.
Dependencies Dependencies `xml:"dependencies"` | |
// Parse is the text of the <parse> child; main hands it to parse as an
// S-expression.
Parse string `xml:"parse,omitempty"` | |
// Tokens collects every <token> nested inside <tokens>.
Tokens Tokes `xml:"tokens>token,omitempty"` | |
} | |
// Dependencies is a list of pointers to Dependencie.
type Dependencies []*Dependencie | |
// Tokes is a list of pointers to Token.
type Tokes []*Token | |
// Token mirrors one <token> element. ID comes from the "id" attribute; every
// other field is read from the child element named in its tag.
type Token struct { | |
ID string `xml:"id,attr"` | |
Word string `xml:"word,omitempty"` | |
Lemma string `xml:"lemma,omitempty"` | |
CharacterOffsetBegin int `xml:"CharacterOffsetBegin,omitempty"` | |
CharacterOffsetEnd int `xml:"CharacterOffsetEnd,omitempty"` | |
POS string `xml:"POS,omitempty"` | |
NER string `xml:"NER,omitempty"` | |
NormalizedNER string `xml:"NormalizedNER,omitempty"` | |
Speaker string `xml:"Speaker,omitempty"` | |
// Timex is filled from the <Timex> child element.
Timex *Timex `xml:"Timex,omitempty"` | |
} | |
// Timex mirrors a <Timex> element: the "tid" and "type" attributes plus the
// element's character data.
type Timex struct { | |
Tid string `xml:"tid,attr"` | |
Type string `xml:"type,attr"` | |
// Value is the element's text content.
Value string `xml:",chardata"` | |
} | |
// Governor mirrors the <governor> child of a <dep> element: the "copy" and
// "idx" attributes plus the element's character data.
type Governor struct { | |
Copy string `xml:"copy,attr"` | |
Idx int `xml:"idx,attr"` | |
// Value is the element's text content.
Value string `xml:",chardata"` | |
} | |
// Dependent mirrors the <dependent> child of a <dep> element: the "copy" and
// "idx" attributes plus the element's character data.
type Dependent struct { | |
Copy string `xml:"copy,attr"` | |
Idx int `xml:"idx,attr"` | |
// Value is the element's text content.
Value string `xml:",chardata"` | |
} | |
// Dep mirrors one <dep> element: the "extra" and "type" attributes and the
// <dependent> / <governor> child elements.
type Dep struct { | |
Extra string `xml:"extra,attr"` | |
Type string `xml:"type,attr"` | |
Dependent *Dependent `xml:"dependent,omitempty"` | |
Governor *Governor `xml:"governor,omitempty"` | |
} | |
// Dependencie mirrors one <dependencies> element: its "type" attribute and the
// <dep> children it contains.
type Dependencie struct { | |
Type string `xml:"type,attr"` | |
// Deps collects every <dep> child element.
Deps Deps `xml:"dep,omitempty"` | |
} | |
// Deps is a list of pointers to Dep.
type Deps []*Dep | |
// Coreference mirrors one inner <coreference> element and the <mention>
// children it contains.
type Coreference struct { | |
// Mentions collects every <mention> child element.
Mentions Mentions `xml:"mention,omitempty"` | |
} | |
// Mentions is a list of pointers to Mention.
type Mentions []*Mention | |
// Mention mirrors one <mention> element. Representative comes from the
// "representative" attribute; the remaining fields are read from the child
// element named in each tag.
type Mention struct { | |
Representative string `xml:"representative,attr"` | |
Sentence int `xml:"sentence,omitempty"` | |
Start int `xml:"start,omitempty"` | |
End int `xml:"end,omitempty"` | |
Head int `xml:"head,omitempty"` | |
Text string `xml:"text,omitempty"` | |
} | |
// node is one parenthesised expression of a parsed S-expression.
type node struct {
	// parent is the enclosing expression. The root's parent is an empty
	// placeholder node so the root's closing ')' has a node to return to.
	parent *node
	// child holds the nested expressions in the order they appear in the input.
	child []*node
	// pos is the first word of the expression (its label, e.g. "NP").
	pos string
	// value is the second word; it is set only when the label text consists of
	// exactly two words, as in "(DT the)".
	value string
}

// npWords appends to words, in input order, the value of every child that sits
// directly under a node labelled "NP" with no value of its own. Children whose
// value is empty, "," or "." are skipped. The extended slice is returned.
func (n *node) npWords(words []string) []string {
	underNP := n.pos == "NP" && n.value == ""
	for _, c := range n.child {
		if underNP && c.value != "" && c.value != "," && c.value != "." {
			words = append(words, c.value)
		}
		words = c.npWords(words)
	}
	return words
}

// walkNPString prints, one per line, the words found directly under every "NP"
// node of the tree rooted at n, in input order.
func (n *node) walkNPString() {
	for _, w := range n.npWords(nil) {
		fmt.Println(w)
	}
}

// newRootNode returns a node whose parent is an empty placeholder, so closing
// the root expression never walks off the top of the tree.
func newRootNode() *node {
	return &node{
		parent: &node{},
	}
}

// newNode returns an empty node attached below parent. The caller is
// responsible for adding it to parent.child.
func newNode(parent *node) *node {
	return &node{
		parent: parent,
	}
}

// parse builds a tree from the S-expression in str and returns its root.
//
// Surrounding whitespace is ignored. An error is returned when str is empty,
// does not start with '(' or contains more ')' than '('. A missing trailing
// ')' is tolerated.
func parse(str string) (*node, error) {
	str = strings.TrimSpace(str)
	if str == "" {
		return nil, fmt.Errorf("empty S-expression")
	}
	if str[0] != '(' {
		return nil, fmt.Errorf("Initial letter must be '('\n input string: %s", str)
	}
	root := newRootNode()
	if err := root.attach(str[1:]); err != nil {
		return nil, err
	}
	return root, nil
}

// addChild consumes the S-expression fragment str with n as the innermost open
// expression and returns n. Malformed input never panics: processing simply
// stops at a ')' that has no matching open expression.
func (n *node) addChild(str string) *node {
	// The error is dropped on purpose: this method's signature has no error
	// result, and parse reports the same condition to its caller.
	_ = n.attach(str)
	return n
}

// attach is the iterative worker behind parse and addChild. Starting with n as
// the current expression it reads str left to right: '(' opens a child of the
// current expression, ')' returns to its parent, and any other text up to the
// next parenthesis becomes the current expression's label (pos, and value when
// the label has exactly two words). It returns an error when a ')' is met while
// the current node has no parent.
func (n *node) attach(str string) error {
	cur := n
	for {
		str = strings.TrimSpace(str)
		if str == "" {
			return nil
		}
		switch str[0] {
		case '(':
			next := newNode(cur)
			cur.child = append(cur.child, next)
			cur = next
			str = str[1:]
		case ')':
			if cur.parent == nil {
				return fmt.Errorf("unbalanced ')' in S-expression")
			}
			cur = cur.parent
			str = str[1:]
		default:
			end := strings.IndexAny(str, "()")
			if end < 0 {
				// Unterminated input: keep the label instead of dropping it.
				end = len(str)
			}
			// Fields splits on any run of whitespace, so labels laid out with
			// tabs, newlines or repeated spaces are read correctly. The slice
			// is never empty here because str starts with a label character.
			label := strings.Fields(str[:end])
			cur.pos = label[0]
			if len(label) == 2 {
				cur.value = label[1]
			}
			str = str[end:]
		}
	}
}
func main() { | |
var filePath string | |
var sentenceNum int | |
flag.StringVar(&filePath, "file", "", "specify a file path") | |
flag.StringVar(&filePath, "f", "", "specify a file path") | |
flag.IntVar(&sentenceNum, "n", 1, "specify number of sentence") | |
flag.Parse() | |
r, err := readXML(filePath) | |
if err != nil { | |
fmt.Println(err) | |
os.Exit(1) | |
} | |
sexp := r.Document.Sentences[sentenceNum].Parse | |
node, err := parse(sexp) | |
if err != nil { | |
fmt.Print(err) | |
os.Exit(1) | |
} | |
node.walkNPString() | |
} | |
// readXML reads the result of Stanford Core NLP and initiate the Root struct | |
func readXML(path string) (*Root, error) { | |
f, err := os.Open(path) | |
if err != nil { | |
return nil, fmt.Errorf("could not open a file: %s\n %s", path, err) | |
} | |
defer f.Close() | |
r := &Root{} | |
dec := xml.NewDecoder(f) | |
err = dec.Decode(r) | |
if err != nil { | |
return nil, err | |
} | |
return r, nil | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Incomplete work!
Currently this just refactors the following code:
https://github.com/cipepser/goSExpression-sample/tree/master/gose