Last active
May 8, 2019 10:46
-
-
Save appachan/f1f9ab1a5661d996241dd2b2e33298cb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse

import spacy
import stanfordnlp
from predpatt import PredPatt, PredPattOpts
from predpatt.UDParse import UDParse, DepTriple
from predpatt.util.ud import dep_v2, dep_v1
# Module-level parser handles. Both start out unset; the ``__main__`` block
# assigns a loaded pipeline to each one unless the matching ``--without-*``
# flag is given, and ``main`` reads them as globals.
STANFORDNLP = None
GINZA = None
def _report_extraction(tokens, tags, triples, options):
    """Build a PredPatt extraction for one sentence and print its contents.

    Prints, in order: the pretty-printed extraction, the tokens, the tags and
    the dependency triples. If at least one predicate instance was found, the
    first instance's arguments, subject and object are printed as well.
    """
    parse = UDParse(tokens=tokens, tags=tags, triples=triples)
    pp = PredPatt(parse, opts=options)
    print(pp.pprint())
    print(tokens)
    print(tags)
    print(triples)
    if not pp.instances:
        return
    first = pp.instances[0]
    print(first.arguments)
    print(first.subj())
    print(first.obj())


def _stanfordnlp_triples(words):
    """Convert stanfordnlp words into zero-based ``DepTriple`` objects."""
    # The parser's indexing starts at one, but we want indexing to start at
    # zero. Hence the -1 on both the governor and the dependent.
    return [
        DepTriple(
            rel=word.dependency_relation,
            gov=word.governor - 1,
            dep=int(word.index) - 1,
        )
        for word in words
    ]


def _ginza_triples(sent):
    """Convert the tokens of one spaCy sentence into ``DepTriple`` objects."""
    triples = []
    for token in sent:
        if token.dep_ in ("root", "ROOT"):
            # The root has no governor inside the sentence; mark it with -1.
            gov = -1
        else:
            # Token indices are re-based so they are relative to the
            # sentence start rather than to the whole document.
            gov = token.head.i - sent.start
        triples.append(
            DepTriple(rel=token.dep_, gov=gov, dep=token.i - sent.start)
        )
    return triples


def main(text, without_stanfordnlp, without_ginza):
    """Parse ``text`` with the enabled parsers and print PredPatt results.

    Args:
        text: The sentence(s) to analyse.
        without_stanfordnlp: When true, skip the stanfordnlp pass.
        without_ginza: When true, skip the GiNZA pass.

    The parsers are read from the module globals ``STANFORDNLP`` and
    ``GINZA``; each must already hold a callable pipeline when its pass is
    enabled.
    """
    # UD v2.0 relations are used; dep_v1.VERSION is the v1.0 alternative.
    options = PredPattOpts(ud=dep_v2.VERSION)
    sentence = text

    # Stanfordnlp
    if not without_stanfordnlp:
        doc = STANFORDNLP(sentence)
        for sent in doc.sentences:
            words = sent.words
            tokens = [word.text for word in words]
            tags = [word.upos for word in words]
            triples = _stanfordnlp_triples(words)
            _report_extraction(tokens, tags, triples, options)

    # GiNZA
    if not without_ginza:
        doc = GINZA(sentence)
        for sent in doc.sents:
            tokens = [token.orth_ for token in sent]
            tags = [token.pos_ for token in sent]
            triples = _ginza_triples(sent)
            _report_extraction(tokens, tags, triples, options)
if __name__ == "__main__":
    # Command-line entry point: load the requested parsers once, then read
    # sentences from standard input and analyse each one until input ends.
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--without-stanfordnlp', action='store_true')
    argparser.add_argument('--without-ginza', action='store_true')
    args = argparser.parse_args()

    # A disabled parser is left as None so its model is never loaded.
    STANFORDNLP = stanfordnlp.Pipeline(lang='ja') if not args.without_stanfordnlp else None
    GINZA = spacy.load('ja_ginza_nopn') if not args.without_ginza else None

    while True:
        print("\n>> input text:")
        try:
            text = input().strip()
        except (EOFError, KeyboardInterrupt):
            # End of input (Ctrl-D / closed pipe) or Ctrl-C at the prompt:
            # leave the loop instead of dying with a traceback.
            break
        main(text, args.without_stanfordnlp, args.without_ginza)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment