Skip to content

Instantly share code, notes, and snippets.

@rgalhama
Last active November 21, 2018 13:06
Show Gist options
  • Save rgalhama/8beb48d21bcbc86982f97ac9c3a28f97 to your computer and use it in GitHub Desktop.
Save rgalhama/8beb48d21bcbc86982f97ac9c3a28f97 to your computer and use it in GitHub Desktop.
Script to parse text with Spacy and print the output in CoNLL-U format.
"""
__author__ = "Raquel G. Alhama"
__email__ = "rgalhama@gmail.com"
Script to parse text with Spacy and print the output in CoNLL-U format.
Refs:
https://spacy.io/
http://universaldependencies.org/format.html
"""
import sys
import argparse
import codecs
from os.path import exists, expanduser
import spacy
def sentences_to_conllu(doc, sent_id, prefix = ""):
""" Prints parsed sentences in CONLL-U format (as used in Universal Dependencies).
The format is specified at http://universaldependencies.org/docs/format.html
"""
for sent in doc.sents:
print("# sent_id = %s"%(prefix+str(sent_id)))
print("# text = %s"%sent.sent)
for i, word in enumerate(sent):
#Find head
if word.dep_.lower().strip() == 'root':
head_idx = 0
else:
head_idx = word.head.i + 1 - sent[0].i
#Find feature tag (if available)
ftidx = word.tag_.find("__") + 2
feature_tag=word.tag_[ftidx:]
linetuple = (
i+1, #ID: Word index.
word, #FORM: Word form or punctuation symbol.
word.lemma_.lower(), #LEMMA: Lemma or stem of word form.
word.pos_, #UPOSTAG: Universal part-of-speech tag drawn
# from revised version of the Google universal
# POS tags.
'_', #XPOSTAG: Language-specific part-of-speech tag; # underscore if not available.
'_' if feature_tag == "" else feature_tag, #FEATS: List of morphological features from the
# universal feature inventory or from a defined
# language-specific extension; underscore if not
# available.
head_idx, #HEAD: Head of the current token, which is
# either a value of ID or zero (0).
word.dep_.lower(), #DEPREL: Universal Stanford dependency relation
# to the HEAD (root iff HEAD = 0) or a defined
# language-specific subtype of one.
'_', #DEPS: List of secondary dependencies.
'_' #MISC: Any other annotation.
)
print("%i\t%s\t%s\t%s\t%s\t%s\t%i\t%s\t%s\t%s"%linetuple)
sent_id+=1
print("\n")
return sent_id
def main(input_file, output_file, prefix = ""):
if output_file:
sys.stdout=open(output_file, "w")
with codecs.open(input_file, "r", encoding='utf-8') as fh:
sent_id = 1
for nl,line in enumerate(fh):
doc = nlp(line.strip())
sent_id = sentences_to_conllu(doc, sent_id, prefix=prefix)
sys.stdout = sys.__stdout__
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--input_file", required=True, type=str, help="Path to file with sentences to parse.")
parser.add_argument("--output_file", default='', type=str, help="Path to output file. If not specified, the output will be printed on standard output.")
parser.add_argument("--model", required=True, type=str, help="Spacy model to use (e.g. 'es_core_news_md').")
args = parser.parse_args()
#check if files exist
if not exists(expanduser(args.input_file)):
raise Exception(args.input_file, " does not exist!")
nlp = spacy.load(args.model)
main(args.input_file, args.output_file)
@rgalhama
Copy link
Author

I've shared it now as a repo at https://github.com/rgalhama/spaCy2CoNLLU

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment