Skip to content

Instantly share code, notes, and snippets.

@buriy
Last active September 4, 2020 10:49
Show Gist options
  • Save buriy/35b9658ddb3b30206144e2eb8f28d3cb to your computer and use it in GitHub Desktop.
Save buriy/35b9658ddb3b30206144e2eb8f28d3cb to your computer and use it in GitHub Desktop.
#by Anton Lozhkov
#pip install conllu tqdm
from conllu import parse_incr
from tqdm.auto import tqdm
import unicodedata
def count_roots(tokenlist):
    """Count the tokens whose dependency relation is ``root``.

    Args:
        tokenlist: Iterable of token mappings, each providing a ``'deprel'`` key.

    Returns:
        The number of tokens for which ``token['deprel'] == 'root'``.
    """
    return sum(1 for tok in tokenlist if tok['deprel'] == 'root')
def count_reachable_nodes(tree_root):
    """Count the distinct token ids reachable from *tree_root*, root included.

    Args:
        tree_root: A tree node exposing ``token`` (a mapping with an ``'id'``
            key) and ``children`` (a list of nodes of the same shape).

    Returns:
        The number of distinct ``token['id']`` values found while walking the
        tree depth-first from *tree_root*.
    """
    # Work on a copy: popping from ``tree_root.children`` itself would strip
    # the children off the caller's tree as a side effect.
    pending = list(tree_root.children)
    seen_ids = {tree_root.token['id']}
    while pending:
        node = pending.pop()
        seen_ids.add(node.token['id'])
        pending.extend(node.children)
    return len(seen_ids)
def fix_tok_labels(token):
    """Reconcile a token's UPOS tag with its dependency relation, in place.

    The dependency relation decides which UPOS tags are acceptable; when the
    current tag is not acceptable it is overwritten. Finally, a token that was
    tagged ``PUNCT`` on entry gets the ``punct`` relation unless it is already
    ``punct`` or ``root``.

    Args:
        token: Mutable mapping with ``'deprel'`` and ``'upos'`` keys. It is
            modified in place.

    Returns:
        None.
    """
    deprel = token['deprel']
    # Snapshot of the tag on entry: every test below looks at this original
    # value, not at whatever an earlier rule has just written.
    upos = token['upos']

    # A token has exactly one deprel, so at most one of these branches fires.
    if deprel == 'det':
        token['upos'] = 'DET'
    elif deprel == 'nummod':
        # Nummod is for "number phrases" only.
        if upos not in ('NUM', 'NOUN', 'SYM'):
            token['upos'] = 'NUM'
    elif deprel == 'advmod':
        # Advmod is for adverbs, perhaps particles but not for prepositional
        # phrases or clauses.
        if upos not in ('ADV', 'ADJ', 'CCONJ', 'DET', 'PART', 'SYM'):
            token['upos'] = 'ADV'
    elif deprel == 'expl':
        # Known expletives are pronouns. Determiners and particles are
        # probably acceptable, too.
        if upos not in ('PRON', 'DET', 'PART'):
            token['upos'] = 'PRON'
    elif deprel == 'aux':
        # Auxiliary verb/particle must be AUX.
        token['upos'] = 'AUX'
    elif deprel == 'cop':
        # Copula is an auxiliary verb/particle (AUX) or a pronoun (PRON|DET).
        if upos not in ('AUX', 'PRON', 'DET', 'SYM'):
            token['upos'] = 'PRON'
    elif deprel == 'case':
        # Case is normally an adposition, maybe particle. However, there are
        # also secondary adpositions and they may keep their original POS tag,
        # so only the tags listed here are rewritten.
        if upos in ('PROPN', 'ADJ', 'PRON', 'DET', 'NUM', 'AUX'):
            token['upos'] = 'ADP'
    elif deprel == 'mark':
        # Mark is normally a conjunction or adposition, maybe particle but
        # definitely not a pronoun.
        if upos in ('NOUN', 'PROPN', 'ADJ', 'PRON', 'DET', 'NUM', 'VERB', 'AUX', 'INTJ'):
            token['upos'] = 'SCONJ'
    elif deprel == 'cc':
        # Cc is a conjunction, possibly an adverb or particle.
        if upos in ('NOUN', 'PROPN', 'ADJ', 'PRON', 'DET', 'NUM', 'VERB', 'AUX', 'INTJ'):
            token['upos'] = 'CCONJ'
    elif deprel == 'punct':
        token['upos'] = 'PUNCT'

    # NOTE(review): this test uses the entry snapshot of upos, so a token that
    # arrived as PUNCT with e.g. deprel 'aux' leaves with upos 'AUX' and
    # deprel 'punct'. Kept as-is; confirm which label should win.
    if upos == 'PUNCT' and deprel not in ('punct', 'root'):
        token['deprel'] = 'punct'
def fix_sent_labels(sentence):
    """Validate a parsed sentence, then clean its tokens in place.

    A sentence is rejected when it is empty, when it does not have exactly one
    ``root`` relation, or when the number of nodes reachable from the tree root
    differs from the number of tokens. For an accepted sentence the ``text``
    metadata and every token form are NFC-normalised, token labels are
    repaired with ``fix_tok_labels``, and ``SpaceAfter=No`` is recorded in
    ``misc`` for each token that is not followed by whitespace in the text.

    Args:
        sentence: A token list exposing ``metadata`` (a mapping), ``to_tree()``
            and iteration over token mappings with ``'form'``, ``'deprel'``,
            ``'upos'`` and ``'misc'`` entries.

    Returns:
        The same *sentence* object after in-place modification, or ``None``
        when the sentence is rejected.
    """
    if len(sentence) < 1:
        return None
    if count_roots(sentence) != 1:
        # Zero roots or multiple roots.
        return None
    tree = sentence.to_tree()
    if count_reachable_nodes(tree) != len(sentence):
        # Unreachable nodes or cycles.
        return None

    raw_text = sentence.metadata.get('text')
    if raw_text is not None:
        # Combine unicode symbols with accents such as 'Й' into one code point.
        raw_text = unicodedata.normalize('NFC', raw_text)
        sentence.metadata['text'] = raw_text
    # The trailing newline acts as a sentinel so that the character after the
    # last token exists and counts as whitespace. Without a 'text' entry no
    # form can be located and SpaceAfter inference is skipped entirely.
    stext = (raw_text or '') + '\n'

    next_start = 0
    for token in sentence:
        form = unicodedata.normalize('NFC', token['form'])
        token['form'] = form
        # Fix syntax labels.
        fix_tok_labels(token)

        # Infer SpaceAfter from the position of the form in the raw text.
        start = stext.find(form, next_start)
        if start == -1:
            # Form does not occur in the remaining text: leave the cursor
            # where it is rather than indexing with -1.
            continue
        end = start + len(form)
        if end < len(stext) and not stext[end].isspace():
            # A missing or empty MISC column has no dict to write into yet.
            if token.get('misc') is None:
                token['misc'] = {}
            token['misc']['SpaceAfter'] = 'No'
        next_start = end
    return sentence
def main():
    """Clean ``nerus_lenta.conllu`` and write the result to ``nerus_clean.conllu``.

    Sentences are streamed from the input file one at a time, passed through
    ``fix_sent_labels`` and, unless rejected (``None``), serialized to the
    output file. Both files are opened as UTF-8.
    """
    # ``with`` guarantees both files are closed even if parsing or cleaning
    # raises part-way through.
    with open("nerus_lenta.conllu", "r", encoding="utf-8") as input_file, \
            open("nerus_clean.conllu", "w", encoding="utf-8") as output_file:
        for sentence in tqdm(parse_incr(input_file)):
            sentence = fix_sent_labels(sentence)
            if sentence is None:
                continue
            # serialize() yields one string; write() emits it in a single call,
            # whereas writelines() would iterate it character by character.
            output_file.write(sentence.serialize())
# Run the clean-up only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment