@sskorol
Last active October 26, 2020 18:29
Convert a Navec model into spaCy format for further POS/DEP training
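For orientation: each row the script below writes into the spaCy vectors table is a 300-dimensional Navec embedding concatenated with a binary cipher over pymorphy2's known grammemes. The snippet that follows is a minimal numpy sketch of that layout only; the stand-in embedding, the truncated grammeme list, and the sample tag string are illustrative and not part of the script itself.

import numpy as np

embedding = np.zeros(300, dtype=np.float32)  # stand-in for a real Navec vector
known_tags = ['NOUN', 'VERB', 'ADJF']  # the script uses pymorphy2's full KNOWN_GRAMMEMES set
word_tags = 'NOUN,anim,masc'  # lexeme-level part of a pymorphy2 tag
tags_cipher = [+(tag in word_tags) for tag in known_tags]  # -> [1, 0, 0]
merged = np.append(embedding, tags_cipher)  # shape: (303,) == 300 + len(known_tags)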
import plac
import pymorphy2
import numpy as np
from navec import Navec
from spacy.language import Language
from spacy.vectors import Vectors
from spacy.vocab import Vocab
from pathlib import Path
class RussianLanguage(Language):
    lang = 'ru'


def tags_from(word):
    # pymorphy2 tags look like 'NOUN,anim,masc sing,nomn'; keep only the non-inflectional part before the space
    return str(word.tag).split(' ')[0]
@plac.annotations(
    model=("Navec model archive.", "option", "m", str),
    output_dir=("Model output directory.", "option", "o", Path)
)
def main(model='navec_hudlit_v1_12B_500K_300d_100q.tar', output_dir='./model'):
    morph_analyzer = pymorphy2.MorphAnalyzer()
    navec_model = Navec.load(model)
    print('Loaded model')
    known_tags = list(morph_analyzer.TagClass.KNOWN_GRAMMEMES)
    # each row holds a 300-d Navec embedding followed by a binary cipher over the known grammemes
    vectors_dims = 300 + len(known_tags)
    words = navec_model.vocab.words
    vocabulary = Vocab()
    vocabulary.vectors = Vectors(shape=(len(words), vectors_dims), name='navec_lex')
    added_vectors = 0
    added_lexemes = 0
    for word in words:
        parsed_word = morph_analyzer.parse(word)
        word_tags = tags_from(parsed_word[0])
        # retrieve the unique lexemes of the word, keeping only those whose lexeme-level tags match
        lexemes = set(lexeme.word for lexeme in parsed_word[0].lexeme if word_tags == tags_from(lexeme))
        # check whether at least one lexeme is missing from the vocabulary (missing keys come back as -1)
        rows = vocabulary.vectors.find(keys=lexemes)
        if any(row == -1 for row in rows):
            # collect a Navec vector for each lexeme that has one
            vectors = []
            for lexeme in lexemes:
                vector = navec_model.get(lexeme)
                if vector is not None:
                    vectors.append(vector)
            if len(vectors) > 0:
                # binary cipher: 1 for every known grammeme present in the word's tags
                tags_cipher = [+(tag in word_tags) for tag in known_tags]
                # mean of the unique lexeme vectors, concatenated with the tag cipher
                mean_vector = np.append(np.unique(vectors, axis=0).mean(axis=0), tags_cipher, axis=0)
                # keep only the lexemes whose lookup above returned -1
                unique_lexemes = [lexeme for idx, lexeme in enumerate(lexemes) if rows[idx] == -1]
                # add a new vector keyed by the first lexeme of the paradigm
                vector_row = vocabulary.vectors.add(unique_lexemes.pop(), vector=mean_vector)
                # map the remaining lexemes' hashes to the same vector row
                for lexeme in unique_lexemes:
                    vocabulary.vectors.add(lexeme, row=vector_row)
                # collect stats
                added_lexemes += len(unique_lexemes) + 1
                added_vectors += 1
    print('Vectors:', added_vectors, 'Lexemes:', added_lexemes)
    # shrink the vectors table to the rows that were actually filled
    removed_vectors = vocabulary.vectors.resize(shape=(added_vectors, vectors_dims))
    print('Resized vocabulary to', added_vectors)
    print('Removed vectors:', removed_vectors)
    nlp = RussianLanguage(vocabulary)
    nlp.to_disk(output_dir)
    print('Saved model to disk')


if __name__ == "__main__":
    plac.call(main)
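A minimal sketch of how the converted model might be loaded back for a sanity check before training, assuming the spaCy 2.x API used above. The './model' path mirrors the default output directory; the sample token is purely illustrative.

from spacy.language import Language
from spacy.vocab import Vocab

class RussianLanguage(Language):
    lang = 'ru'

nlp = RussianLanguage(Vocab()).from_disk('./model')
print(nlp.vocab.vectors.shape)  # (rows kept after resize, 300 + number of known grammemes)
print(nlp.vocab.vectors.find(key='дом'))  # row index of the lexeme, or -1 if it was not converted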