Created
January 7, 2020 11:48
-
-
Save eiriks/47d54d33f624ba0d09954dfd234e2d83 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy
import numpy

# Extend spaCy's small Norwegian pipeline with word vectors read from a
# word2vec text file, then save the combined pipeline to disk.

# Load the Norwegian ("nb") model: https://spacy.io/models/nb
nlp = spacy.load('nb_core_news_sm')

# NOTE(review): `w2v_path` is not defined in the visible code; it has to be
# set to the directory containing `w2vtrained.txt` before this runs.
with open(w2v_path + '/w2vtrained.txt', 'rb') as file_:
    # The first line is a header with two fields: row count and vector width.
    header = file_.readline()
    nr_row, nr_dim = header.split()
    print(nr_row)  # b'658624' <- n words
    print(nr_dim)  # b'300' <- n dim of vector

    # Convert the header fields once instead of on every line of the file.
    n_rows, n_dims = int(nr_row), int(nr_dim)

    # Resize the vectors table before filling it; see
    # https://github.com/explosion/spaCy/issues/2914 for the background.
    nlp.vocab.vectors.resize((n_rows, n_dims))

    for raw_line in file_:
        line = raw_line.rstrip().decode('utf8')
        if not line:
            # A blank line would yield an empty word and an empty vector.
            continue
        # Split from the right so only the last `n_dims` fields are treated
        # as numbers; whatever remains on the left is the word itself.
        pieces = line.rsplit(' ', n_dims)
        word = pieces[0]
        vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
        nlp.vocab.set_vector(word, vector)

# Save the pipeline, now including the vectors.
nlp.to_disk("nb_core_news_sm_w_vec")
# Load the pipeline that was saved together with the word vectors.
import spacy

nlp = spacy.load('./nb_core_news_sm_w_vec')

# With the pipeline loaded, two documents can be compared for similarity.
doc1 = nlp("Jeg hater måker")
doc2 = nlp("Jeg hater båter")
print(doc1.similarity(doc2))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment