Skip to content

Instantly share code, notes, and snippets.

@eiriks
Created January 7, 2020 11:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save eiriks/47d54d33f624ba0d09954dfd234e2d83 to your computer and use it in GitHub Desktop.
Save eiriks/47d54d33f624ba0d09954dfd234e2d83 to your computer and use it in GitHub Desktop.
import spacy
import numpy
# Load the Norwegian Bokmål model - this one: https://spacy.io/models/nb
nlp = spacy.load('nb_core_news_sm')

# Copy pre-trained word2vec vectors (text format) into the pipeline's vocab.
# NOTE(review): w2v_path is not defined in this snippet; it must point to the
# directory that contains w2vtrained.txt before this runs.
# The file is opened in binary mode and each line is decoded as UTF-8 below.
with open(w2v_path+'/w2vtrained.txt', 'rb') as file_:
    # First line is the header: "<number of words> <vector dimensions>".
    header = file_.readline()
    nr_row, nr_dim = header.split()
    print(nr_row) #b'658624' <- n words
    print(nr_dim) #b'300'<- n dim of vector

    # Convert the header fields once instead of on every line of the file.
    n_rows, n_dims = int(nr_row), int(nr_dim)

    # trick: https://github.com/explosion/spaCy/issues/2914
    # (alternative noted by the author: nlp.vocab.reset_vectors(width=n_dims))
    nlp.vocab.vectors.resize((n_rows, n_dims))

    for line in file_:
        line = line.rstrip().decode('utf8')
        if not line:
            # Skip blank lines: they would yield an empty word and an
            # empty vector.
            continue
        # Split from the right so that at most n_dims trailing fields are
        # treated as vector components; whatever is left is the word itself,
        # even if it contains spaces.
        pieces = line.rsplit(' ', n_dims)
        word = pieces[0]
        vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
        nlp.vocab.set_vector(word, vector)

# Save the pipeline, now including the vectors, to a local directory.
nlp.to_disk("nb_core_news_sm_w_vec")
# Reload the pipeline that was saved to disk together with its word vectors.
import spacy

nlp = spacy.load('./nb_core_news_sm_w_vec')

# Run both sentences through the pipeline (in order) and print the
# similarity score between the two resulting documents.
doc1, doc2 = (nlp(text) for text in ("Jeg hater måker", "Jeg hater båter"))
print(doc1.similarity(doc2))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment