Skip to content

Instantly share code, notes, and snippets.

Created January 10, 2018 23:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/4b0ee04cef7f703beb2f539d5577299f to your computer and use it in GitHub Desktop.
Save anonymous/4b0ee04cef7f703beb2f539d5577299f to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding: utf8
"""Load vectors for a language trained using fastText
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals
import plac
import numpy
import spacy
from spacy.language import Language
# Script settings: the language code handed to spacy.blank() and the path of
# the fastText vectors file to load.
lang = "nb"
vectors_loc = "wiki.no.vec"

if lang is None:
    nlp = Language()
else:
    # create empty language class – this is required if you're planning to
    # save the model to disk and load it back later (models always need a
    # "lang" setting). Use 'xx' for blank multi-language class.
    nlp = spacy.blank(lang)

# The file is read as bytes and every row is decoded explicitly as UTF-8, so
# the result does not depend on the platform's default text encoding.
with open(vectors_loc, 'rb') as file_:
    # The first line is a header holding two whitespace-separated fields:
    # the row count and the vector width.
    header = file_.readline()
    nr_row, nr_dim = header.split()
    # Parse the width once; it is needed for every row in the loop below.
    width = int(nr_dim)
    nlp.vocab.reset_vectors(width=width)
    for line in file_:
        line = line.rstrip().decode('utf8')
        if not line:
            # A blank row carries neither a word nor any values; skip it
            # instead of registering an empty word with an empty vector.
            continue
        # Split from the right, at most `width` times, so that everything
        # left of the last `width` fields stays together as the word even
        # if the word itself contains spaces.
        pieces = line.rsplit(' ', width)
        word = pieces[0]
        vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
        nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab

# Persist the pipeline, including the freshly loaded vectors, to disk.
nlp.to_disk('norsk.model')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment