Skip to content

Instantly share code, notes, and snippets.

@codez266
Created March 7, 2018 17:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save codez266/bde0d2384ef1cda0e105b8f59d25524a to your computer and use it in GitHub Desktop.
Save codez266/bde0d2384ef1cda0e105b8f59d25524a to your computer and use it in GitHub Desktop.
"""
These meta-datasources operate on :class:`revscoring.Datasource` instances
that return a `list` of items and produce vectors from those items.

.. autoclass:: revscoring.datasources.meta.vectorizers.word2vec
"""
import os.path
import logging
from gensim.models.keyedvectors import KeyedVectors
from ..datasource import Datasource
# Directories probed, in this order, by word2vec.load_kv() when it is given a
# bare filename instead of a full path.
ASSET_SEARCH_DIRS = [
    "word2vec/",
    "~/.word2vec/",
    "/var/share/word2vec/",
]

# Default length of the all-zero vector emitted for words that are missing
# from the loaded vocabulary.
VECTOR_DIMENSIONS = 300

# Configures the root logger at import time: DEBUG level, timestamped records.
logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(name)s -- %(message)s',
    level=logging.DEBUG,
)

# Module-wide slot for the loaded vectors; assigned by word2vec.__init__ and
# read by word2vec.process.
keyed_vecs = None
class word2vec(Datasource):
    """
    Generates vectors for a list of items generated by another
    datasource.

    :Parameters:
        items_datasource : :class:`revscoring.Datasource`
            A datasource that returns a list of words.
        keyed_vectors : :class:`gensim.models.keyedvectors.KeyedVectors`
            loaded key-vectors. See :func:`~revscoring.datasources.meta.vectorizers.word2vec.load_kv`
        name : `str`
            A name for the `revscoring.FeatureVector`
    """  # noqa

    def __init__(self, items_datasource, keyed_vectors, name=None):
        name = self._format_name(name, [items_datasource, keyed_vectors])
        # The vectors are handed to `process` through the module-level
        # `keyed_vecs` global instead of an instance attribute.  Consequence:
        # every `word2vec` instance in the process uses whichever vectors
        # were passed to the most recently constructed one.
        # NOTE(review): presumably done to keep the (large) vectors out of
        # the datasource's own state, e.g. when it is serialized -- confirm
        # before moving this onto `self`.
        global keyed_vecs
        keyed_vecs = keyed_vectors
        super().__init__(name, self.process, depends_on=[items_datasource])

    def process(self, words):
        """
        Looks up the vector of every word in `words`.

        :Parameters:
            words : iterable of `str`
                The words to vectorize.

        :Returns:
            A `list` with one entry per word, in input order: the stored
            vector when the word is in the vocabulary, otherwise a `list` of
            zeros.
        """
        # Size the zero vector after the loaded vectors when they advertise
        # their dimensionality, so known and unknown words get vectors of the
        # same length; fall back to the module default otherwise.
        dimensions = getattr(keyed_vecs, "vector_size", VECTOR_DIMENSIONS)
        return [keyed_vecs[word] if word in keyed_vecs
                else [0] * dimensions
                for word in words]

    @staticmethod
    def load_kv(filename=None, path=None, limit=None):
        """
        Loads binary word2vec-format vectors from disk.

        :Parameters:
            filename : `str`
                Name of a vectors file to look for in each directory of
                `ASSET_SEARCH_DIRS`, in order.  Ignored when `path` is given.
            path : `str`
                Exact location of the vectors file.  Takes precedence over
                `filename`.
            limit : `int`
                Passed through unchanged as the `limit` argument of
                `KeyedVectors.load_word2vec_format`.

        :Returns:
            The object returned by `KeyedVectors.load_word2vec_format`.

        :Raises:
            `TypeError` when neither `filename` nor `path` is given;
            `FileNotFoundError` when `filename` is not present in any of the
            search directories.
        """
        logger = logging.getLogger(__name__)

        if path is not None:
            logger.debug("Loading word2vec from %s", path)
            return KeyedVectors.load_word2vec_format(
                path, binary=True, limit=limit)

        if filename is None:
            raise TypeError(
                "load_kv() requires either 'filename' or 'path' to be set.")

        tried = []
        for dir_path in ASSET_SEARCH_DIRS:
            # expanduser() so that entries such as "~/.word2vec/" resolve to
            # the user's home directory instead of a directory named "~".
            candidate = os.path.join(os.path.expanduser(dir_path), filename)
            tried.append(candidate)
            logger.debug("Loading word2vec from %s", candidate)
            try:
                return KeyedVectors.load_word2vec_format(
                    candidate, binary=True, limit=limit)
            except FileNotFoundError:
                continue

        # Fail loudly rather than implicitly returning None, which would only
        # surface later as an obscure error inside `process`.
        raise FileNotFoundError(
            "Could not find word2vec file {!r}; tried: {}".format(
                filename, ", ".join(tried)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment