Created
March 7, 2018 17:00
-
-
Save codez266/bde0d2384ef1cda0e105b8f59d25524a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
These meta-datasources operate on :class:`revscoring.Datasource` instances
that return a `list` of items and produce vectors from those items.

.. autoclass:: revscoring.datasources.meta.vectorizers.word2vec
""" | |
import os.path | |
import logging | |
from gensim.models.keyedvectors import KeyedVectors | |
from ..datasource import Datasource | |
# Directories probed, in this order, when load_kv() is given only a file name.
ASSET_SEARCH_DIRS = [
    "word2vec/",
    "~/.word2vec/",
    "/var/share/word2vec/",
]

# Default length of the zero vector returned for words that are missing from
# the loaded vocabulary.
VECTOR_DIMENSIONS = 300

# Configure logging at import time so the DEBUG messages emitted while
# loading vectors are shown.
logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(name)s -- %(message)s',
    level=logging.DEBUG,
)

# Module-wide handle to the loaded key-vectors; assigned by
# word2vec.__init__ and read by word2vec.process.
keyed_vecs = None
class word2vec(Datasource):
    """
    Generates vectors for a list of items generated by another
    datasource.

    :Parameters:
        items_datasource : :class:`revscoring.Datasource`
            A datasource that returns a list of words.
        keyed_vectors : :class:`gensim.models.keyedvectors.KeyedVectors`
            loaded key-vectors. See :func:`~revscoring.datasources.meta.vectorizers.word2vec.load_kv`
        name : `str`
            A name for the `revscoring.FeatureVector`
    """  # noqa

    def __init__(self, items_datasource, keyed_vectors, name=None):
        name = self._format_name(name, [items_datasource, keyed_vectors])
        # NOTE(review): the vectors are stored in the module-level
        # `keyed_vecs` global instead of on the instance, so every word2vec
        # datasource in the process uses the most recently supplied vectors.
        # Presumably this keeps the (large) vectors out of the datasource's
        # own state -- confirm before moving them onto `self`.
        global keyed_vecs
        keyed_vecs = keyed_vectors
        super().__init__(name, self.process, depends_on=[items_datasource])

    def process(self, words):
        """
        Look up the vector of every word in `words`.

        :Parameters:
            words : `list`
                The words produced by the items datasource.

        :Returns:
            A `list` with one entry per word, in order: the stored vector
            when the word is in the loaded vocabulary, otherwise a list of
            zeros.
        """
        vectors = keyed_vecs
        # Size the zero vector from the loaded model when it reports its
        # dimensionality; otherwise fall back to the module default so that
        # plain mapping-like objects keep working.
        dimensions = getattr(vectors, "vector_size", VECTOR_DIMENSIONS)
        return [vectors[word] if word in vectors else [0] * dimensions
                for word in words]

    @staticmethod
    def load_kv(filename=None, path=None, limit=None):
        """
        Load binary word2vec key-vectors from disk.

        :Parameters:
            filename : `str`
                File name to look for in each of `ASSET_SEARCH_DIRS`, in
                order.  Ignored when `path` is given.
            path : `str`
                Location of the vectors file.  Takes precedence over
                `filename`.
            limit : `int`
                Passed through as the `limit` argument of
                `KeyedVectors.load_word2vec_format`.

        :Returns:
            The object returned by `KeyedVectors.load_word2vec_format`.

        :Raises:
            `TypeError` when neither `filename` nor `path` is set.
            `FileNotFoundError` when `filename` is not found in any of the
            search directories.
        """
        logger = logging.getLogger(__name__)

        if path is not None:
            logger.debug("Loading word2vec from %s", path)
            return KeyedVectors.load_word2vec_format(
                path, binary=True, limit=limit)

        if filename is None:
            raise TypeError(
                "load_kv() requires either 'filename' or 'path' to be set.")

        attempted = []
        for dir_path in ASSET_SEARCH_DIRS:
            # Expand "~" so the per-user search directory can actually match.
            candidate = os.path.join(os.path.expanduser(dir_path), filename)
            attempted.append(candidate)
            logger.debug("Loading word2vec from %s", candidate)
            try:
                return KeyedVectors.load_word2vec_format(
                    candidate, binary=True, limit=limit)
            except FileNotFoundError:
                continue

        # Every search directory was tried without success: fail loudly
        # rather than returning None to the caller.
        raise FileNotFoundError(
            "Could not find word2vec file {!r}; tried: {}".format(
                filename, ", ".join(attempted)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment