Last active
December 11, 2017 15:08
-
-
Save codez266/58a7e4d81240979bfefc2cd5b5045539 to your computer and use it in GitHub Desktop.
Word vector features in revscoring
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

from revscoring.features import wikitext
from revscoring.features.modifiers import max, sub
from revscoring.languages import english
from revscoring import Feature
from revscoring.features import FeatureVector
from revscoring.datasources import Datasource, revision_oriented
from revscoring.dependencies import solve
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
# Process-wide cache for the loaded embedding model; it is filled in lazily
# by load_word2vec() the first time the model is needed.
word2vec = None

# Number of components in each embedding vector.
VECTORS_DIM = 300

# Location of the binary word2vec file.  The WORD2VEC_PATH environment
# variable overrides it; when unset, the original hard-coded path is used.
w2v_path = os.environ.get(
    "WORD2VEC_PATH",
    '/run/media/sumit/linux/POST/GoogleNews-vectors-negative300.bin',
)
def load_word2vec(filepath):
    """Return the word2vec model, loading it from disk on first use.

    The loaded model is stored in the module-level ``word2vec`` variable, so
    only the first call reads the file.  Later calls return the cached model
    and ignore ``filepath``.

    Args:
        filepath: Path to a word2vec file in the binary format.

    Returns:
        The ``KeyedVectors`` instance produced by
        ``KeyedVectors.load_word2vec_format``.
    """
    global word2vec
    if word2vec is None:
        # ``limit=20000`` caps how many vectors are read from the file.
        word2vec = KeyedVectors.load_word2vec_format(
            filepath, binary=True, limit=20000)
    return word2vec
def get_word_vectors(non_stop_tokens):
    """Average the word2vec embeddings of the given tokens.

    Tokens missing from the loaded model are skipped and do not count
    towards the average.

    Args:
        non_stop_tokens: Iterable of tokens to look up in the model returned
            by ``load_word2vec(w2v_path)``.

    Returns:
        A ``numpy.ndarray`` of shape ``(1, VECTORS_DIM)`` holding the mean of
        the embeddings that were found, or all zeros when none of the tokens
        is in the model (this avoids a 0/0 division that would yield NaN).
    """
    model = load_word2vec(w2v_path)
    total = np.zeros((1, VECTORS_DIM))
    words_added = 0
    for tok in non_stop_tokens:
        # Keep the try body to the lookup alone so that only a genuine
        # "token not in the model" miss is treated as skippable.
        try:
            vec = model[tok]
        except KeyError:
            continue
        total += vec
        words_added += 1
    if words_added == 0:
        return total
    return total / words_added
# Datasource that runs get_word_vectors over the English non-stopword tokens
# of the revision.
word_vectors = Datasource(
    "word_vectors",
    get_word_vectors,
    depends_on=[english.stopwords.revision.datasources.non_stopwords])

# Feature vector named "word2vec".  It calls get_word_vectors directly on the
# same non-stopword dependency rather than going through ``word_vectors``.
w2v = FeatureVector(
    "word2vec", get_word_vectors,
    depends_on=[english.stopwords.revision.datasources.non_stopwords],
    returns=np.ndarray)

# NOTE(review): this solves the feature for a fixed sample text every time
# the module is imported and keeps the result in ``z``.  It looks like a
# leftover smoke test -- confirm whether it should run at import time.
z = solve(w2v, cache={revision_oriented.revision.text: 'Hi there.'})

# Features exported under the ``drafttopic`` name.
drafttopic = [w2v]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment