Skip to content

Instantly share code, notes, and snippets.

@codez266
Last active December 11, 2017 15:08
Show Gist options
  • Save codez266/58a7e4d81240979bfefc2cd5b5045539 to your computer and use it in GitHub Desktop.
Save codez266/58a7e4d81240979bfefc2cd5b5045539 to your computer and use it in GitHub Desktop.
Word vector features in revscoring
from revscoring.features import wikitext
from revscoring.features.modifiers import max, sub
from revscoring.languages import english
from revscoring import Feature
from revscoring.features import FeatureVector
from revscoring.datasources import Datasource, revision_oriented
from revscoring.dependencies import solve
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
# Module-level cache for the loaded word2vec model; it stays None until
# load_word2vec() fills it in on first use.
word2vec = None
# Width of the accumulator vector built by get_word_vectors().
# NOTE(review): presumably must equal the dimensionality of the model stored
# at w2v_path (the file name suggests 300) -- confirm.
VECTORS_DIM = 300
# Location of the binary word2vec file that get_word_vectors() hands to
# load_word2vec().
# NOTE(review): hard-coded absolute path; looks specific to one machine.
w2v_path = '/run/media/sumit/linux/POST/GoogleNews-vectors-negative300.bin'
def load_word2vec(filepath):
    """Return the shared word2vec model, loading it on the first call.

    The model is stored in the module-level ``word2vec`` variable. Once that
    variable is set, it is returned as-is and ``filepath`` is not consulted.

    Args:
        filepath: path of a binary word2vec file, used only when nothing has
            been loaded yet.

    Returns:
        The object produced by ``KeyedVectors.load_word2vec_format`` (read
        with ``binary=True`` and ``limit=20000``).
    """
    global word2vec
    if word2vec is None:
        # First use: read the vectors from disk and remember them.
        word2vec = KeyedVectors.load_word2vec_format(
            filepath, binary=True, limit=20000)
    return word2vec
def get_word_vectors(non_stop_tokens):
    """Return the mean word2vec embedding of the given tokens.

    Each token is looked up in the model returned by
    ``load_word2vec(w2v_path)``; tokens the model does not know are skipped.

    Args:
        non_stop_tokens: iterable of tokens to look up.

    Returns:
        A NumPy array of shape ``(1, VECTORS_DIM)`` holding the average of
        the vectors that were found, or all zeros when no token was found
        (including when ``non_stop_tokens`` is empty).
    """
    model = load_word2vec(w2v_path)
    total = np.zeros((1, VECTORS_DIM))
    words_added = 0
    for tok in non_stop_tokens:
        try:
            vec = model[tok]
        except KeyError:
            # Out-of-vocabulary token: it contributes nothing to the mean.
            continue
        total += vec
        words_added += 1
    if words_added == 0:
        # Nothing matched; dividing by zero would yield a vector of NaNs,
        # so hand back the untouched zero vector instead.
        return total
    return total / words_added
# Datasource named "word_vectors" whose value is computed by
# get_word_vectors() from the English non-stopword tokens of the revision.
# NOTE(review): nothing in the visible code refers to this datasource --
# confirm whether it is used elsewhere.
word_vectors = Datasource("word_vectors",
get_word_vectors,
depends_on=[english.stopwords.revision.datasources.non_stopwords])
# Feature vector named "word2vec", also computed by get_word_vectors() from
# the English non-stopword tokens of the revision.
# NOTE(review): this repeats the process function and dependency of the
# word_vectors datasource instead of depending on it -- confirm intended.
# NOTE(review): returns=np.ndarray is declared while get_word_vectors()
# builds a (1, VECTORS_DIM) array -- verify this matches what FeatureVector
# expects for its element type.
w2v = FeatureVector("word2vec", get_word_vectors,
depends_on=[english.stopwords.revision.datasources.non_stopwords],
returns=np.ndarray)
# Solves the w2v feature once at import time, with the literal text
# 'Hi there.' supplied in the cache for the revision text.
# NOTE(review): looks like a leftover smoke test; it presumably triggers the
# model load on import -- confirm it should stay at module level.
z=solve(w2v, cache={revision_oriented.revision.text: 'Hi there.'})
# Feature list for this module; currently only the word2vec feature vector.
drafttopic = [w2v]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment