import io

import numpy as np


def load_vectors(fname):
    """Load fastText .vec embeddings into a {token: vector} dict."""
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # The first line of a .vec file holds the vocab size and dimension
        n, d = map(int, fin.readline().split())
        data = {}
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = list(map(float, tokens[1:]))
    return n, d, data

# Download pre-trained vectors from fasttext.cc
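# One way to fetch them (a sketch; the exact archive name is an assumption,
# so check fasttext.cc for the current English .vec downloads):
#
#   wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
#   unzip wiki-news-300d-1M.vec.zip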
fast_text_fname = 'ADD_FNAME'
n, d, ft_dict = load_vectors(fast_text_fname)

def get_ft_vec(token, ft_dict, d=300):
    """Look up a token's embedding; fall back to a zero vector if OOV."""
    try:
        v = np.array(ft_dict[token])
    except KeyError:
        v = np.zeros(d)
    return v

def compute_ft_sum(text, ft_dict, d=300):
    """Sum the token embeddings, each weighted by its count in `text`."""
    ret_vec = np.zeros(d)
    for key in text.keys():
        # Multiply the word embedding by the number of
        # times the user searched for this token
        vec = np.multiply(get_ft_vec(key, ft_dict, d), int(text[key]))
        # Vector addition of token embeddings
        ret_vec = ret_vec + vec
    return list(ret_vec)

# `document_series` is assumed to be a pandas Series whose entries are
# {token: count} dicts (bag-of-words counts per user/document)
vec_mappings = document_series.map(lambda x: compute_ft_sum(x, ft_dict, d=300))
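# A minimal end-to-end sketch with a made-up 3-dimensional vocabulary
# (illustration only; runs without any fastText file):
#
#   import pandas as pd
#   toy_dict = {'cat': [1.0, 0.0, 0.0], 'dog': [0.0, 1.0, 0.0]}
#   docs = pd.Series([{'cat': 2, 'dog': 1}, {'cat': 1, 'quokka': 5}])
#   docs.map(lambda x: compute_ft_sum(x, toy_dict, d=3))
#   # -> [2.0, 1.0, 0.0]  (2*cat + 1*dog)
#   #    [1.0, 0.0, 0.0]  ('quokka' is OOV and falls back to the zero vector)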