import io

import numpy as np


def load_vectors(fname):
    """Load fastText .vec embeddings into a {token: vector} dict."""
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # The first line of a .vec file holds the vocab size and dimension
        n, d = map(int, fin.readline().split())
        data = {}
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = list(map(float, tokens[1:]))
    return n, d, data

# Download pre-trained vectors from fasttext.cc
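# One way to fetch them (a sketch; the exact archive name is an assumption,
# so check fasttext.cc for the current English .vec downloads):
#
#   wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
#   unzip wiki-news-300d-1M.vec.zip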
fast_text_fname = 'ADD_FNAME'
n, d, ft_dict = load_vectors(fast_text_fname)

def get_ft_vec(token, ft_dict, d=300):
    """Look up a token's embedding; fall back to a zero vector if OOV."""
    try:
        v = np.array(ft_dict[token])
    except KeyError:
        v = np.zeros(d)
    return v

def compute_ft_sum(text, ft_dict, d=300):
    """Sum the token embeddings, each weighted by its count in `text`."""
    ret_vec = np.zeros(d)
    for key in text.keys():
        # Multiply the word embedding by the number of
        # times the user searched for this token
        vec = np.multiply(get_ft_vec(key, ft_dict, d), int(text[key]))
        # Vector addition of token embeddings
        ret_vec = ret_vec + vec
    return list(ret_vec)

# `document_series` is assumed to be a pandas Series whose entries are
# {token: count} dicts (bag-of-words counts per user/document)
vec_mappings = document_series.map(lambda x: compute_ft_sum(x, ft_dict, d=300))
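# A minimal end-to-end sketch with a made-up 3-dimensional vocabulary
# (illustration only; runs without any fastText file):
#
#   import pandas as pd
#   toy_dict = {'cat': [1.0, 0.0, 0.0], 'dog': [0.0, 1.0, 0.0]}
#   docs = pd.Series([{'cat': 2, 'dog': 1}, {'cat': 1, 'quokka': 5}])
#   docs.map(lambda x: compute_ft_sum(x, toy_dict, d=3))
#   # -> [2.0, 1.0, 0.0]  (2*cat + 1*dog)
#   #    [1.0, 0.0, 0.0]  ('quokka' is OOV and falls back to the zero vector)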