timotta/word2vec_to_indexed_dataframe.py

## word2vec_to_indexed_dataframe.py
# Needs EMBED_SIZE, SEQUENCE_SIZE and W2V_EPOCHS to be defined beforehand.
# df.text_array is a column holding a list of words in each cell.

# Train a Word2Vec model on the token lists stored in df.text_array.
w2v_model = Word2Vec(
    sentences=df.text_array,
    vector_size=EMBED_SIZE,
    window=5,
    # NOTE(review): per the gensim docs, words rarer than min_count are
    # dropped, so 1 should keep every word -- confirm.
    min_count=1,
    workers=4,
    seed=1982,
    epochs=W2V_EPOCHS,
)
# Register the special tokens after training, each with an all-zero vector.
# NOTE(review): presumably they are appended after the trained vocabulary, so
# the indices of trained words are unaffected -- confirm against gensim docs.
w2v_model.wv.add_vector("<UNK>", np.zeros(EMBED_SIZE))
w2v_model.wv.add_vector("<PAD>", np.zeros(EMBED_SIZE))

# Report how many keys the model's key-to-index mapping now holds.
print(f"vocabulary: {len(w2v_model.wv.key_to_index)}")

def pad(size):
    """Build a function that fits a token list to ``size`` items.

    The returned function left-pads lists shorter than ``size`` with
    "<PAD>" tokens and slices any other list with ``[:size]``. It always
    returns a new list and never mutates its argument.
    """
    def _pad(text_array):
        missing = size - len(text_array)
        # Guard clause: nothing to add, so only truncation can apply.
        if missing <= 0:
            return text_array[:size]
        return ["<PAD>"] * missing + text_array
    return _pad

def word_to_index_array(w2v_model):
    """Build a function that maps a token list to vocabulary indices.

    Args:
        w2v_model: object exposing ``wv.key_to_index``, a mapping from
            token to integer index.

    Returns:
        A function taking a list of tokens and returning a list with the
        index of each token. Tokens missing from the mapping get the index
        stored under "<UNK>" (``None`` if "<UNK>" itself is absent).
    """
    def _w(text_array):
        key_to_index = w2v_model.wv.key_to_index
        unk_index = key_to_index.get("<UNK>")
        # Fall back only when the token is truly absent: index 0 is a valid
        # index and must not be mistaken for "not found" by a truthiness test.
        return [key_to_index.get(token, unk_index) for token in text_array]
    return _w

# Left-pad with "<PAD>" or truncate every token list to SEQUENCE_SIZE items.
df["text_array_padded"] = df.text_array.apply(pad(size=SEQUENCE_SIZE))

# Replace each token of the padded lists with its index in the model vocabulary.
df["text_array_indexes"] = df.text_array_padded.apply(word_to_index_array(w2v_model))
	# Needs EMBED_SIZE, SEQUENCE_SIZE and W2V_EPOCHS to be defined beforehand.
	# df.text_array is a column holding a list of words in each cell.

	# Train a Word2Vec model on the token lists stored in df.text_array.
	w2v_model = Word2Vec(
	sentences=df.text_array,
	vector_size=EMBED_SIZE,
	window=5,
	# NOTE(review): per the gensim docs, words rarer than min_count are
	# dropped, so 1 should keep every word -- confirm.
	min_count=1,
	workers=4,
	seed=1982,
	epochs=W2V_EPOCHS,
	)
	# Register the special tokens after training, each with an all-zero vector.
	# NOTE(review): presumably they are appended after the trained vocabulary,
	# so the indices of trained words are unaffected -- confirm against gensim docs.
	w2v_model.wv.add_vector("<UNK>", np.zeros(EMBED_SIZE))
	w2v_model.wv.add_vector("<PAD>", np.zeros(EMBED_SIZE))

	# Report how many keys the model's key-to-index mapping now holds.
	print(f"vocabulary: {len(w2v_model.wv.key_to_index)}")

	def pad(size):
		"""Build a function that fits a token list to ``size`` items.

		The returned function left-pads lists shorter than ``size`` with
		"<PAD>" tokens and slices any other list with ``[:size]``. It always
		returns a new list and never mutates its argument.
		"""
		def _pad(text_array):
			missing = size - len(text_array)
			# Guard clause: nothing to add, so only truncation can apply.
			if missing <= 0:
				return text_array[:size]
			return ["<PAD>"] * missing + text_array
		return _pad

	def word_to_index_array(w2v_model):
		"""Build a function that maps a token list to vocabulary indices.

		Args:
			w2v_model: object exposing ``wv.key_to_index``, a mapping from
				token to integer index.

		Returns:
			A function taking a list of tokens and returning a list with the
			index of each token. Tokens missing from the mapping get the index
			stored under "<UNK>" (``None`` if "<UNK>" itself is absent).
		"""
		def _w(text_array):
			key_to_index = w2v_model.wv.key_to_index
			unk_index = key_to_index.get("<UNK>")
			# Fall back only when the token is truly absent: index 0 is a valid
			# index and must not be mistaken for "not found" by a truthiness test.
			return [key_to_index.get(token, unk_index) for token in text_array]
		return _w

	# Left-pad with "<PAD>" or truncate every token list to SEQUENCE_SIZE items.
	df["text_array_padded"] = df.text_array.apply(pad(size=SEQUENCE_SIZE))

	# Replace each token of the padded lists with its index in the model vocabulary.
	df["text_array_indexes"] = df.text_array_padded.apply(word_to_index_array(w2v_model))