Skip to content

Instantly share code, notes, and snippets.

View talk2sunil83's full-sized avatar

Sunil talk2sunil83

  • Focus Edumatics
View GitHub Profile
def calculate_avg_word2vec(cleaned_sentences, vectors, vocab, word_vector_len):
avg_w2v_vectors = []
for sentence in tqdm(cleaned_sentences):
vector = np.zeros(word_vector_len) # Avg vector for current sentence
word_count = 0 # word count of current sentence
for word in sentence.split():
if word in vocab:
vector += vectors[word]
word_count += 1
if word_count > 0:
def get_binary_encoded_col(data, column_name, sep=','):
    """Binary (multi-hot) encode a column holding ``sep``-delimited values.

    Args:
        data: Table-like object; ``data[column_name].values`` must yield an
            array of cell values (presumably a pandas DataFrame -- confirm
            against callers).
        column_name: Name of the column to encode.
        sep: Delimiter separating the individual items inside one cell.

    Returns:
        The matrix produced by ``CountVectorizer`` with one row per cell and
        one column per entry of ``get_all_unique_of_column(...)``; an entry
        is 1 when the cell contains that item and 0 otherwise.
    """
    # https://stackoverflow.com/questions/39303912/tfidfvectorizer-in-scikit-learn-valueerror-np-nan-is-an-invalid-document/39308809
    # Converting to unicode turns NaN into the string 'nan', so missing
    # values are handled as a separate category instead of raising.
    all_values = data[column_name].values.astype('U')
    vectorizer = CountVectorizer(
        vocabulary=get_all_unique_of_column(all_values, sep=sep),
        # Tokenize exactly the way the vocabulary was built. The default
        # token pattern drops single-character tokens and breaks items on
        # spaces/punctuation, leaving those vocabulary columns always zero.
        tokenizer=lambda text: text.split(sep),
        token_pattern=None,  # unused once a tokenizer is supplied
        lowercase=False,
        binary=True,
    )
    # The vocabulary is fixed up front, so one fit_transform pass suffices.
    return vectorizer.fit_transform(all_values)
def get_all_unique_of_column(col_values, sep=','):
    """Return the distinct ``sep``-delimited items found across all values.

    Each value is converted with ``str()`` and split on ``sep``; the pieces
    from every value are pooled and de-duplicated.

    Args:
        col_values: Iterable of cell values (any type; stringified first).
        sep: Delimiter separating the individual items inside one value.

    Returns:
        A sorted list of the unique items. Sorting keeps the order stable
        across interpreter runs, which plain set iteration does not.
    """
    unique_items = {
        piece
        for value in col_values
        for piece in str(value).split(sep)
    }
    return sorted(unique_items)