Sunil talk2sunil83

## calculate_average_word_to_vec.py
# Accumulates, for each sentence, the sum of the vectors of its in-vocabulary
# words together with a count of those words.
#
# Parameters (as used by the visible code):
#   cleaned_sentences - iterable of strings; each one is split on whitespace.
#   vectors           - mapping indexed by word (`vectors[word]`).
#   vocab             - container used for the membership test `word in vocab`.
#   word_vector_len   - length passed to `np.zeros` for the per-sentence vector.
#
# NOTE(review): this definition is truncated after `if word_count > 0:`. The
# branch body, any append to `avg_w2v_vectors` and the return statement are not
# visible here. Presumably the sum is divided by `word_count` and the result
# collected and returned - confirm against the complete source.
def calculate_avg_word2vec(cleaned_sentences, vectors, vocab, word_vector_len):
    avg_w2v_vectors = []  # result list; nothing is appended to it in the visible code
    for sentence in tqdm(cleaned_sentences):
        vector = np.zeros(word_vector_len)  # running sum of word vectors for the current sentence
        word_count = 0  # number of in-vocabulary words seen in the current sentence
        for word in sentence.split():
            # Words missing from vocab are skipped and do not affect the count.
            if word in vocab:
                vector += vectors[word]
                word_count += 1
        # Guard for sentences with no in-vocabulary words (body missing, see note above).
        if word_count > 0:

## binary encode column having multiple values in cell or row.py
def get_binary_encoded_col(data, column_name, sep=','):
    """Binary-encode a column whose cells hold several ``sep``-separated values.

    Args:
        data: Object indexable by ``column_name`` whose result exposes a
            ``.values`` array supporting ``astype('U')``.
        column_name: Name of the column to encode.
        sep: Separator between the individual values inside one cell.

    Returns:
        The matrix produced by ``CountVectorizer.fit_transform``: one row per
        cell and one column per distinct value, in the order returned by
        ``get_all_unique_of_column``. An entry is 1 when the cell contains
        that value and 0 otherwise.
    """
    # Casting to unicode turns NaN into the literal string 'nan', so missing
    # cells become a category of their own instead of an invalid document:
    # https://stackoverflow.com/questions/39303912/tfidfvectorizer-in-scikit-learn-valueerror-np-nan-is-an-invalid-document/39308809
    all_values = data[column_name].values.astype('U')
    vocabulary = get_all_unique_of_column(all_values, sep=sep)
    vectorizer = CountVectorizer(
        vocabulary=vocabulary,
        # Tokenise each cell exactly the way the vocabulary was built. The
        # default token pattern only yields runs of two or more word
        # characters, so values containing spaces or punctuation, or only one
        # character long, would be in the vocabulary yet never be matched.
        tokenizer=lambda cell: cell.split(sep),
        token_pattern=None,  # unused once a tokenizer is supplied
        lowercase=False,
        binary=True,
    )
    # With a fixed vocabulary nothing is learned, so a single pass suffices.
    return vectorizer.fit_transform(all_values)

## get_all_unique_values_of_comma_separated_values_in_column.py
def get_all_unique_of_column(col_values, sep=','):
    """Return the distinct ``sep``-separated values found across ``col_values``.

    Every item is converted with ``str`` and split on ``sep``. The pieces are
    de-duplicated as-is (surrounding whitespace is not stripped).

    Args:
        col_values: Iterable of cell values; non-string items are stringified.
        sep: Separator between the individual values inside one cell.

    Returns:
        A list of the unique pieces, sorted ascending.
    """
    unique_values = {
        value
        for item in col_values
        for value in str(item).split(sep)
    }
    # Iteration order of a set of strings changes between interpreter runs
    # (hash randomisation); sorting keeps the result reproducible.
    return sorted(unique_values)
	# Flattened copy of calculate_avg_word2vec: every line sits at the same tab
	# indent, so the nesting below is not expressed by indentation.
	# The visible statements accumulate, per sentence, the sum of the vectors of
	# its in-vocabulary words together with a count of those words.
	# NOTE(review): the definition is truncated after `if word_count > 0:`. The
	# branch body, any append to `avg_w2v_vectors` and the return statement are
	# not visible here - confirm against the complete source.
	def calculate_avg_word2vec(cleaned_sentences, vectors, vocab, word_vector_len):
	avg_w2v_vectors = []
	for sentence in tqdm(cleaned_sentences):
	vector = np.zeros(word_vector_len) # running sum of word vectors for the current sentence
	word_count = 0 # number of in-vocabulary words seen in the current sentence
	for word in sentence.split():
	# Words missing from vocab are skipped and do not affect the count.
	if word in vocab:
	vector += vectors[word]
	word_count += 1
	# Guard for sentences with no in-vocabulary words (body missing, see note above).
	if word_count > 0:
	def get_binary_encoded_col(data, column_name, sep=','):
	    """Binary-encode a column whose cells hold several ``sep``-separated values.

	    Args:
	        data: Object indexable by ``column_name`` whose result exposes a
	            ``.values`` array supporting ``astype('U')``.
	        column_name: Name of the column to encode.
	        sep: Separator between the individual values inside one cell.

	    Returns:
	        The matrix produced by ``CountVectorizer.fit_transform``: one row per
	        cell and one column per distinct value, in the order returned by
	        ``get_all_unique_of_column``. An entry is 1 when the cell contains
	        that value and 0 otherwise.
	    """
	    # Casting to unicode turns NaN into the literal string 'nan', so missing
	    # cells become a category of their own instead of an invalid document:
	    # https://stackoverflow.com/questions/39303912/tfidfvectorizer-in-scikit-learn-valueerror-np-nan-is-an-invalid-document/39308809
	    all_values = data[column_name].values.astype('U')
	    vocabulary = get_all_unique_of_column(all_values, sep=sep)
	    vectorizer = CountVectorizer(
	        vocabulary=vocabulary,
	        # Tokenise each cell exactly the way the vocabulary was built. The
	        # default token pattern only yields runs of two or more word
	        # characters, so values containing spaces or punctuation, or only one
	        # character long, would be in the vocabulary yet never be matched.
	        tokenizer=lambda cell: cell.split(sep),
	        token_pattern=None,  # unused once a tokenizer is supplied
	        lowercase=False,
	        binary=True,
	    )
	    # With a fixed vocabulary nothing is learned, so a single pass suffices.
	    return vectorizer.fit_transform(all_values)
	def get_all_unique_of_column(col_values, sep=','):
	    """Return the distinct ``sep``-separated values found across ``col_values``.

	    Every item is converted with ``str`` and split on ``sep``. The pieces are
	    de-duplicated as-is (surrounding whitespace is not stripped).

	    Args:
	        col_values: Iterable of cell values; non-string items are stringified.
	        sep: Separator between the individual values inside one cell.

	    Returns:
	        A list of the unique pieces, sorted ascending.
	    """
	    unique_values = {
	        value
	        for item in col_values
	        for value in str(item).split(sep)
	    }
	    # Iteration order of a set of strings changes between interpreter runs
	    # (hash randomisation); sorting keeps the result reproducible.
	    return sorted(unique_values)