This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def calculate_avg_word2vec(cleaned_sentences, vectors, vocab, word_vector_len): | |
avg_w2v_vectors = [] | |
for sentence in tqdm(cleaned_sentences): | |
vector = np.zeros(word_vector_len) # Avg vector for current sentence | |
word_count = 0 # word count of current sentence | |
for word in sentence.split(): | |
if word in vocab: | |
vector += vectors[word] | |
word_count += 1 | |
if word_count > 0: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_binary_encoded_col(data, column_name, sep=','):
    """Binary (multi-hot) encode a column of ``sep``-delimited labels.

    Every cell of ``data[column_name]`` is converted to a string and treated
    as a ``sep``-separated list of labels. The result has one row per cell
    and one column per distinct label; an entry is 1 when the label occurs
    in that cell and 0 otherwise.

    Args:
        data: Object for which ``data[column_name].values`` is an array
            (presumably a pandas DataFrame -- confirm against callers).
        column_name: Key of the column to encode.
        sep: Delimiter separating labels inside one cell.

    Returns:
        The document-term matrix produced by ``CountVectorizer`` with
        ``binary=True``. Columns follow the sorted order of the labels
        returned by ``get_all_unique_of_column``.
    """
    # Cast to unicode so that NaN becomes the literal string 'nan' and is
    # handled as a separate category instead of being rejected as an invalid
    # document:
    # https://stackoverflow.com/questions/39303912/tfidfvectorizer-in-scikit-learn-valueerror-np-nan-is-an-invalid-document/39308809
    all_values = data[column_name].values.astype('U')

    def split_on_sep(text):
        # Tokenize exactly the way the vocabulary is built (split on `sep`).
        # The default token pattern only matches runs of 2+ word characters,
        # so labels containing spaces, punctuation, or a single character
        # would never match their vocabulary entry.
        return text.split(sep)

    vectorizer = CountVectorizer(
        # Sorted so the column order is reproducible across runs.
        vocabulary=sorted(get_all_unique_of_column(all_values, sep=sep)),
        tokenizer=split_on_sep,
        token_pattern=None,  # unused when a tokenizer is supplied
        lowercase=False,
        binary=True,
    )
    return vectorizer.fit_transform(all_values)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_all_unique_of_column(col_values, sep=','):
    """Return the distinct ``sep``-delimited labels found in ``col_values``.

    Each item is converted with ``str()`` and split on ``sep``; the pieces
    from all items are de-duplicated.

    Args:
        col_values: Iterable of cell values; each is stringified before
            splitting.
        sep: Delimiter separating labels inside one value.

    Returns:
        A list of the unique labels in sorted order. Sorting keeps the result
        reproducible: the iteration order of a set of strings can change
        between interpreter runs because of hash randomization.
    """
    unique_items = {
        piece
        for value in col_values
        for piece in str(value).split(sep)
    }
    return sorted(unique_items)