Skip to content

Instantly share code, notes, and snippets.

@arunm8489
Last active August 1, 2020 14:11
Show Gist options
  • Save arunm8489/57068e56d15766bbf841be624c098660 to your computer and use it in GitHub Desktop.
Save arunm8489/57068e56d15766bbf841be624c098660 to your computer and use it in GitHub Desktop.
def tockenize(X_train, X_test):
    """
    Encode whitespace-tokenized sentences as lists of integer word indices.

    The vocabulary is built from ``X_train`` only. Every distinct word is
    ranked by how often it occurs in ``X_train`` (most frequent first, ties
    kept in first-seen order) and mapped to its 1-based rank. Words that are
    not in the vocabulary are dropped from the encoded output, which affects
    only ``X_test`` in practice.

    Note: despite the historical "bow encoding" label, the result is a
    sequence of word indices per sentence, not a bag-of-words count vector.

    Parameters
    ----------
    X_train : iterable of str
        Training sentences. Iterated twice (once to build the vocabulary,
        once to encode), so it must be re-iterable, not a one-shot generator.
    X_test : iterable of str
        Sentences to encode with the training vocabulary.

    Returns
    -------
    train_vec : list of list of int
        Encoded training sentences.
    test_vec : list of list of int
        Encoded test sentences, with out-of-vocabulary words skipped.
    corpus_ : list of str
        Vocabulary words ordered from most to least frequent; the word at
        position ``i`` has index ``i + 1``.
    """
    # Local import keeps the function self-contained: the visible snippet
    # uses Counter without importing it anywhere.
    from collections import Counter

    # Count words straight from a generator; no intermediate word list needed.
    counts = Counter(word for sent in X_train for word in sent.split())

    # most_common() sorts by descending count and keeps first-seen order for
    # ties, the same ordering as sorted(counts, key=counts.get, reverse=True).
    corpus_ = [word for word, _ in counts.most_common()]

    # Indices start at 1, so 0 is never assigned to any word
    # (presumably left free for padding -- confirm against the model code).
    word_to_index = {word: rank for rank, word in enumerate(corpus_, start=1)}

    def encode(sentences):
        # Map each known word to its index; unknown words are skipped.
        return [
            [word_to_index[word] for word in sent.split() if word in word_to_index]
            for sent in sentences
        ]

    return encode(X_train), encode(X_test), corpus_
# Pull the 'essay' column out of each split as a plain array of values
# (X_train / X_test are defined elsewhere; presumably pandas DataFrames).
essay_train, essay_test = X_train['essay'].values, X_test['essay'].values

# Encode both splits with the vocabulary learned from the training essays,
# then report the vocabulary size.
(essay_train_p,
 essay_test_p,
 corpus) = tockenize(essay_train, essay_test)
print(len(corpus))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment