Created
May 6, 2021 13:05
-
-
Save harsh-99/ed0126389a525af7db8a7c85671c32ab to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import gensim | |
from collections import Counter | |
import json | |
# Locations of the two IMDB splits, both relative to the working directory.
_DATASET_ROOT = "./aclImdb"
train_path = _DATASET_ROOT + "/train"
test_path = _DATASET_ROOT + "/test"
# Simple function that reads the data from a directory and returns the data and labels.
# You can write your own reader for other datasets.
def reader(path):
    """Read a sentiment dataset laid out as ``<path>/pos`` and ``<path>/neg``.

    Every file in ``pos`` is loaded with label 1 and every file in ``neg``
    with label 0. All positive samples come before all negative ones, and
    within each class the files are read in sorted file-name order.

    Args:
        path: Directory containing the ``pos`` and ``neg`` sub-directories.

    Returns:
        A ``(data, label)`` tuple of two parallel lists: the full text of
        each file and its integer label.

    Raises:
        OSError: If a sub-directory cannot be listed or a file cannot be
            opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    data = []
    label = []
    for folder, sentiment in (("pos", 1), ("neg", 0)):
        folder_path = os.path.join(path, folder)
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order (and anything derived from it) reproducible.
        for file_name in sorted(os.listdir(folder_path)):
            # Decode as UTF-8 rather than the platform default, and use a
            # context manager so each handle is closed right after reading
            # instead of leaking one descriptor per file.
            with open(os.path.join(folder_path, file_name), encoding="utf-8") as f:
                data.append(f.read())
            label.append(sentiment)
    return data, label
def build_vocab(data, min_word_count=5, save_path="word2id.json"):
    """Build a word-to-id lookup table from raw text and optionally save it.

    Each text is tokenised with ``gensim.utils.simple_preprocess`` and token
    frequencies are accumulated over the whole corpus. Ids 0 and 1 are
    reserved for ``<pad>`` and ``<unk>``; every retained word receives the
    next free id, in the order the words were first counted.

    Args:
        data: Iterable of raw text strings.
        min_word_count: Frequency threshold. A word is kept only if it
            occurs strictly MORE than this many times in the entire data.
        save_path: File the table is written to as JSON. Pass ``None`` to
            skip writing. Defaults to ``"word2id.json"``.

    Returns:
        Dict mapping each token (plus ``<pad>`` and ``<unk>``) to its
        integer id.
    """
    counter = Counter()
    for line in data:
        counter.update(gensim.utils.simple_preprocess(line))

    # Initialise the lookup table with the reserved special tokens.
    word2id = {"<pad>": 0, "<unk>": 1}

    # Include only those words which have occurred more than min_word_count
    # times in the entire data; ids continue after the reserved ones.
    next_id = 2
    for word, count in counter.items():
        if count > min_word_count:
            word2id[word] = next_id
            next_id += 1

    if save_path is not None:
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(word2id, f)
    return word2id
# Script driver: load the training split, build the vocabulary from its
# text (build_vocab also writes the table to disk), then report its size.
training_set = reader(train_path)
data, label = training_set
word2id = build_vocab(data)
vocab_size = len(word2id)
print("Dictionary Formed and saved. The length of dictionary is-: ", vocab_size)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment