Last active
March 19, 2019 17:54
-
-
Save VXU1230/166079105d6d869fd94dfc8ab315d231 to your computer and use it in GitHub Desktop.
create vocabulary
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Percentile cut-offs (0-100) over the distribution of word frequencies.
# Words rarer than the PERCENT_L percentile or more frequent than the
# PERCENT_U percentile are meant to be excluded from the vocabulary.
# A falsy value (0 / None) disables the corresponding bound.
PERCENT_L = 10
PERCENT_U = 90

# Directory where the vocabulary file (dic.npy) is written and read back.
LOG_DIR = os.getcwd()

# When True, build_vocab() restores the saved vocabulary instead of
# rebuilding it from the training data.
WARM_START = False
def build_vocab(data):
    """Build the word -> id vocabulary, or restore it from disk.

    When ``WARM_START`` is false, word frequencies are counted over
    ``data``, words whose frequency falls outside the configured
    percentile band (``PERCENT_L`` .. ``PERCENT_U``) are dropped, and the
    remaining words receive ids 1..N in order of decreasing frequency.
    Id 0 is reserved for ``"<unk>"``. The mapping is saved to
    ``LOG_DIR/dic.npy``.

    When ``WARM_START`` is true, the mapping is loaded from that file
    and ``data`` is not read.

    Args:
        data: iterable of sentences, each an iterable of hashable words.

    Returns:
        A ``(dic, reverse_dic, vocab_size)`` tuple: word -> id mapping,
        id -> word mapping, and the number of entries in ``dic``
        (including ``"<unk>"``).
    """
    dic_path = os.path.join(LOG_DIR, 'dic.npy')

    if WARM_START:
        # The file holds a pickled dict wrapped in a 0-d object array, so
        # allow_pickle=True is required (NumPy >= 1.16.3 refuses otherwise).
        # NOTE: unpickling executes arbitrary code -- only load a dic.npy
        # that this program wrote itself.
        dic = np.load(dic_path, allow_pickle=True).item()
        reverse_dic = {i: w for w, i in dic.items()}
        vocab_size = len(dic)
        print("restored dictionary size: {}".format(vocab_size))
        return dic, reverse_dic, vocab_size

    counts = defaultdict(int)
    for sent in data:
        for word in sent:
            counts[word] += 1

    # Placeholder bounds; they are only reported in the summary line and
    # are applied as filters solely when the matching percentile is set.
    freq_u = 100000
    freq_l = 0
    # Skip the percentile step for an empty corpus: np.percentile cannot
    # produce a usable integer from an empty array.
    if counts and (PERCENT_U or PERCENT_L):
        freqs = np.fromiter(counts.values(), dtype=int)
        if PERCENT_U:
            freq_u = int(np.percentile(freqs, PERCENT_U))
            print("\nwords upper percentile: {} frequency: {}".format(PERCENT_U, freq_u))
        if PERCENT_L:
            freq_l = int(np.percentile(freqs, PERCENT_L))
            print("words lower percentile: {} frequency: {}".format(PERCENT_L, freq_l))
        # A word must satisfy every configured bound. The previous
        # `v <= freq_u or v >= freq_l` was true for all words, so nothing
        # was ever filtered out.
        counts = {
            word: freq
            for word, freq in counts.items()
            if (not PERCENT_U or freq <= freq_u)
            and (not PERCENT_L or freq >= freq_l)
        }

    # Most frequent words get the smallest ids; sorted() is stable, so ties
    # keep their first-seen order.
    ranked = sorted(counts, key=counts.get, reverse=True)
    dic = {word: idx for idx, word in enumerate(ranked, start=1)}
    dic["<unk>"] = 0
    reverse_dic = {i: w for w, i in dic.items()}
    vocab_size = len(dic)
    np.save(dic_path, dic)
    print("vocabulary with word frequency between {} and {} size: {}".format(freq_l, freq_u, vocab_size))
    return dic, reverse_dic, vocab_size
# Build (or restore) the vocabulary from the training corpus only, then
# encode both corpora as lists of word-id lists.
dic, reverse_dic, VOCAB_SIZE = build_vocab(train_text)

# Words missing from the vocabulary (filtered out, or seen only in the test
# set) are mapped to the id of "<unk>".
_unk_id = dic["<unk>"]
train_text_ids = [[dic.get(word, _unk_id) for word in sent] for sent in train_text]
test_text_ids = [[dic.get(word, _unk_id) for word in sent] for sent in test_text]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment