Skip to content

Instantly share code, notes, and snippets.

@VXU1230
Last active March 19, 2019 17:54
Show Gist options
  • Save VXU1230/166079105d6d869fd94dfc8ab315d231 to your computer and use it in GitHub Desktop.
Save VXU1230/166079105d6d869fd94dfc8ab315d231 to your computer and use it in GitHub Desktop.
create vocabulary
# Lower percentile of the per-word frequency distribution; build_vocab passes
# it to np.percentile to derive the lower frequency cut-off. A falsy value
# (0 / None) skips that computation.
PERCENT_L = 10
# Upper percentile of the per-word frequency distribution; build_vocab passes
# it to np.percentile to derive the upper frequency cut-off. A falsy value
# (0 / None) skips that computation.
PERCENT_U = 90
# Directory in which build_vocab saves / loads 'dic.npy' (the current working
# directory at import time).
LOG_DIR = os.getcwd()
# When True, build_vocab loads the saved dictionary from LOG_DIR instead of
# building a new one from the data.
WARM_START = False
def build_vocab(data, *, percent_l=None, percent_u=None, log_dir=None, warm_start=None):
    """Build a word -> id vocabulary from tokenized sentences, or restore a saved one.

    Words are counted over ``data``; optionally, words whose frequency falls
    outside the ``[percent_l, percent_u]`` percentile band of the frequency
    distribution are dropped. The remaining words receive ids 1..N in order
    of decreasing frequency (ties keep first-seen order) and the special
    token ``"<unk>"`` always receives id 0. The mapping is saved to
    ``<log_dir>/dic.npy``.

    Args:
        data: Iterable of sentences, each an iterable of hashable tokens.
            Ignored when warm-starting.
        percent_l: Lower percentile (0-100) of word frequencies; words rarer
            than the corresponding frequency are dropped. A falsy value
            disables the lower bound. ``None`` means "use ``PERCENT_L``".
        percent_u: Upper percentile (0-100) of word frequencies; words more
            frequent than the corresponding frequency are dropped. A falsy
            value disables the upper bound. ``None`` means "use ``PERCENT_U``".
        log_dir: Directory holding ``dic.npy``. ``None`` means "use ``LOG_DIR``".
        warm_start: If true, load the dictionary from ``log_dir`` instead of
            building it. ``None`` means "use ``WARM_START``".

    Returns:
        A tuple ``(dic, reverse_dic, vocab_size)`` where ``dic`` maps
        word -> id, ``reverse_dic`` maps id -> word and ``vocab_size`` is
        ``len(dic)``.
    """
    # Fall back to the module-level configuration for anything not overridden.
    if percent_l is None:
        percent_l = PERCENT_L
    if percent_u is None:
        percent_u = PERCENT_U
    if log_dir is None:
        log_dir = LOG_DIR
    if warm_start is None:
        warm_start = WARM_START

    dic_path = os.path.join(log_dir, "dic.npy")

    if warm_start:
        # The dict is stored as a pickled 0-d object array, so NumPy >= 1.16.3
        # refuses to load it unless allow_pickle=True is passed explicitly.
        # NOTE(security): unpickling can execute arbitrary code -- only
        # warm-start from a dic.npy file you created yourself.
        dic = np.load(dic_path, allow_pickle=True).item()
        reverse_dic = {i: w for w, i in dic.items()}
        vocab_size = len(dic)
        # No re-save here: the file on disk already holds exactly this dict.
        print("restored dictionary size: {}".format(vocab_size))
        return dic, reverse_dic, vocab_size

    counts = defaultdict(int)
    for sent in data:
        for word in sent:
            counts[word] += 1

    # Defaults are only what gets reported when a bound is disabled; a
    # disabled bound is never applied as a filter.
    freq_u = 100000
    freq_l = 0
    # Skip percentile filtering for an empty corpus: np.percentile on an
    # empty array does not yield a usable integer threshold.
    if counts and (percent_u or percent_l):
        freqs = np.fromiter(counts.values(), dtype=int)
        if percent_u:
            freq_u = int(np.percentile(freqs, percent_u))
            print("\nwords upper percentile: {} frequency: {}".format(percent_u, freq_u))
        if percent_l:
            freq_l = int(np.percentile(freqs, percent_l))
            print("words lower percentile: {} frequency: {}".format(percent_l, freq_l))
        # A word must satisfy every enabled bound ("and"); with "or" the
        # condition held for every word and nothing was ever filtered.
        counts = {
            word: freq
            for word, freq in counts.items()
            if (not percent_u or freq <= freq_u) and (not percent_l or freq >= freq_l)
        }

    # Most frequent first; sorted() is stable, so ties keep first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    # "<unk>" is reserved for id 0. Excluding it before numbering keeps the
    # ids contiguous (0..vocab_size-1) even if the corpus contains that token.
    words = [word for word, _ in ranked if word != "<unk>"]
    dic = {word: idx for idx, word in enumerate(words, start=1)}
    dic["<unk>"] = 0
    reverse_dic = {i: w for w, i in dic.items()}
    vocab_size = len(dic)
    np.save(dic_path, dic)
    print("vocabulary with word frequency between {} and {} size: {}".format(freq_l, freq_u, vocab_size))
    return dic, reverse_dic, vocab_size
# Build (or restore) the vocabulary from the training corpus, then encode
# both corpora as lists of integer ids; any token missing from the
# vocabulary is mapped to the id of "<unk>".
dic, reverse_dic, VOCAB_SIZE = build_vocab(train_text)
train_text_ids = [
    [dic.get(token, dic["<unk>"]) for token in sentence]
    for sentence in train_text
]
test_text_ids = [
    [dic.get(token, dic["<unk>"]) for token in sentence]
    for sentence in test_text
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment