Skip to content

Instantly share code, notes, and snippets.

@Sentdex
Created November 19, 2017 16:30
Show Gist options
  • Save Sentdex/c9dbbf4207c8095b2d96ab6a7288c568 to your computer and use it in GitHub Desktop.
Save Sentdex/c9dbbf4207c8095b2d96ab6a7288c568 to your computer and use it in GitHub Desktop.
from collections import Counter
# Build one vocabulary file per training corpus. Each output file holds the
# three special tokens (<unk>, <s>, </s>) followed by up to 50,000 of the most
# frequent words in the corpus, one token per line, most frequent first.
files = ['train.to','train.from']
for name in files:
    # The vocab file reuses the corpus extension: 'train.to' -> 'vocab.to'.
    save_name = "vocab.{}".format(name.split('.')[1])
    print(save_name)

    # Count words one line at a time so the whole corpus never has to sit in
    # memory as a single string plus a flat list of every token.
    word_counts = Counter()
    # Explicit UTF-8 keeps decoding independent of the platform locale.
    with open(name, 'r', encoding='utf-8') as f:
        for line in f:
            # Split on single spaces only (as before); the filter drops the
            # empty strings produced by consecutive or trailing spaces.
            word_counts.update(
                word for word in line.rstrip('\n').split(' ') if word != ''
            )
    count = word_counts.most_common(50000)

    # Open with "w", not "a": appending would stack a second header and a
    # duplicate word list onto the file every time the script is rerun.
    with open(save_name, "w", encoding='utf-8') as f:
        f.write("<unk>\n<s>\n</s>\n")
        f.writelines(word + '\n' for word, _ in count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment