Created
November 19, 2015 04:37
-
-
Save kylepjohnson/dc5027e3b83ee5a7d62c to your computer and use it in GitHub Desktop.
Get all unique words from PHI5, output to file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
# In[1]:
import os | |
import re | |
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths | |
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup | |
# In[2]:
# File paths of the PHI5 author texts, as returned by the CLTK helper.
filepaths = assemble_phi5_author_filepaths()
# In[3]:
# Accumulate the vocabulary: every distinct whitespace-separated token found
# in any text, after case-folding and CLTK's PHI5 plaintext cleanup.
total_uniques = set()
count = 0  # stays 0 when there are no files; otherwise ends as the file count
for count, filepath in enumerate(filepaths, start=1):
    if count % 50 == 0:
        # Progress report every 50 texts.
        print('Processing text #{}.'.format(count))
    # Decode explicitly instead of relying on the platform's default encoding.
    # NOTE(review): assumes the converted PHI5 plaintext files are UTF-8 -- confirm.
    with open(filepath, encoding='utf-8') as file_open:
        file_read = file_open.read()
    file_read_lowered = file_read.casefold()
    file_clean = phi5_plaintext_cleanup(file_read_lowered, rm_punctuation=True, rm_periods=True)
    # Grow the running set in place rather than rebuilding it with union(),
    # which would copy the whole accumulated vocabulary for every file.
    total_uniques.update(file_clean.split())
# In[4]:
# Size of the vocabulary before the extra symbol cleanup.
print(len(total_uniques))
# In[22]:
# Extra cleanup: delete every occurrence of the stray symbols matched by the
# pattern below from each word, and drop words that end up empty. Collecting
# into a set merges words that become identical once the symbols are gone.
comp = re.compile(r'\(|\)|“|#|%|⚔|&|=|/|\\|〚|†|『|⚖|–|˘|⚕|☾|◌|◄|►|⌐|⌊|⌋|≈|∷|≈|∞|”')
total_uniques_clean = {
    stripped
    for stripped in (comp.sub('', word) for word in total_uniques)
    if stripped
}
# In[24]:
# Sets have no defined order; sort so the output file is deterministic.
uniques = sorted(total_uniques_clean)
# In[28]:
# One word per line, with no trailing newline after the last word.
uniques_str = '\n'.join(uniques)
# In[27]:
# Encode explicitly: the words may contain non-ASCII characters, and the
# platform default encoding is not guaranteed to be able to represent them.
with open('phi5_vocabulary.txt', 'w', encoding='utf-8') as fo:
    fo.write(uniques_str)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment