Skip to content

Instantly share code, notes, and snippets.

@kylepjohnson
Created November 19, 2015 04:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kylepjohnson/dc5027e3b83ee5a7d62c to your computer and use it in GitHub Desktop.
Save kylepjohnson/dc5027e3b83ee5a7d62c to your computer and use it in GitHub Desktop.
Get all unique words from PHI5, output to file
# coding: utf-8
# In[1]:
import os
import re
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
# In[2]:
# File paths for the PHI5 author texts, as returned by the CLTK helper.
filepaths = assemble_phi5_author_filepaths()
# In[3]:
# Accumulate every distinct whitespace-separated token seen in any file.
total_uniques = set()
for count, filepath in enumerate(filepaths, start=1):
    # Report progress on every 50th file.
    if count % 50 == 0:
        print('Processing text #{}.'.format(count))
    # Decode explicitly so the result does not depend on the locale's default
    # encoding. NOTE(review): assumes the PHI5 plaintext files are UTF-8 --
    # confirm against how the corpus was converted.
    with open(filepath, encoding='utf-8') as file_open:
        file_read = file_open.read()
    # Case-fold before cleanup so tokens differing only in case collapse to
    # a single vocabulary entry.
    file_read_lowered = file_read.casefold()
    file_clean = phi5_plaintext_cleanup(file_read_lowered, rm_punctuation=True, rm_periods=True)
    # update() grows the set in place; rebinding to union() would copy the
    # whole accumulated vocabulary once per file.
    total_uniques.update(file_clean.split())
# In[4]:
# Number of distinct tokens collected before the extra symbol cleanup.
print(len(total_uniques))
# In[22]:
# extra cleanup
# Symbols to delete from inside each token: parentheses, curly quotes,
# slashes and assorted editorial marks. (`≈` is listed twice in the
# alternation; the duplicate is harmless.)
comp = re.compile(r'\(|\)|“|#|%|⚔|&|=|/|\\|〚|†|『|⚖|–|˘|⚕|☾|◌|◄|►|⌐|⌊|⌋|≈|∷|≈|∞|”')
# Strip each token exactly once, drop tokens that become empty, and build the
# set directly so tokens that collapse to the same string are deduplicated
# without an intermediate list.
total_uniques_clean = {
    cleaned
    for cleaned in (comp.sub('', w) for w in total_uniques)
    if cleaned
}
# In[24]:
# Sort the cleaned vocabulary (default string ordering) for a stable output.
uniques = sorted(total_uniques_clean)
# In[28]:
# One token per line; no trailing newline is added after the last token.
uniques_str = '\n'.join(uniques)
# In[27]:
# Write as UTF-8 explicitly: the vocabulary may contain non-ASCII tokens, and
# the locale's default encoding could otherwise fail or vary between machines.
with open('phi5_vocabulary.txt', 'w', encoding='utf-8') as fo:
    fo.write(uniques_str)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment