Skip to content

Instantly share code, notes, and snippets.

@mlai-demo
Last active January 11, 2020 22:59
Show Gist options
  • Save mlai-demo/4ed02f2bba7e9faf1691f05eab8510d2 to your computer and use it in GitHub Desktop.
Save mlai-demo/4ed02f2bba7e9faf1691f05eab8510d2 to your computer and use it in GitHub Desktop.
explore text using nltk
import os
import re
import string

import matplotlib.pyplot as plt
import nltk
#nltk.download('punkt') #if using nltk for the first time or using Colab
#nltk.download('stopwords') #if using nltk for the first time or using Colab
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
%matplotlib inline
# Explore a text with NLTK: remove very short words, tokenize, normalise the
# tokens, drop stop words, plot the 40 most frequent words, and save the
# cleaned text next to the source file.
path = os.getcwd()
texts_dir = os.path.join(path, 'TextsPub')

# Matches a word of one or two word-characters together with any non-word
# characters (whitespace, punctuation) immediately in front of it.
no_short = re.compile(r'\W*\b\w{1,2}\b')

# Read the whole source text and close the file straight away. No separate
# blank-line pass is needed: the output is rebuilt from tokens joined by
# single spaces, so empty input lines never reach the output file.
with open(os.path.join(texts_dir, 'Russell.txt')) as f:
    text = f.read()

text = no_short.sub('', text)
tokens = word_tokenize(text)
tokens = [w.lower() for w in tokens]

# Strip punctuation characters from every token, then keep only the tokens
# that are purely alphabetic (drops numbers and now-empty strings).
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]
words = [word for word in stripped if word.isalpha()]

# `my_stop_words` is not defined in this snippet. Use it when an earlier
# notebook cell has supplied a custom list; otherwise fall back to NLTK's
# English stop-word list (requires nltk.download('stopwords')).
my_stop_words = globals().get('my_stop_words')
if my_stop_words is None:
    my_stop_words = stopwords.words('english')
stop_words = my_stop_words
stop_word_set = set(stop_words)  # O(1) membership test in the filter below
words = [w for w in words if w not in stop_word_set]
new_text = ' '.join(words)

plt.figure(figsize=(18, 9))
fd = nltk.FreqDist(words)
fd.plot(40, title="40 Most Frequent Words", cumulative=False)
#print(new_text[:500])

# Open the output only once the cleaned text exists, so a failure earlier in
# the pipeline cannot leave behind a truncated, empty token file.
with open(os.path.join(texts_dir, 'Russell_tokens.txt'), 'w') as out_f:
    out_f.write(new_text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment