Last active
January 11, 2020 22:59
-
-
Save mlai-demo/4ed02f2bba7e9faf1691f05eab8510d2 to your computer and use it in GitHub Desktop.
explore text using nltk
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Explore a text with NLTK: clean it, tokenize it, drop stop words, save the
# surviving words to a file and plot the 40 most frequent ones.
import os
import re
import string

import matplotlib.pyplot as plt
import nltk
#nltk.download('punkt') #if using nltk for the first time or using Colab
#nltk.download('stopwords') #if using nltk for the first time or using Colab
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# IPython magic; it is not valid Python outside a notebook, so it is kept as a
# comment. Uncomment when running in a Jupyter/Colab cell.
# %matplotlib inline

path = os.getcwd()
in_path = os.path.join(path, 'TextsPub', 'Russell.txt')
out_path = os.path.join(path, 'TextsPub', 'Russell_tokens.txt')

# Matches any word of one or two characters together with the non-word
# characters (spaces, punctuation) immediately preceding it.
no_short = re.compile(r'\W*\b\w{1,2}\b')

# Read the source text, skipping blank lines. The file is iterated exactly
# once: calling f.read() first would exhaust the handle and leave nothing for
# the line filter to see.
with open(in_path) as f:
    text = ''.join(line for line in f if line.strip())

text = no_short.sub('', text)
tokens = word_tokenize(text)
tokens = [w.lower() for w in tokens]

# Strip ASCII punctuation from each token, then keep purely alphabetic tokens.
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]
words = [word for word in stripped if word.isalpha()]

# Prefer a custom `my_stop_words` collection if one was defined earlier (e.g.
# in a previous notebook cell); otherwise fall back to NLTK's English list.
# A set makes each membership test O(1).
try:
    stop_words = set(my_stop_words)
except NameError:
    stop_words = set(stopwords.words('english'))
words = [w for w in words if w not in stop_words]
new_text = ' '.join(words)

# Open the output only once the text is ready, so a failure above does not
# leave a truncated file behind, and write before plotting so the result is
# saved even if the plot cannot be shown.
with open(out_path, 'w') as out_f:
    out_f.write(new_text)

plt.figure(figsize=(18, 9))
fd = nltk.FreqDist(words)
fd.plot(40, title="40 Most Frequent Words", cumulative=False)
#print(new_text[:500])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment