Last active
August 11, 2019 19:02
-
-
Save mlai-demo/4145a7c2be538c6a50a7646bb3917549 to your computer and use it in GitHub Desktop.
Tokenize a text file using NLTK: lowercase, strip punctuation, remove stopwords, and plot the 40 most frequent words.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string | |
from nltk.tokenize import word_tokenize | |
nltk.download('punkt') #need if using Google Colab | |
import matplotlib.pyplot as plt | |
%matplotlib inline | |
with open(fpath + '/Plutarch.txt') as f, open(fpath + '/Plutarch_tokens.txt', 'w') as out_f: | |
text = f.read() | |
tokens = word_tokenize(text) | |
tokens = [w.lower() for w in tokens] | |
table = str.maketrans('', '', string.punctuation) | |
stripped = [w.translate(table) for w in tokens] | |
words = [word for word in stripped if word.isalpha()] | |
stop_words = stopwords.words('english') | |
new_stop_words = ['one','came', 'come', 'upon', 'made','though', 'indeed', 'left', 'yet', 'without' | |
'thus','therefore', 'another', 'much', 'many','said', 'either', 'two', 'upon', 'would', | |
'around', 'without', 'when', 'also', 'could', 'sent', 'notwithstanding', 'hence', 'thus'] | |
stop_words.extend(new_stop_words) | |
words = [w for w in words if not w in stop_words] | |
new_text = ' '.join(words) | |
plt.figure(figsize=(16, 7)) | |
fd = nltk.FreqDist(words) | |
fd.plot(40,title = "40 Most Frequent Words", cumulative=False) | |
#print(new_text[:500]) #uncomment if would like to print some of the tokenized text | |
out_f.write(new_text) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment