Tokenize text using NLTK
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')      # needed if using Google Colab
nltk.download('stopwords')  # needed if using Google Colab
import matplotlib.pyplot as plt
%matplotlib inline
# fpath must already be set to the folder that contains Plutarch.txt
with open(fpath + '/Plutarch.txt') as f, open(fpath + '/Plutarch_tokens.txt', 'w') as out_f:
    text = f.read()
    tokens = word_tokenize(text)
    tokens = [w.lower() for w in tokens]                    # lowercase all tokens
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]         # strip punctuation
    words = [word for word in stripped if word.isalpha()]   # keep alphabetic tokens only
    stop_words = stopwords.words('english')
    new_stop_words = ['one', 'came', 'come', 'upon', 'made', 'though', 'indeed', 'left', 'yet',
                      'without', 'thus', 'therefore', 'another', 'much', 'many', 'said', 'either',
                      'two', 'would', 'around', 'when', 'also', 'could', 'sent',
                      'notwithstanding', 'hence']
    stop_words.extend(new_stop_words)
    words = [w for w in words if w not in stop_words]       # drop stop words
    new_text = ' '.join(words)
    plt.figure(figsize=(16, 7))
    fd = nltk.FreqDist(words)
    fd.plot(40, title="40 Most Frequent Words", cumulative=False)
    #print(new_text[:500])  # uncomment to print some of the tokenized text
    out_f.write(new_text)
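
# Quick check of the output: a minimal sketch, assuming the cell above has run
# and fpath still points at the folder that now holds Plutarch_tokens.txt.
with open(fpath + '/Plutarch_tokens.txt') as f:
    cleaned = f.read().split()
print(len(cleaned))    # number of tokens kept after cleaning
print(cleaned[:20])    # first 20 cleaned tokens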