Last active
August 11, 2019 19:02
-
-
Save mlai-demo/4145a7c2be538c6a50a7646bb3917549 to your computer and use it in GitHub Desktop.
Tokenize a text file using NLTK: lowercase, strip punctuation, remove stopwords, and plot the 40 most frequent words.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string | |
from nltk.tokenize import word_tokenize | |
nltk.download('punkt') #need if using Google Colab | |
import matplotlib.pyplot as plt | |
%matplotlib inline | |
with open(fpath + '/Plutarch.txt') as f, open(fpath + '/Plutarch_tokens.txt', 'w') as out_f: | |
text = f.read() | |
tokens = word_tokenize(text) | |
tokens = [w.lower() for w in tokens] | |
table = str.maketrans('', '', string.punctuation) | |
stripped = [w.translate(table) for w in tokens] | |
words = [word for word in stripped if word.isalpha()] | |
stop_words = stopwords.words('english') | |
new_stop_words = ['one','came', 'come', 'upon', 'made','though', 'indeed', 'left', 'yet', 'without' | |
'thus','therefore', 'another', 'much', 'many','said', 'either', 'two', 'upon', 'would', | |
'around', 'without', 'when', 'also', 'could', 'sent', 'notwithstanding', 'hence', 'thus'] | |
stop_words.extend(new_stop_words) | |
words = [w for w in words if not w in stop_words] | |
new_text = ' '.join(words) | |
plt.figure(figsize=(16, 7)) | |
fd = nltk.FreqDist(words) | |
fd.plot(40,title = "40 Most Frequent Words", cumulative=False) | |
#print(new_text[:500]) #uncomment if would like to print some of the tokenized text | |
out_f.write(new_text) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment