mlai-demo/explore_text_nltk.py

## explore_text_nltk.py
import os
import re
import string

import matplotlib.pyplot as plt
import nltk
#nltk.download('punkt') #if using nltk for the first time or using Colab
#nltk.download('stopwords') #if using nltk for the first time or using Colab
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
%matplotlib inline

# Explore the most frequent words of TextsPub/Russell.txt:
#   1. read the text, skipping blank lines,
#   2. drop 1-2 character words, tokenize, lowercase, strip punctuation,
#   3. remove stop words,
#   4. plot the 40 most frequent words and save the cleaned tokens to
#      TextsPub/Russell_tokens.txt (space-separated).

# Input and output live under TextsPub/ in the current working directory.
path = os.getcwd()
in_path = os.path.join(path, 'TextsPub', 'Russell.txt')
out_path = os.path.join(path, 'TextsPub', 'Russell_tokens.txt')

# Matches a word of one or two characters together with any non-word
# characters directly in front of it.
no_short = re.compile(r'\W*\b\w{1,2}\b')

# Filter blank lines while iterating the handle. Calling f.read() first would
# exhaust the file and leave nothing for a line-by-line pass to see.
with open(in_path) as f:
    text = ''.join(line for line in f if line.strip())

text = no_short.sub('', text)
tokens = word_tokenize(text)
tokens = [w.lower() for w in tokens]

# Delete every punctuation character, then keep purely alphabetic tokens only.
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]
words = [word for word in stripped if word.isalpha()]

# Prefer a user-supplied `my_stop_words` (e.g. defined in an earlier notebook
# cell); otherwise fall back to NLTK's English stop-word list. A set keeps the
# membership test below O(1) per word.
try:
    stop_words = set(my_stop_words)
except NameError:
    stop_words = set(stopwords.words('english'))
words = [w for w in words if w not in stop_words]
new_text = ' '.join(words)

plt.figure(figsize=(18, 9))
fd = nltk.FreqDist(words)
fd.plot(40, title="40 Most Frequent Words", cumulative=False)
#print(new_text[:500])

# Open the output only once the result exists, so a failure earlier in the
# pipeline does not leave a truncated token file behind.
with open(out_path, 'w') as out_f:
    out_f.write(new_text)
	import string
	import re
	import nltk
	#nltk.download('punkt') #if using nltk for the first time or using Colab
	#nltk.download('stopwords') #if using nltk for the first time or using Colab
	from nltk.tokenize import word_tokenize
	from nltk.corpus import stopwords
	import matplotlib.pyplot as plt
	%matplotlib inline

	# Explore the most frequent words of TextsPub/Russell.txt:
	#   1. read the text, skipping blank lines,
	#   2. drop 1-2 character words, tokenize, lowercase, strip punctuation,
	#   3. remove stop words,
	#   4. plot the 40 most frequent words and save the cleaned tokens to
	#      TextsPub/Russell_tokens.txt (space-separated).

	# Input and output live under TextsPub/ in the current working directory.
	path = os.getcwd()
	in_path = os.path.join(path, 'TextsPub', 'Russell.txt')
	out_path = os.path.join(path, 'TextsPub', 'Russell_tokens.txt')

	# Matches a word of one or two characters together with any non-word
	# characters directly in front of it.
	no_short = re.compile(r'\W*\b\w{1,2}\b')

	# Filter blank lines while iterating the handle. Calling f.read() first would
	# exhaust the file and leave nothing for a line-by-line pass to see.
	with open(in_path) as f:
	    text = ''.join(line for line in f if line.strip())

	text = no_short.sub('', text)
	tokens = word_tokenize(text)
	tokens = [w.lower() for w in tokens]

	# Delete every punctuation character, then keep purely alphabetic tokens only.
	table = str.maketrans('', '', string.punctuation)
	stripped = [w.translate(table) for w in tokens]
	words = [word for word in stripped if word.isalpha()]

	# Prefer a user-supplied `my_stop_words` (e.g. defined in an earlier notebook
	# cell); otherwise fall back to NLTK's English stop-word list. A set keeps the
	# membership test below O(1) per word.
	try:
	    stop_words = set(my_stop_words)
	except NameError:
	    stop_words = set(stopwords.words('english'))
	words = [w for w in words if w not in stop_words]
	new_text = ' '.join(words)

	plt.figure(figsize=(18, 9))
	fd = nltk.FreqDist(words)
	fd.plot(40, title="40 Most Frequent Words", cumulative=False)
	#print(new_text[:500])

	# Open the output only once the result exists, so a failure earlier in the
	# pipeline does not leave a truncated token file behind.
	with open(out_path, 'w') as out_f:
	    out_f.write(new_text)