This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

# Working directory of the session; other cells build their file paths from it.
fpath = os.getcwd()
fpath  # bare expression so a notebook cell displays the path
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from google.colab import files

# Ask the user to upload file(s); the result is indexed by file name below.
uploaded = files.upload()
for fn in uploaded:
    print(f'User uploaded file "{fn}" with length {len(uploaded[fn])} bytes')
# Click Files tab - the uploaded file(s) will be there
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string | |
from nltk.tokenize import word_tokenize | |
nltk.download('punkt') #need if using Google Colab | |
import matplotlib.pyplot as plt | |
%matplotlib inline | |
with open(fpath + '/Plutarch.txt') as f, open(fpath + '/Plutarch_tokens.txt', 'w') as out_f: | |
text = f.read() | |
tokens = word_tokenize(text) | |
tokens = [w.lower() for w in tokens] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Count adjacent pairs in `words` and plot the 20 most frequent ones.
bigrams = nltk.bigrams(words)
freq_bigrams = nltk.FreqDist(bigrams)
plt.figure(figsize=(13, 7))
freq_bigrams.plot(20)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.stem.porter import PorterStemmer

# Read Plutarch_tokens.txt, stem every token with the Porter stemmer, and
# write the stems (space-separated) to Plutarch_stem.txt.
with open(fpath + '/Plutarch_tokens.txt') as f, open(fpath + '/Plutarch_stem.txt', 'w') as out_f:
    text = f.read()
    tokens = word_tokenize(text)
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in tokens]
    print(stemmed[:100])  # preview the first 100 stems
    new_stem_text = ' '.join(stemmed)
    fd_stemmed = nltk.FreqDist(stemmed)
    out_f.write(new_stem_text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Summary statistics for `words`: total count, distinct count, mean length.
unique = set(words)
# Guard the mean so an empty `words` reports 0 instead of raising ZeroDivisionError.
avg_len = round(sum(len(word) for word in words) / len(words), 2) if words else 0
print("The tokenized text is {} words long, has {} unique words and {} letters on average".format(
    len(words), len(unique), avg_len))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from os import path

import matplotlib.pyplot as plt
import pandas as pd
from wordcloud import WordCloud

# Directory of this script when `__file__` is defined, otherwise the current
# working directory (the notebook case).
d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()
# Read the token file through a context manager so the handle is closed.
with open(fpath + '/Plutarch_tokens.txt', 'rt') as tokens_file:
    text_standard = tokens_file.read()
wc = WordCloud(stopwords=stop_words, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
from PIL import Image

# Load the mask image into an array; np.array copies the pixel data, so the
# image file can be closed as soon as the array exists.
with Image.open(path.join(d, 'roman7.png')) as mask_image:
    roman_mask = np.array(mask_image)
# Read the stemmed text through a context manager so the handle is closed.
with open(fpath + '/Plutarch_stem.txt', 'rt') as stem_file:
    text_stem = stem_file.read()
wc = WordCloud(stopwords=stop_words, | |
max_font_size=200, | |
width=5000, | |
height=4000, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re

# Working directory of the session; the cleaning cell builds its file paths from it.
fpath = os.getcwd()
fpath  # bare expression so a notebook cell displays the path
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Lowercase the source text, strip unwanted characters, and report word statistics.
# NOTE(review): out_f is opened for writing but never written in the visible
# part of this cell; the rest of the cell appears to be cut off.
with open(fpath + '/Plutarch.txt') as f, \
        open(fpath + '/Plutarch2.txt', 'w') as out_f:
    text = f.read().lower()
    # Keep only wanted characters (alphabet and select punctuation); everything
    # else, newlines included, becomes a space.
    new_text = re.sub(r"[^a-z.?!\-':;]", ' ', text)
    new_text = re.sub(' +', ' ', new_text)  # remove double space
    # No separate newline pass is needed: the first substitution already turned
    # every '\n' into a space, so only single spaces separate the items here.
    items = new_text.split()
    unique_items = set(items)
    # Guard the mean so an empty text reports 0 instead of raising ZeroDivisionError.
    avg_len = round(sum(len(word) for word in items) / len(items), 2) if items else 0
    print("The text is {} words long, has {} unique items and {} characters on average\n".format(
        len(items), len(unique_items), avg_len))
OlderNewer