mlai-demo / te1.py
Last active August 11, 2019 18:50
Import os and re, and show the file path
import os
import re
fpath = os.getcwd(); fpath  # in a notebook cell, the trailing fpath echoes the working directory
mlai-demo / te2.py
Last active August 11, 2019 18:54
import files into Google Colab
from google.colab import files
uploaded = files.upload()
for fn in uploaded.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(uploaded[fn])))
# Click Files tab - the uploaded file(s) will be there
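files.upload() returns a dict mapping each filename to that file's contents as bytes; a minimal sketch of reading one upload into a string (the filename and utf-8 encoding here are illustrative, not part of the gist):
text = uploaded['Plutarch.txt'].decode('utf-8')  # bytes -> str; assumes a utf-8 text upload
print(text[:200])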
mlai-demo / te3.py
Last active August 11, 2019 19:02
tokenize text using nltk
import string
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')  # needed if using Google Colab
import matplotlib.pyplot as plt
%matplotlib inline
with open(fpath + '/Plutarch.txt') as f, open(fpath + '/Plutarch_tokens.txt', 'w') as out_f:
    text = f.read()
    tokens = word_tokenize(text)
    tokens = [w.lower() for w in tokens]
    out_f.write(' '.join(tokens))  # assumed: te5.py reads Plutarch_tokens.txt as space-joined tokens
mlai-demo / te4.py
Created August 11, 2019 19:08
NLTK bigrams
import nltk
import matplotlib.pyplot as plt
bigrams = nltk.bigrams(tokens)  # pairs of adjacent tokens from te3.py
freq_bigrams = nltk.FreqDist(bigrams)
plt.figure(figsize=(13, 7))
freq_bigrams.plot(20)  # plot the 20 most frequent bigrams
mlai-demo / te5.py
Last active August 11, 2019 22:22
NLTK stemmer and lemmatizer
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
with open(fpath + '/Plutarch_tokens.txt') as f, open(fpath + '/Plutarch_stem.txt', 'w') as out_f:
    text = f.read()
    tokens = word_tokenize(text)
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in tokens]
    print(stemmed[:100])
    new_stem_text = ' '.join(stemmed)
    fd_stemmed = nltk.FreqDist(stemmed)  # frequency distribution of the stems
    out_f.write(new_stem_text)
unique = set(tokens)  # unique words in the tokenized text
print("The tokenized text is {} words long, has {} unique words and {} letters on average".format(
    len(tokens), len(unique), round(sum(len(word) for word in tokens) / len(tokens), 2)))
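The description also names a lemmatizer, but only the Porter stemmer appears above. A minimal sketch using NLTK's WordNetLemmatizer on the same tokens (the download call and variable names are assumptions, not shown in the gist):
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')  # needed if using Google Colab
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(word) for word in tokens]
print(lemmatized[:100])  # lemmas stay real dictionary words, unlike stems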
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from os import path
from PIL import Image
from wordcloud import WordCloud
from nltk.corpus import stopwords
nltk.download('stopwords')  # needed if using Google Colab
stop_words = set(stopwords.words('english'))  # assumed: stop_words is not defined in the snippet as shown
d = path.dirname(__file__) if "__file__" in locals() else os.getcwd()
text_standard = open(fpath + '/Plutarch_tokens.txt', 'rt').read()
wc = WordCloud(stopwords=stop_words).generate(text_standard)  # the call is truncated in the gist; minimal completion
roman_mask = np.array(Image.open(path.join(d, 'roman7.png')))  # image mask shaping the cloud
text_stem = open(fpath + '/Plutarch_stem.txt', 'rt').read()
wc = WordCloud(stopwords=stop_words,
               max_font_size=200,
               width=5000,
               height=4000,
               mask=roman_mask).generate(text_stem)  # assumed: the remaining arguments are cut off in the gist
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()
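To keep the rendered cloud as an image, the wordcloud library's to_file method works on the generated object; the output filename below is illustrative:
wc.to_file(path.join(d, 'plutarch_cloud.png'))  # hypothetical output name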
mlai-demo / tem1.py
Last active August 19, 2019 14:47
Import OS, regex and show file path
import os
import re
fpath = os.getcwd(); fpath
mlai-demo / tem2.py
Last active August 24, 2019 03:19
tokenize text
with open(fpath + '/Plutarch.txt') as f, \
     open(fpath + '/Plutarch2.txt', 'w') as out_f:
    text = f.read().lower()
    new_text = re.sub(r'[^a-z\.\?\!\-\'\:\;]', ' ', text)  # keep only wanted characters (alphabet and select punctuation)
    new_text = re.sub(' +', ' ', new_text)  # collapse repeated spaces
    new_text = re.sub('\n', ' ', new_text)  # replace newlines with spaces
    out_f.write(new_text)  # assumed: save the cleaned text (the write is not shown in the snippet)
items = [w for w in new_text.split(' ') if w.strip() != '']
unique_items = set(items)
print("The text is {} words long, has {} unique items and {} characters on average\n".format(
    len(items), len(unique_items), round(sum(len(word) for word in items) / len(items), 2)))