Skip to content

Instantly share code, notes, and snippets.

@gugray gugray/ Secret
Created May 19, 2017

What would you like to do?
import os
import nltk
import string
import json
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from nltk import stem
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.manifold import MDS
# Number of clusters requested from k-means (passed as n_clusters below).
num_clusters = 2
# When true, tokenize_and_stem reduces every kept token to its Porter stem;
# the setting is also encoded in the name of the JSON output file.
stemming = True
# Display names of the sites, one per ".html.txt" file found under ./txt.
sitenames = []
def get_prettyname(filename):
    """Return the display name of a site derived from its text-file name.

    The first three characters of ``filename`` are dropped and every
    occurrence of ``".html.txt"`` is removed from the remainder.

    NOTE(review): the three dropped characters are presumably a fixed-width
    ordering prefix such as ``"01-"`` -- confirm against the files in ./txt.
    """
    return filename[3:].replace(".html.txt", "")
# First pass: init site names (from files)
for filename in os.listdir("./txt"):
    # Only the extracted-text files belong to the corpus.
    if not filename.endswith(".html.txt"):
        continue
    fn = get_prettyname(filename)
    # Store the name; a site's position in sitenames is later used as its rank.
    sitenames.append(fn)
# Second pass: read the full text of every site file.
# NOTE(review): sitetexts[i] is matched with sitenames[i] purely by position,
# which relies on os.listdir returning the files in the same order as in the
# first pass.
sitetexts = []
fcount = 0
for filename in os.listdir("./txt"):
    if not filename.endswith(".html.txt"):
        continue
    fcount += 1
    # if fcount > 2: break  # debugging aid: stop after a couple of files
    with open("./txt/" + filename, "r", encoding="utf8") as f:
        text = f.read()
    sitetexts.append(text)
# Porter stemmer instance used by tokenize_and_stem when stemming is enabled.
stemmer = nltk.PorterStemmer()
# NLTK's English stop-word list, held as a set for fast membership tests.
stop = set(stopwords.words('english'))
def tokenize_and_stem(text):
    """Split ``text`` into lower-cased word tokens, filter them, then stem.

    A token is discarded when it is shorter than three characters, contains
    a digit, is in the module-level ``stop`` set, or is the literal
    ``"..."``.  When the module-level ``stemming`` flag is true, each
    surviving token is replaced by its stem from the module-level
    ``stemmer``.

    Returns:
        The list of kept (and possibly stemmed) tokens, in document order.
    """
    # First tokenize by sentence, then by word, to ensure that punctuation
    # is caught as its own token.
    tokens = [
        word.lower()
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
    ]
    filtered_tokens = []
    for token in tokens:
        # Very short tokens (including most raw punctuation) carry no signal.
        if len(token) < 3:
            continue
        # Drop numbers and anything else with a digit in it.
        if any(char.isdigit() for char in token):
            continue
        if token in stop:
            continue
        # The ellipsis is three characters long, so it passes the length test.
        if token == "...":
            continue
        if stemming:
            token = stemmer.stem(token)
        filtered_tokens.append(token)
    return filtered_tokens
# The rank of a site is its position in sitenames (0 for the first entry).
ranks = list(range(len(sitenames)))
# Build TF-IDF vectors over 1- to 3-grams of the filtered tokens.  Terms
# occurring in more than 60% or fewer than 5% of the documents are ignored.
tfidf_vectorizer = TfidfVectorizer(max_df=0.6, max_features=200000,
                                   min_df=0.05, stop_words='english',
                                   use_idf=True, tokenizer=tokenize_and_stem,
                                   ngram_range=(1, 3))
tfidf_matrix = tfidf_vectorizer.fit_transform(sitetexts)

# get_feature_names() was removed from recent scikit-learn releases in favor
# of get_feature_names_out(); use whichever the installed version provides.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    terms = tfidf_vectorizer.get_feature_names_out()
else:
    terms = tfidf_vectorizer.get_feature_names()

# Pairwise cosine distance between documents (consumed by the MDS step).
dist = 1 - cosine_similarity(tfidf_matrix)

# Cluster the documents.  The model must be fitted before labels_ exists.
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

# One row per site, indexed by cluster label.
sites = {'url': sitenames, 'rank': ranks, 'cluster': clusters}
frame = pd.DataFrame(sites, index=[clusters], columns=['rank', 'url', 'cluster'])
grouped = frame['rank'].groupby(frame['cluster'])
print("Top terms per cluster:")
# For every cluster, the feature indices sorted by descending centroid weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# cluster_words[i] is the comma-separated list of the top terms of cluster i.
cluster_words = []
for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    # The six highest-weighted terms characterize the cluster.
    top_terms = [terms[ind] for ind in order_centroids[i, :6]]
    for term in top_terms:
        print(' %s' % term, end=',')
    print()
    cluster_words.append(", ".join(top_terms))

    print("Cluster %d URLs:" % i, end='')
    # Select by the 'cluster' column; DataFrame.ix no longer exists in pandas.
    in_cluster = (frame['cluster'] == i).values
    for url in frame.loc[in_cluster, 'url'].tolist():
        print(' %s,' % url, end='')
    print()
# Two components, as we're plotting points in a two-dimensional plane.
# "precomputed" because we provide a distance matrix ourselves.
# We also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist) # shape (n_samples, n_components): one row per site
# Column 0 holds the x coordinates, column 1 the y coordinates.
xs, ys = pos[:, 0], pos[:, 1]
def strip_proppers_POS(text):
    """Return the words of ``text`` that are not tagged as proper nouns.

    ``text`` is split on whitespace and tagged with NLTK's part-of-speech
    tagger; words tagged ``NNP`` or ``NNPS`` are left out.

    Returns:
        The remaining words as a list, in their original order.
    """
    # Go through the nltk module: pos_tag itself is never imported by name.
    tagged = nltk.pos_tag(text.split())
    return [word for word, tag in tagged if tag not in ('NNP', 'NNPS')]
# Set up colors per cluster using a dict keyed by cluster label.
# Only six colors are defined; plotting more than six clusters requires
# additional entries here.
cluster_colors = {0: '#ffff00', 1: '#00ff00', 2: '#ff00ff', 3: '#0000ff', 4: '#008000', 5: '#800000'}

# Cluster names are the characteristic words of each cluster, so that
# cluster_names[label] can be used as the legend text for that cluster.
cluster_names = list(cluster_words)
# Create a data frame that holds the result of the MDS (x, y) plus the
# cluster label and the name of each site.
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, url=sitenames))
# Group the rows by cluster label.
groups = df.groupby('label')
# Build the structure that will be dumped as JSON: one entry per cluster.
# Example of the resulting shape:
#   [{
#     "words": "hello, and goodbye",
#     "sites": ["", "", ""],
#     "data": [{"x": 0, "y": 0, "url": "", "rank": 8},
#              {"x": 30, "y": 30, "url": "", "rank": 19}]
#   }]
jsdata = []
for name, group in groups:
    dataset = {
        # Characteristic words of this cluster.
        "words": cluster_names[name],
        # Names of all sites assigned to this cluster.
        "sites": list(group.url),
        # One point per site: MDS coordinates, site name, and the site's
        # rank (its position in sitenames).
        "data": [
            {"x": x, "y": y, "url": url, "rank": sitenames.index(url)}
            for x, y, url in zip(group.x, group.y, group.url)
        ],
    }
    jsdata.append(dataset)
# The output file name encodes the stemming setting and the cluster count,
# e.g. ./work/data-stem-2.json
fname = "./work/data-"
fname += "stem-" if stemming else "nostem-"
fname += str(num_clusters)
fname += ".json"
# Make sure the output directory exists before writing into it.
os.makedirs(os.path.dirname(fname), exist_ok=True)
with open(fname, 'w', encoding="utf8") as f:
    json.dump(jsdata, f, ensure_ascii=False, indent=2)
# Set up the plot.
fig, ax = plt.subplots(figsize=(14, 10))  # set size
ax.margins(0.05)  # optional, just adds 5% padding to the autoscaling

# Iterate through the groups to layer the plot; the group label looks up
# the legend text in cluster_names and the color in cluster_colors.
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
            label=cluster_names[name], color=cluster_colors[name], mec='none')

# Hide the tick marks; MDS coordinates have no meaningful scale.
# Booleans are used because current matplotlib no longer accepts 'off'.
ax.tick_params(
    axis='x',       # changes apply to the x-axis
    which='both',   # both major and minor ticks are affected
    bottom=False,   # ticks along the bottom edge are off
    top=False,      # ticks along the top edge are off
)
ax.tick_params(
    axis='y',       # changes apply to the y-axis
    which='both',   # both major and minor ticks are affected
    left=False,     # ticks along the left edge are off
    top=False,      # ticks along the top edge are off
)

# Show the legend with only one marker per entry.
ax.legend(numpoints=1, loc=3, bbox_to_anchor=(0, 0.95))

# Label every point with the name of its site.
for row in df.itertuples(index=False):
    ax.text(row.x, row.y, row.url, size=10)

# Show the plot.
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.