import os
import nltk
import string
import json
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from nltk import stem, pos_tag
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.manifold import MDS
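# One-time NLTK data downloads needed by the calls below (run once per machine):
#   nltk.download('punkt')                       # sent_tokenize / word_tokenize
#   nltk.download('stopwords')                   # stopwords.words('english')
#   nltk.download('averaged_perceptron_tagger')  # pos_tag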
num_clusters = 2
stemming = True
sitenames = []
def get_prettyname(filename):
    # drop the 3-character prefix and the ".html.txt" suffix
    fn = filename[3:]
    fn = fn.replace(".html.txt", "")
    return fn
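# Illustrative example, assuming the 3-character prefix is a sequence
# number added when the pages were fetched (e.g. "01-"):
#   get_prettyname("01-example.com.html.txt") -> "example.com"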
# First pass: collect site names (from file names)
for filename in os.listdir("./txt"):
    if not filename.endswith(".html.txt"): continue
    fn = get_prettyname(filename)
    sitenames.append(fn)
# Second pass: read each site's extracted text
sitetexts = []
fcount = 0
for filename in os.listdir("./txt"):
    if not filename.endswith(".html.txt"): continue
    fcount += 1
    #if fcount > 2: break  # (debug: process only the first few files)
    with open("./txt/" + filename, "r", encoding="utf8") as f:
        text = f.read()
    sitetexts.append(text)
stemmer = nltk.PorterStemmer()
stop = set(stopwords.words('english'))
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word, to ensure that punctuation is caught as its own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out short tokens, tokens containing digits, stopwords, and ellipses
    for token in tokens:
        if len(token) < 3: continue
        if any(char.isdigit() for char in token): continue
        if token in stop: continue
        if token == "...": continue
        if stemming: token = stemmer.stem(token)
        filtered_tokens.append(token)
    return filtered_tokens
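# Illustrative example (exact stems depend on NLTK's Porter stemmer):
#   tokenize_and_stem("The runners ran quickly in 2020!")
#   -> ['runner', 'ran', 'quickli']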
ranks = list(range(len(sitenames)))
tfidf_vectorizer = TfidfVectorizer(max_df=0.6, max_features=200000,
                                   min_df=0.05, stop_words='english',
                                   use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))
tfidf_matrix = tfidf_vectorizer.fit_transform(sitetexts)
terms = tfidf_vectorizer.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2
dist = 1 - cosine_similarity(tfidf_matrix)
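# tf-idf vectors are non-negative, so cosine similarity lies in [0, 1] and
# each entry of dist is likewise in [0, 1] (0 = identical term profiles).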
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()
sites = { 'url': sitenames, 'rank': ranks, 'cluster': clusters }
frame = pd.DataFrame(sites, index=[clusters], columns=['rank', 'url', 'cluster'])
#print(frame['cluster'].value_counts())
grouped = frame['rank'].groupby(frame['cluster'])
#print(grouped.mean())
print("Top terms per cluster:") | |
print() | |
order_centroids = km.cluster_centers_.argsort()[:, ::-1] | |
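# argsort() with [::-1] sorts each centroid's term indices by descending
# weight, so order_centroids[i, :6] below yields cluster i's six top terms.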
cluster_words = []
for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    words_here = ""
    for ind in order_centroids[i, :6]:
        print(' %s' % terms[ind], end=',')
        if len(words_here) > 0: words_here += ", "
        words_here += terms[ind]
    cluster_words.append(words_here)
    print()
    print()
    print("Cluster %d URLs:" % i, end='')
    for url in frame.loc[i]['url'].values.tolist():  # .ix was removed from pandas; use .loc
        print(' %s,' % url, end='')
    print()
    print()
# two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we also specify `random_state` so the plot is reproducible
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
xs, ys = pos[:, 0], pos[:, 1]
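# pos holds one 2-D point per document, so site i is plotted at
# (xs[i], ys[i]) == (pos[i, 0], pos[i, 1]).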
def strip_proppers_POS(text):
    # use NLTK's part-of-speech tagger to drop proper nouns
    # (defined for experimentation; not called anywhere below)
    tagged = pos_tag(text.split())
    non_propernouns = [word for word, pos in tagged if pos != 'NNP' and pos != 'NNPS']
    return non_propernouns
#set up colors per cluster using a dict
cluster_colors = {0: '#ffff00', 1: '#00ff00', 2: '#ff00ff', 3: '#0000ff', 4: '#008000', 5: '#800000'}
#cluster names could be fixed labels, e.g. {0: 'One', 1: 'Two', ...};
#here they are the characteristic words found above
cluster_names = list(cluster_words)
#create data frame that has the result of the MDS plus the cluster numbers and site URLs
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, url=sitenames))
#group by cluster
groups = df.groupby('label')
# Build the structure that will be dumped as JSON; target shape:
# [{
#    "words": "hello, and goodbye",
#    "sites": ["a.com", "b.com", "d.com"],
#    "data": [{"x": 0, "y": 0, "rank": 8, "url": "a.com"},
#             {"x": 30, "y": 30, "rank": 19, "url": "b.com"}]
# }]
jsdata = []
for name, group in groups:
    dataset = {}
    dataset["words"] = cluster_names[name]
    dataset["sites"] = list(group.url)
    dataset["data"] = []
    # one point per site: MDS coordinates plus the site's position in the original list
    for x, y, url in zip(group.x, group.y, group.url):
        dataset["data"].append({"x": x, "y": y, "url": url, "rank": sitenames.index(url)})
    jsdata.append(dataset)
fname = "./work/data-" | |
if stemming: fname += "stem-" | |
else: fname += "nostem-" | |
fname += str(num_clusters) | |
fname += ".json" | |
with open(fname, 'w', encoding="utf8") as f: | |
json.dump(jsdata, f, ensure_ascii=False, indent=2) | |
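# With the defaults above (stemming=True, num_clusters=2) this writes
# ./work/data-stem-2.json: one entry per cluster, holding its top terms,
# its member sites, and one {x, y, url, rank} point per site.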
# set up plot
fig, ax = plt.subplots(figsize=(14, 10))  # set size
ax.margins(0.05)  # optional, just adds 5% padding to the autoscaling
#iterate through groups to layer the plot
#note the cluster_names and cluster_colors lookups by 'name' return the appropriate label/color
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, label=cluster_names[name], color=cluster_colors[name], mec='none')
    #ax.scatter(group.x, group.y, s=sizes, color=cluster_colors[name])
    ax.set_aspect('auto')
ax.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=False)
ax.tick_params(
    axis='y',          # changes apply to the y-axis
    which='both',      # both major and minor ticks are affected
    left=False,        # ticks along the left edge are off
    right=False,       # ticks along the right edge are off
    labelleft=False)
#show legend with only 1 point
ax.legend(numpoints=1, loc=3, bbox_to_anchor=(0, 0.95))
#add label in x,y position with the site URL as the text
for i in range(len(df)):
    ax.text(df.loc[i]['x'], df.loc[i]['y'], df.loc[i]['url'], size=10)  # .ix was removed from pandas; use .loc
#plt.show()  # show the plot