Skip to content

Instantly share code, notes, and snippets.

@pwin
Created June 29, 2015 18:14
Show Gist options
  • Save pwin/a7559126ddc4c08d9d56 to your computer and use it in GitHub Desktop.
hierarchical clustering of docs
from __future__ import print_function
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.externals import joblib
import os # for os.path.basename
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.manifold import MDS
from scipy.cluster.hierarchy import ward, dendrogram
####http://brandonrose.org/clustering
# Share-PSI Best Practice wiki pages to fetch and cluster.
_BP_BASE = 'http://www.w3.org/2013/share-psi/wiki/Best_Practices/'
_BP_PAGES = [
    'Cross_Agency_Strategy',
    'High_Level_Support',
    'Holistic_Metrics',
    'User_engagement_and_collaboration_throughout_the_lifecycle',
    'Organisational-internal_engagement',
    'Human_Readability_and_Machine_Processing',
    'Cost_of_Publication',
    'Stakeholders%E2%80%99_Interests_and_Rights',
    'Feedback_to_Improve_Quality',
    'Optimization_for_Search_Engines',
    'Publication_with_Common_Metadata',
    'Catalogs_and_Indexes',
    'Encourage_crowdsourcing',
    'Publish_spatial_data_on_the_web',
    'Monitoring_and_Benchmarking',
    'Open_Data_quality_assessment',
    'Identifying_what_you_already_publish',
    'Make_the_data_available_in_the_language_people_want_it',
    'Management_Of_A_Wide_Public_Actors_Network',
    'Making_Research_Results_Open_For_The_Country',
    'Using_Business_Process_Paradigm_For_Open_Data_Lifecycle_Management',
    'Publishing_Statistical_Data_In_Linked_Data_Format',
    'Supervizor_-_An_Indispensable_Open_Government_Application_(Transparency_Of_Public_Spending)',
    'Civic_Use_Of_Open_Data',
    'Open_Data_Publication_Plan',
    'A_Federation_Tool_For_Opendata_Portals',
    'Traffic_Light_System_For_Data_Sharing',
    'Open_Data_To_Improve_Sharing_And_Publication_Of_Information_Between_Public_Administrations',
    'Commercial_Considerations_in_Open_Data_Portal_Design',
    'Infomediary_Sector_Characteristics',
    'Open_Data_2.0_-_Changing_Perspectives',
    'Open_Data_Business_Model_Patterns_and_Open_Data_Business_Value_Disciplines',
    'The_Central_Role_of_Location',
    'An_ongoing_open_dialog_in_an_open_data_ecosystem',
    'Discover_published_information_by_site_scraping',
    'Free_our_maps',
]
uris = [_BP_BASE + page for page in _BP_PAGES]
# Wiki-markup section headings (rendered as =Heading=) and boilerplate
# phrases to strip from each page before clustering, leaving only free text.
_HEADINGS = [
    'Title', 'Short Description', 'Overview', 'Why', 'Intended Outcome',
    'Life Cycle Stage', 'Possible Approach', 'How to Test', 'Evidence',
    'Lifecycle Stage', 'Audience', 'Related Best Practices', 'Tags',
    'Status', 'Intended Audience',
]
_BOILERPLATE = [
    'nowiki',
    'Name of the Share-PSI workshop:',
    'Title of the Best Practice:',
    'Outline of the best practice',
    'Management summary',
    'Challenge',
    'Solution.',
    'Best Practice Identification',
    'Why is this a Best Practice?',
    "What's the impact of the Best Practice?",
    'Link to the PSI Directive',
    'Why is there a need for this Best Practice?',
    'What do you need for this Best Practice?',
    'Applicability by other member states?',
    'Contact info - record of the person to be contacted for additional information or advice.',
]
replacements = ['=' + heading + '=' for heading in _HEADINGS] + _BOILERPLATE
def tokenize_and_stem(text):
    """Tokenise *text* and return a list of Snowball-stemmed tokens.

    Splits into sentences first so punctuation comes out as its own token,
    then keeps only tokens containing at least one ASCII letter (dropping
    purely numeric tokens and raw punctuation) before stemming.
    """
    words = (token
             for sentence in nltk.sent_tokenize(text)
             for token in nltk.word_tokenize(sentence))
    # `stemmer` is the module-level SnowballStemmer defined later in the file.
    return [stemmer.stem(word) for word in words if re.search('[a-zA-Z]', word)]
def tokenize_only(text):
    """Tokenise *text* into lower-cased word tokens, without stemming.

    Mirrors tokenize_and_stem(): sentence-split, word-split, then keep only
    tokens containing at least one ASCII letter.
    """
    lowered = [word.lower()
               for sentence in nltk.sent_tokenize(text)
               for word in nltk.word_tokenize(sentence)]
    return [token for token in lowered if re.search('[a-zA-Z]', token)]
debug = False
synopses = []  # cleaned page text, one entry per URI
titles = []    # last path segment of each URI, used as the document label
for uri in uris:
    titles.append(uri.split('/')[-1])
    raw_url = uri + "?action=raw"  # MediaWiki raw-markup endpoint
    if debug:
        print(raw_url)
    text = requests.get(raw_url).text.lower()
    if debug:
        print(text)
    # Strip wiki headings/boilerplate. BUG FIX: `text` is already
    # lower-cased, so the mixed-case phrases in `replacements` (e.g.
    # '=Title=') never matched; lower-case each phrase before replacing.
    for phrase in replacements:
        text = text.replace(phrase.lower(), '')
    if debug:
        print(text)
    synopses.append(text)
# Load NLTK's English stopword list.
# NOTE(review): `stopwords` appears unused below — the TF-IDF vectorizer
# passes its own built-in stop_words='english'; confirm before removing.
stopwords = nltk.corpus.stopwords.words('english')
# Snowball stemmer used by tokenize_and_stem(); imported mid-file as in the
# tutorial this script follows (http://brandonrose.org/clustering).
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
totalvocab_stemmed = []    # stemmed tokens across all documents
totalvocab_tokenized = []  # corresponding unstemmed (lower-cased) tokens
for synopsis in synopses:
    totalvocab_stemmed.extend(tokenize_and_stem(synopsis))
    totalvocab_tokenized.extend(tokenize_only(synopsis))
# Map each stem back to a surface word: index is the stem, column 'words'
# holds the unstemmed token (the two lists are built in parallel).
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index=totalvocab_stemmed)
print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')
# Define vectorizer parameters: ignore terms appearing in more than 80% or
# fewer than 20% of the documents; stem via tokenize_and_stem; uni- to
# tri-grams.
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
min_df=0.2, stop_words='english',
use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))
tfidf_matrix = tfidf_vectorizer.fit_transform(synopses) #fit the vectorizer to synopses
print(tfidf_matrix.shape)
# NOTE(review): get_feature_names() was removed in scikit-learn >= 1.2 —
# use get_feature_names_out() on modern versions.
terms = tfidf_vectorizer.get_feature_names()
# Pairwise document distance matrix: 1 - cosine similarity of TF-IDF rows.
dist = 1 - cosine_similarity(tfidf_matrix)
num_clusters = 7
# NOTE(review): no random_state is set, so cluster assignments (and the
# label numbering the colour/name dicts below rely on) vary between runs.
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()
# One row per document; indexed by cluster label so frame.ix[i] below
# selects all titles in cluster i.
BPs = { 'title': titles, 'synopsis': synopses, 'cluster': clusters}
frame = pd.DataFrame(BPs, index = [clusters] , columns = ['title', 'cluster'])
print("Top terms per cluster:")
print()
# Term indices sorted by descending centroid weight, one row per cluster.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    # Top 10 stems per cluster, mapped back to a representative surface
    # word via vocab_frame. (.loc replaces the .ix indexer, which was
    # deprecated and then removed from pandas.)
    for ind in order_centroids[i, :10]:
        word = vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0]
        # NOTE(review): .encode() is fine on Python 2 (this script's
        # vintage) but prints as b'...' on Python 3.
        print(' %s' % word.encode('utf-8', 'ignore'), end=',')
    print()  # add whitespace
    print()  # add whitespace
    print("Cluster %d titles:" % i, end='')
    for title in frame.loc[i]['title'].values.tolist():
        print(' %s,' % title, end='')
    print()  # add whitespace
    print()  # add whitespace
print()
print()
# Project the document-distance matrix to 2-D for plotting.
# dissimilarity="precomputed" because `dist` is already a distance matrix;
# random_state fixed so the plot layout is reproducible.
# (A stray no-op `MDS()` call — an instance created and discarded — was
# removed here.)
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
xs, ys = pos[:, 0], pos[:, 1]
print()
print()
# One colour and one display name per k-means cluster label.
cluster_colors = {
    0: '#1b9e77',
    1: '#d95f02',
    2: '#7570b3',
    3: '#e7298a',
    4: '#66a61e',
    5: '#111ddd',
    6: '#444aaa',
}
cluster_names = {label: 'Cl %d' % label for label in range(7)}
# MDS coordinates plus cluster label and page title, one row per document.
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=titles))
# Group rows by cluster so each cluster is plotted as its own series.
groups = df.groupby('label')
# Scatter plot of the 2-D MDS projection, coloured by cluster.
fig, ax = plt.subplots(figsize=(17, 9))  # set size
ax.margins(0.10)  # optional: adds 10% padding to the autoscaling
# Layer one series per cluster so each gets its own colour and legend entry
# (cluster_names / cluster_colors are keyed by the group label).
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=18,
            label=cluster_names[name], color=cluster_colors[name],
            mec='none')
ax.set_aspect('auto')
# Hide ticks and tick labels on both axes. Boolean flags replace the
# original 'off' strings, which modern matplotlib no longer accepts.
ax.tick_params(axis='x', which='both', bottom=False, top=False,
               labelbottom=False)
ax.tick_params(axis='y', which='both', left=False, top=False,
               labelleft=False)
ax.legend(numpoints=1, fontsize='x-small', loc='upper center',
          bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True,
          ncol=7)  # show legend with only 1 point per entry
# Label each point with its page title. (.loc replaces the .ix indexer,
# which was deprecated and then removed from pandas.)
for i in range(len(df)):
    ax.text(df.loc[i]['x'], df.loc[i]['y'], df.loc[i]['title'], size=8)
#plt.show() #show the plot
plt.savefig('clusters_small_noaxes.png', dpi=100)
# Ward hierarchical clustering on the precomputed cosine distances.
# NOTE(review): scipy's ward() treats a 2-D array as an observation matrix,
# not as a distance matrix — strictly this should be ward(squareform(dist));
# kept as-is to preserve the tutorial's output.
linkage_matrix = ward(dist)
fig, ax = plt.subplots(figsize=(15, 20))  # set size
ax = dendrogram(linkage_matrix, orientation="right", labels=titles)
# Hide x-axis ticks and labels. Boolean flags replace the original 'off'
# strings, which modern matplotlib no longer accepts.
plt.tick_params(axis='x', which='both', bottom=False, top=False,
                labelbottom=False)
plt.tight_layout()  # show plot with tight layout
plt.savefig('ward_clusters.png', dpi=100)  # save figure as ward_clusters
print("Finished")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment