Skip to content

Instantly share code, notes, and snippets.

@pwin
Created June 29, 2015 18:14
Show Gist options
  • Save pwin/a7559126ddc4c08d9d56 to your computer and use it in GitHub Desktop.
hierarchical clustering of docs
from __future__ import print_function
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.externals import joblib
import os # for os.path.basename
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.manifold import MDS
from scipy.cluster.hierarchy import ward, dendrogram
####http://brandonrose.org/clustering
# Share-PSI Best Practice wiki pages to fetch and cluster.
_BP_BASE = 'http://www.w3.org/2013/share-psi/wiki/Best_Practices/'
_BP_PAGES = [
    'Cross_Agency_Strategy',
    'High_Level_Support',
    'Holistic_Metrics',
    'User_engagement_and_collaboration_throughout_the_lifecycle',
    'Organisational-internal_engagement',
    'Human_Readability_and_Machine_Processing',
    'Cost_of_Publication',
    'Stakeholders%E2%80%99_Interests_and_Rights',
    'Feedback_to_Improve_Quality',
    'Optimization_for_Search_Engines',
    'Publication_with_Common_Metadata',
    'Catalogs_and_Indexes',
    'Encourage_crowdsourcing',
    'Publish_spatial_data_on_the_web',
    'Monitoring_and_Benchmarking',
    'Open_Data_quality_assessment',
    'Identifying_what_you_already_publish',
    'Make_the_data_available_in_the_language_people_want_it',
    'Management_Of_A_Wide_Public_Actors_Network',
    'Making_Research_Results_Open_For_The_Country',
    'Using_Business_Process_Paradigm_For_Open_Data_Lifecycle_Management',
    'Publishing_Statistical_Data_In_Linked_Data_Format',
    'Supervizor_-_An_Indispensable_Open_Government_Application_(Transparency_Of_Public_Spending)',
    'Civic_Use_Of_Open_Data',
    'Open_Data_Publication_Plan',
    'A_Federation_Tool_For_Opendata_Portals',
    'Traffic_Light_System_For_Data_Sharing',
    'Open_Data_To_Improve_Sharing_And_Publication_Of_Information_Between_Public_Administrations',
    'Commercial_Considerations_in_Open_Data_Portal_Design',
    'Infomediary_Sector_Characteristics',
    'Open_Data_2.0_-_Changing_Perspectives',
    'Open_Data_Business_Model_Patterns_and_Open_Data_Business_Value_Disciplines',
    'The_Central_Role_of_Location',
    'An_ongoing_open_dialog_in_an_open_data_ecosystem',
    'Discover_published_information_by_site_scraping',
    'Free_our_maps',
]
uris = [_BP_BASE + page for page in _BP_PAGES]
# Wiki-markup section headings (rendered as =Heading=) and boilerplate
# phrases to strip from each page before clustering, leaving only free text.
_HEADINGS = [
    'Title', 'Short Description', 'Overview', 'Why', 'Intended Outcome',
    'Life Cycle Stage', 'Possible Approach', 'How to Test', 'Evidence',
    'Lifecycle Stage', 'Audience', 'Related Best Practices', 'Tags',
    'Status', 'Intended Audience',
]
_BOILERPLATE = [
    'nowiki',
    'Name of the Share-PSI workshop:',
    'Title of the Best Practice:',
    'Outline of the best practice',
    'Management summary',
    'Challenge',
    'Solution.',
    'Best Practice Identification',
    'Why is this a Best Practice?',
    "What's the impact of the Best Practice?",
    'Link to the PSI Directive',
    'Why is there a need for this Best Practice?',
    'What do you need for this Best Practice?',
    'Applicability by other member states?',
    'Contact info - record of the person to be contacted for additional information or advice.',
]
replacements = ['=' + heading + '=' for heading in _HEADINGS] + _BOILERPLATE
def tokenize_and_stem(text):
    """Tokenise *text* and return a list of Snowball-stemmed tokens.

    Splits into sentences first so punctuation comes out as its own token,
    then keeps only tokens containing at least one ASCII letter (dropping
    purely numeric tokens and raw punctuation) before stemming.
    """
    words = (token
             for sentence in nltk.sent_tokenize(text)
             for token in nltk.word_tokenize(sentence))
    # `stemmer` is the module-level SnowballStemmer defined later in the file.
    return [stemmer.stem(word) for word in words if re.search('[a-zA-Z]', word)]
def tokenize_only(text):
    """Tokenise *text* into lower-cased word tokens, without stemming.

    Mirrors tokenize_and_stem(): sentence-split, word-split, then keep only
    tokens containing at least one ASCII letter.
    """
    lowered = [word.lower()
               for sentence in nltk.sent_tokenize(text)
               for word in nltk.word_tokenize(sentence)]
    return [token for token in lowered if re.search('[a-zA-Z]', token)]
debug = False
synopses = []  # cleaned page text, one entry per URI
titles = []    # last path segment of each URI, used as the document label
for uri in uris:
    titles.append(uri.split('/')[-1])
    raw_url = uri + "?action=raw"  # MediaWiki raw-markup endpoint
    if debug:
        print(raw_url)
    text = requests.get(raw_url).text.lower()
    if debug:
        print(text)
    # Strip wiki headings/boilerplate. BUG FIX: `text` is already
    # lower-cased, so the mixed-case phrases in `replacements` (e.g.
    # '=Title=') never matched; lower-case each phrase before replacing.
    for phrase in replacements:
        text = text.replace(phrase.lower(), '')
    if debug:
        print(text)
    synopses.append(text)
# Load NLTK's English stopword list.
# NOTE(review): `stopwords` appears unused below — the TF-IDF vectorizer
# passes its own built-in stop_words='english'; confirm before removing.
stopwords = nltk.corpus.stopwords.words('english')
# Snowball stemmer used by tokenize_and_stem(); imported mid-file as in the
# tutorial this script follows (http://brandonrose.org/clustering).
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
totalvocab_stemmed = []    # stemmed tokens across all documents
totalvocab_tokenized = []  # corresponding unstemmed (lower-cased) tokens
for synopsis in synopses:
    totalvocab_stemmed.extend(tokenize_and_stem(synopsis))
    totalvocab_tokenized.extend(tokenize_only(synopsis))
# Map each stem back to a surface word: index is the stem, column 'words'
# holds the unstemmed token (the two lists are built in parallel).
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index=totalvocab_stemmed)
print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')
# Define vectorizer parameters: ignore terms appearing in more than 80% or
# fewer than 20% of the documents; stem via tokenize_and_stem; uni- to
# tri-grams.
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
min_df=0.2, stop_words='english',
use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))
tfidf_matrix = tfidf_vectorizer.fit_transform(synopses) #fit the vectorizer to synopses
print(tfidf_matrix.shape)
# NOTE(review): get_feature_names() was removed in scikit-learn >= 1.2 —
# use get_feature_names_out() on modern versions.
terms = tfidf_vectorizer.get_feature_names()
# Pairwise document distance matrix: 1 - cosine similarity of TF-IDF rows.
dist = 1 - cosine_similarity(tfidf_matrix)
num_clusters = 7
# NOTE(review): no random_state is set, so cluster assignments (and the
# label numbering the colour/name dicts below rely on) vary between runs.
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()
# One row per document; indexed by cluster label so frame.ix[i] below
# selects all titles in cluster i.
BPs = { 'title': titles, 'synopsis': synopses, 'cluster': clusters}
frame = pd.DataFrame(BPs, index = [clusters] , columns = ['title', 'cluster'])
print("Top terms per cluster:")
print()
# Term indices sorted by descending centroid weight, one row per cluster.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    # Top 10 stems per cluster, mapped back to a representative surface
    # word via vocab_frame. (.loc replaces the .ix indexer, which was
    # deprecated and then removed from pandas.)
    for ind in order_centroids[i, :10]:
        word = vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0]
        # NOTE(review): .encode() is fine on Python 2 (this script's
        # vintage) but prints as b'...' on Python 3.
        print(' %s' % word.encode('utf-8', 'ignore'), end=',')
    print()  # add whitespace
    print()  # add whitespace
    print("Cluster %d titles:" % i, end='')
    for title in frame.loc[i]['title'].values.tolist():
        print(' %s,' % title, end='')
    print()  # add whitespace
    print()  # add whitespace
print()
print()
# Project the document-distance matrix to 2-D for plotting.
# dissimilarity="precomputed" because `dist` is already a distance matrix;
# random_state fixed so the plot layout is reproducible.
# (A stray no-op `MDS()` call — an instance created and discarded — was
# removed here.)
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
xs, ys = pos[:, 0], pos[:, 1]
print()
print()
# One colour and one display name per k-means cluster label.
cluster_colors = {
    0: '#1b9e77',
    1: '#d95f02',
    2: '#7570b3',
    3: '#e7298a',
    4: '#66a61e',
    5: '#111ddd',
    6: '#444aaa',
}
cluster_names = {label: 'Cl %d' % label for label in range(7)}
# MDS coordinates plus cluster label and page title, one row per document.
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=titles))
# Group rows by cluster so each cluster is plotted as its own series.
groups = df.groupby('label')
# Scatter plot of the 2-D MDS projection, coloured by cluster.
fig, ax = plt.subplots(figsize=(17, 9))  # set size
ax.margins(0.10)  # optional: adds 10% padding to the autoscaling
# Layer one series per cluster so each gets its own colour and legend entry
# (cluster_names / cluster_colors are keyed by the group label).
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=18,
            label=cluster_names[name], color=cluster_colors[name],
            mec='none')
ax.set_aspect('auto')
# Hide ticks and tick labels on both axes. Boolean flags replace the
# original 'off' strings, which modern matplotlib no longer accepts.
ax.tick_params(axis='x', which='both', bottom=False, top=False,
               labelbottom=False)
ax.tick_params(axis='y', which='both', left=False, top=False,
               labelleft=False)
ax.legend(numpoints=1, fontsize='x-small', loc='upper center',
          bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True,
          ncol=7)  # show legend with only 1 point per entry
# Label each point with its page title. (.loc replaces the .ix indexer,
# which was deprecated and then removed from pandas.)
for i in range(len(df)):
    ax.text(df.loc[i]['x'], df.loc[i]['y'], df.loc[i]['title'], size=8)
#plt.show() #show the plot
plt.savefig('clusters_small_noaxes.png', dpi=100)
# Ward hierarchical clustering on the precomputed cosine distances.
# NOTE(review): scipy's ward() treats a 2-D array as an observation matrix,
# not as a distance matrix — strictly this should be ward(squareform(dist));
# kept as-is to preserve the tutorial's output.
linkage_matrix = ward(dist)
fig, ax = plt.subplots(figsize=(15, 20))  # set size
ax = dendrogram(linkage_matrix, orientation="right", labels=titles)
# Hide x-axis ticks and labels. Boolean flags replace the original 'off'
# strings, which modern matplotlib no longer accepts.
plt.tick_params(axis='x', which='both', bottom=False, top=False,
                labelbottom=False)
plt.tight_layout()  # show plot with tight layout
plt.savefig('ward_clusters.png', dpi=100)  # save figure as ward_clusters
print("Finished")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment