tommct/README.md

## README.md

      
    Raw
  

              README.md
            
          
    This is a recipe for using scikit-learn to build a cosine similarity matrix and then using SciPy to build dendrograms from it.
import numpy as np
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy
import scipy.spatial.distance
from scipy.spatial.distance import pdist
from sklearn.metrics.pairwise import cosine_similarity

# Build a small "feature matrix": every integer from 1 to 15 is described by
# the four bits of its zero-padded binary form (0001, 0010, ..., 1110, 1111).
# L holds the bit strings and M holds the same bits as lists of ints, one row
# per integer.
rng = range(1, 16)
L = ['{:04b}'.format(value) for value in rng]
M = [[int(bit) for bit in label] for label in L]
    
# Pairwise cosine similarity between all rows of the feature matrix, with any
# NaN entries replaced by zero.
similarity = np.nan_to_num(cosine_similarity(M, M))
# Flip similarity into a distance so that 0 means close and 1 means far.
c = 1.0 - similarity
# Force an exactly-zero diagonal and keep every entry inside [0, 1]
# (presumably a guard against floating-point round-off).
np.fill_diagonal(c, 0)
c = np.clip(c, 0, 1)

# Convert the square cosine-distance matrix into the condensed (upper-triangle)
# vector form. linkage() interprets a 1-D input as precomputed pairwise
# distances, whereas a 2-D input is treated as raw observation vectors and is
# re-measured with Euclidean distance. checks=False skips the symmetry and
# zero-diagonal validation, which tiny floating-point asymmetries in the
# similarity matrix could otherwise trip; only the upper triangle is read.
condensed_dist = scipy.spatial.distance.squareform(c, checks=False)

# Save one dendrogram per linkage method as '<method>.pdf'.
for method in ['single', 'complete', 'average', 'weighted']:
    Z = scipy.cluster.hierarchy.linkage(condensed_dist, method=method)
    fig = plt.figure()
    ax = fig.add_axes([.1, .1, .8, .8])
    scipy.cluster.hierarchy.dendrogram(Z, labels=L, leaf_font_size=7, ax=ax)
    fig.savefig('{}.pdf'.format(method))
    # Close the figure so open figures do not accumulate across iterations.
    plt.close(fig)