Skip to content

Instantly share code, notes, and snippets.

@mlovci
Last active October 19, 2020 08:52
Show Gist options
  • Save mlovci/dc23a1d30076746cf0d8 to your computer and use it in GitHub Desktop.
Save mlovci/dc23a1d30076746cf0d8 to your computer and use it in GitHub Desktop.
Classy Optimal Leaf Ordering - with code taken from seaborn and markak. Does not plot anything by default, but provides `obj.ordered_data` that can be sent to seaborn.heatmap
import pandas as pd
import numpy as np
from scipy.spatial.distance import squareform, pdist
from scipy.cluster import hierarchy
from scipy.spatial import distance
from scipy.cluster.hierarchy import linkage
import itertools
import matplotlib.pyplot as plt
import seaborn
class Dendrogram(object):
"""Lifted from seaborn.matrix"""
def __init__(self, data, metric='euclidean', method='single'):
"""Get dendrogram of the relationships between the columns of data
Parameters
----------
data : pandas.DataFrame
Rectangular data
"""
if isinstance(data, pd.DataFrame):
array = data.values
else:
array = np.asarray(data)
data = pd.DataFrame(array)
self.array = array
self.data = data
self.shape = self.data.shape
self.metric = metric
self.method = method
self.linkage = self.calculated_linkage
self.dendrogram = self.calculate_dendrogram()
self.dependent_coord = self.dendrogram['dcoord']
self.independent_coord = self.dendrogram['icoord']
def _calculate_linkage_scipy(self):
if np.product(self.shape) >= 10000:
UserWarning('This will be slow... (gentle suggestion: '
'"pip install fastcluster")')
pairwise_dists = distance.pdist(self.array, metric=self.metric)
linkage = hierarchy.linkage(pairwise_dists, method=self.method)
del pairwise_dists
return linkage
def _calculate_linkage_fastcluster(self):
import fastcluster
# Fastcluster has a memory-saving vectorized version, but only
# with certain linkage methods, and mostly with euclidean metric
vector_methods = ('single', 'centroid', 'median', 'ward')
euclidean_methods = ('centroid', 'median', 'ward')
euclidean = self.metric == 'euclidean' and self.method in \
euclidean_methods
if euclidean or self.method == 'single':
return fastcluster.linkage_vector(self.array,
method=self.method,
metric=self.metric)
else:
pairwise_dists = distance.pdist(self.array, metric=self.metric)
linkage = fastcluster.linkage(pairwise_dists, method=self.method)
del pairwise_dists
return linkage
@property
def calculated_linkage(self):
try:
return self._calculate_linkage_fastcluster()
except ImportError:
return self._calculate_linkage_scipy()
def calculate_dendrogram(self):
"""Calculates a dendrogram based on the linkage matrix
Made a separate function, not a property because don't want to
recalculate the dendrogram every time it is accessed.
Returns
-------
dendrogram : dict
Dendrogram dictionary as returned by scipy.cluster.hierarchy
.dendrogram. The important key-value pairing is
"reordered_ind" which indicates the re-ordering of the matrix
"""
return hierarchy.dendrogram(self.linkage, no_plot=True,
color_threshold=-np.inf)
def leaves(t, t2=None):
""" Returns the leaves of a ClusterNode """
try:
return t.pre_order()
except AttributeError:
if t2 is not None:
return t2.pre_order()
else:
return []
def other(x, V, W):
# For an element x, returns the set that x isn't in
if x in V:
return W
else:
return V
class OLO(object):
"""
Lifted (mostly) from github user markak:
http://nbviewer.ipython.org/url/stanford.edu/~markak/2015-09-02%20-%20Optimal%20leaf%20ordering%20for%20hierarchical%20clustering.ipynb#
data - pandas dataframe
metric - metric for clustering
"""
def __init__(self, data, metric="euclidean", method='single'):
self.data = data
self.metric=metric
self.method=method
def compute_dendrogram(self, axis=0):
if axis == 1:
data = self.data.T
else:
data = self.data
d = Dendrogram(data, metric=self.metric, method=self.method)
self.M = {}
tree = hierarchy.to_tree(d.linkage)
dists = squareform(pdist(d.array,
metric=self.metric))
tree = self.order_tree(tree, dists)
order = leaves(tree)
del self.M
return d, order
@property
def row_dendrogram(self):
if not hasattr(self, '_row_dendrogram'):
self._row_dendrogram, self._row_order = self.compute_dendrogram(axis=0)
return self._row_dendrogram, self._row_order
@property
def col_dendrogram(self):
if not hasattr(self, '_col_dendrogram'):
self._col_dendrogram, self._col_order = self.compute_dendrogram(axis=1)
return self._col_dendrogram, self._col_order
@property
def col_order(self):
return self.col_dendrogram[1]
@property
def row_order(self):
return self.row_dendrogram[1]
@property
def ordered_data(self):
return self.data.iloc[self.row_order, self.col_order]
def optimal_scores(self, v, D, fast=True):
""" Implementation of Ziv-Bar-Joseph et al.'s leaf order algorithm
v is a ClusterNode
D is a distance matrix """
def score_func(left, right, u, m, w, k):
return Mfunc(left, u, m) + Mfunc(right, w, k) + D[m, k]
def Mfunc(v, a, b):
if a == b:
self.M[v, a, b] = 0
return self.M[v, a, b]
if v.is_leaf():
n = v.get_id()
self.M[v, n, n] = 0
return 0
else:
L = leaves(v.left)
R = leaves(v.right)
LL = leaves(v.left.left, v.left)
LR = leaves(v.left.right, v.left)
RL = leaves(v.right.left, v.right)
RR = leaves(v.right.right, v.right)
for l in L:
for r in R:
self.M[v.left, l, r] = self.optimal_scores(v.left, D, fast=False)
self.M[v.right, l, r] = self.optimal_scores(v.right, D, fast=False)
for u in L:
for w in R:
if fast:
m_order = sorted(other(u, LL, LR), key=lambda m: Mfunc(v.left, u, m))
k_order = sorted(other(w, RL, RR), key=lambda k: Mfunc(v.right, w, k))
C = min([D[m, k] for m in other(u, LL, LR) for k in other(w, RL, RR)])
Cmin = 1e10
for m in m_order:
if self.M[v.left, u, m] + self.M[v.right, w, k_order[0]] + C >= Cmin:
break
for k in k_order:
if self.M[v.left, u, m] + self.M[v.right, w, k] + C >= Cmin:
break
C = score_func(v.left, v.right, u, m, w, k)
if C < Cmin:
Cmin = C
self.M[v, u, w] = self.M[v, w, u] = Cmin
else:
self.M[v, u, w] = self.M[v, w, u] = \
min([score_func(v.left, v.right, u, m, w, k) \
for m in other(u, LL, LR) \
for k in other(w, RL, RR)])
return self.M[v, l, r]
def order_tree(self, v, D, fM=None, fast=True):
""" Returns an optimally ordered tree """
# Generate scores the first pass
if fM is None:
fM = 1
self.optimal_scores(v, D, fast=fast)
L = leaves(v.left)
R = leaves(v.right)
if len(L) and len(R):
def getkey(z): ##added, this is to replace a lambda function
u,w = z
return self.M[v,u,w]
if len(L) and len(R):
u, w = min(itertools.product(L,R), key=getkey) ##updated function
if w in leaves(v.right.left):
v.right.right, v.right.left = v.right.left, v.right.right
if u in leaves(v.left.right):
v.left.left, v.left.right = v.left.right, v.left.left
v.left = self.order_tree(v.left, D, fM)
v.right = self.order_tree(v.right, D, fM)
return v
if __name__ == "__main__":
# Create simulated data as in the paper
X = np.zeros((100, 100))
for n in [0,10,20,30,40,50,60]:
X[(10.*n/7):(10.*(n+10)/7):,n:n+40] = 1
for x,y in zip(np.random.choice(np.arange(100), 500), np.random.choice(np.arange(100), 500)):
X[x,y] = -1
seaborn.heatmap(X)
olo_data = OLO(pd.DataFrame(X), metric='hamming', method='complete')
plt.figure()
seaborn.heatmap(olo_data.ordered_data)
plt.show()
@mlovci
Copy link
Author

mlovci commented Dec 17, 2015

@markak - I put your code in a class. Take a look. Thanks for writing it up to start.
@olgabot & @mwaskom - I picked out parts of seaborn.matrix for the Dendrogram calculations

Thanks for making this easy!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment