Last active
October 19, 2020 08:52
-
-
Save mlovci/dc23a1d30076746cf0d8 to your computer and use it in GitHub Desktop.
Classy Optimal Leaf Ordering - with code taken from seaborn and markak. Does not plot anything by default, but provides `obj.ordered_data` that can be sent to seaborn.heatmap
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
from scipy.spatial.distance import squareform, pdist | |
from scipy.cluster import hierarchy | |
from scipy.spatial import distance | |
from scipy.cluster.hierarchy import linkage | |
import itertools | |
import matplotlib.pyplot as plt | |
import seaborn | |
class Dendrogram(object): | |
"""Lifted from seaborn.matrix""" | |
def __init__(self, data, metric='euclidean', method='single'): | |
"""Get dendrogram of the relationships between the columns of data | |
Parameters | |
---------- | |
data : pandas.DataFrame | |
Rectangular data | |
""" | |
if isinstance(data, pd.DataFrame): | |
array = data.values | |
else: | |
array = np.asarray(data) | |
data = pd.DataFrame(array) | |
self.array = array | |
self.data = data | |
self.shape = self.data.shape | |
self.metric = metric | |
self.method = method | |
self.linkage = self.calculated_linkage | |
self.dendrogram = self.calculate_dendrogram() | |
self.dependent_coord = self.dendrogram['dcoord'] | |
self.independent_coord = self.dendrogram['icoord'] | |
def _calculate_linkage_scipy(self): | |
if np.product(self.shape) >= 10000: | |
UserWarning('This will be slow... (gentle suggestion: ' | |
'"pip install fastcluster")') | |
pairwise_dists = distance.pdist(self.array, metric=self.metric) | |
linkage = hierarchy.linkage(pairwise_dists, method=self.method) | |
del pairwise_dists | |
return linkage | |
def _calculate_linkage_fastcluster(self): | |
import fastcluster | |
# Fastcluster has a memory-saving vectorized version, but only | |
# with certain linkage methods, and mostly with euclidean metric | |
vector_methods = ('single', 'centroid', 'median', 'ward') | |
euclidean_methods = ('centroid', 'median', 'ward') | |
euclidean = self.metric == 'euclidean' and self.method in \ | |
euclidean_methods | |
if euclidean or self.method == 'single': | |
return fastcluster.linkage_vector(self.array, | |
method=self.method, | |
metric=self.metric) | |
else: | |
pairwise_dists = distance.pdist(self.array, metric=self.metric) | |
linkage = fastcluster.linkage(pairwise_dists, method=self.method) | |
del pairwise_dists | |
return linkage | |
@property | |
def calculated_linkage(self): | |
try: | |
return self._calculate_linkage_fastcluster() | |
except ImportError: | |
return self._calculate_linkage_scipy() | |
def calculate_dendrogram(self): | |
"""Calculates a dendrogram based on the linkage matrix | |
Made a separate function, not a property because don't want to | |
recalculate the dendrogram every time it is accessed. | |
Returns | |
------- | |
dendrogram : dict | |
Dendrogram dictionary as returned by scipy.cluster.hierarchy | |
.dendrogram. The important key-value pairing is | |
"reordered_ind" which indicates the re-ordering of the matrix | |
""" | |
return hierarchy.dendrogram(self.linkage, no_plot=True, | |
color_threshold=-np.inf) | |
def leaves(t, t2=None): | |
""" Returns the leaves of a ClusterNode """ | |
try: | |
return t.pre_order() | |
except AttributeError: | |
if t2 is not None: | |
return t2.pre_order() | |
else: | |
return [] | |
def other(x, V, W): | |
# For an element x, returns the set that x isn't in | |
if x in V: | |
return W | |
else: | |
return V | |
class OLO(object): | |
""" | |
Lifted (mostly) from github user markak: | |
http://nbviewer.ipython.org/url/stanford.edu/~markak/2015-09-02%20-%20Optimal%20leaf%20ordering%20for%20hierarchical%20clustering.ipynb# | |
data - pandas dataframe | |
metric - metric for clustering | |
""" | |
def __init__(self, data, metric="euclidean", method='single'): | |
self.data = data | |
self.metric=metric | |
self.method=method | |
def compute_dendrogram(self, axis=0): | |
if axis == 1: | |
data = self.data.T | |
else: | |
data = self.data | |
d = Dendrogram(data, metric=self.metric, method=self.method) | |
self.M = {} | |
tree = hierarchy.to_tree(d.linkage) | |
dists = squareform(pdist(d.array, | |
metric=self.metric)) | |
tree = self.order_tree(tree, dists) | |
order = leaves(tree) | |
del self.M | |
return d, order | |
@property | |
def row_dendrogram(self): | |
if not hasattr(self, '_row_dendrogram'): | |
self._row_dendrogram, self._row_order = self.compute_dendrogram(axis=0) | |
return self._row_dendrogram, self._row_order | |
@property | |
def col_dendrogram(self): | |
if not hasattr(self, '_col_dendrogram'): | |
self._col_dendrogram, self._col_order = self.compute_dendrogram(axis=1) | |
return self._col_dendrogram, self._col_order | |
@property | |
def col_order(self): | |
return self.col_dendrogram[1] | |
@property | |
def row_order(self): | |
return self.row_dendrogram[1] | |
@property | |
def ordered_data(self): | |
return self.data.iloc[self.row_order, self.col_order] | |
def optimal_scores(self, v, D, fast=True): | |
""" Implementation of Ziv-Bar-Joseph et al.'s leaf order algorithm | |
v is a ClusterNode | |
D is a distance matrix """ | |
def score_func(left, right, u, m, w, k): | |
return Mfunc(left, u, m) + Mfunc(right, w, k) + D[m, k] | |
def Mfunc(v, a, b): | |
if a == b: | |
self.M[v, a, b] = 0 | |
return self.M[v, a, b] | |
if v.is_leaf(): | |
n = v.get_id() | |
self.M[v, n, n] = 0 | |
return 0 | |
else: | |
L = leaves(v.left) | |
R = leaves(v.right) | |
LL = leaves(v.left.left, v.left) | |
LR = leaves(v.left.right, v.left) | |
RL = leaves(v.right.left, v.right) | |
RR = leaves(v.right.right, v.right) | |
for l in L: | |
for r in R: | |
self.M[v.left, l, r] = self.optimal_scores(v.left, D, fast=False) | |
self.M[v.right, l, r] = self.optimal_scores(v.right, D, fast=False) | |
for u in L: | |
for w in R: | |
if fast: | |
m_order = sorted(other(u, LL, LR), key=lambda m: Mfunc(v.left, u, m)) | |
k_order = sorted(other(w, RL, RR), key=lambda k: Mfunc(v.right, w, k)) | |
C = min([D[m, k] for m in other(u, LL, LR) for k in other(w, RL, RR)]) | |
Cmin = 1e10 | |
for m in m_order: | |
if self.M[v.left, u, m] + self.M[v.right, w, k_order[0]] + C >= Cmin: | |
break | |
for k in k_order: | |
if self.M[v.left, u, m] + self.M[v.right, w, k] + C >= Cmin: | |
break | |
C = score_func(v.left, v.right, u, m, w, k) | |
if C < Cmin: | |
Cmin = C | |
self.M[v, u, w] = self.M[v, w, u] = Cmin | |
else: | |
self.M[v, u, w] = self.M[v, w, u] = \ | |
min([score_func(v.left, v.right, u, m, w, k) \ | |
for m in other(u, LL, LR) \ | |
for k in other(w, RL, RR)]) | |
return self.M[v, l, r] | |
def order_tree(self, v, D, fM=None, fast=True): | |
""" Returns an optimally ordered tree """ | |
# Generate scores the first pass | |
if fM is None: | |
fM = 1 | |
self.optimal_scores(v, D, fast=fast) | |
L = leaves(v.left) | |
R = leaves(v.right) | |
if len(L) and len(R): | |
def getkey(z): ##added, this is to replace a lambda function | |
u,w = z | |
return self.M[v,u,w] | |
if len(L) and len(R): | |
u, w = min(itertools.product(L,R), key=getkey) ##updated function | |
if w in leaves(v.right.left): | |
v.right.right, v.right.left = v.right.left, v.right.right | |
if u in leaves(v.left.right): | |
v.left.left, v.left.right = v.left.right, v.left.left | |
v.left = self.order_tree(v.left, D, fM) | |
v.right = self.order_tree(v.right, D, fM) | |
return v | |
if __name__ == "__main__": | |
# Create simulated data as in the paper | |
X = np.zeros((100, 100)) | |
for n in [0,10,20,30,40,50,60]: | |
X[(10.*n/7):(10.*(n+10)/7):,n:n+40] = 1 | |
for x,y in zip(np.random.choice(np.arange(100), 500), np.random.choice(np.arange(100), 500)): | |
X[x,y] = -1 | |
seaborn.heatmap(X) | |
olo_data = OLO(pd.DataFrame(X), metric='hamming', method='complete') | |
plt.figure() | |
seaborn.heatmap(olo_data.ordered_data) | |
plt.show() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@markak - I put your code in a class. Take a look. Thanks for writing it up to start.
@olgabot & @mwaskom - I picked out parts of seaborn.matrix for the Dendrogram calculations
Thanks for making this easy!