Skip to content

Instantly share code, notes, and snippets.

@endrebak
Created August 15, 2019 07:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save endrebak/69a194d089fa173009a8674434671cc4 to your computer and use it in GitHub Desktop.
Save endrebak/69a194d089fa173009a8674434671cc4 to your computer and use it in GitHub Desktop.
# Works on very large datasets.
import pandas as pd
# Best-effort: pin MKL to a single thread when the optional ``mkl`` module
# is importable; otherwise keep the library's default threading.
try:
    import mkl
except ImportError:
    # ``mkl`` is optional. Only a failed import is tolerated here, so that
    # KeyboardInterrupt/SystemExit and genuine errors are no longer swallowed.
    pass
else:
    mkl.set_num_threads(1)
# Tab-separated input matrix; the first seven columns become the row index
# and every row is read (nrows=None).
f = "/mnt/work/endrebak/epigenome_roadmap_analyses/H3K27me3/data/hg38/matrix/tfv_20000.txt"
df = pd.read_csv(
    f,
    sep="\t",
    index_col=list(range(7)),
    nrows=None,
)
def column_order(df, metric="euclidean", method="single"):
    """Return the columns of ``df`` in hierarchical-clustering leaf order.

    Each column is treated as one observation: pairwise distances between
    columns are computed with ``scipy.spatial.distance.pdist`` and clustered
    with ``scipy.cluster.hierarchy.linkage``; the dendrogram leaf order is
    then used to reorder ``df.columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Matrix whose columns are to be ordered (assumed numeric; the values
        are handed to ``pdist`` as-is).
    metric : str or callable
        Distance metric forwarded to ``pdist``.
    method : str
        Linkage method forwarded to ``linkage``.

    Returns
    -------
    pandas.Index
        ``df.columns`` rearranged into leaf order. When ``df`` has fewer than
        two columns the columns are returned unchanged.
    """
    from scipy.cluster.hierarchy import leaves_list, linkage
    from scipy.spatial.distance import pdist

    # Fewer than two observations give an empty condensed distance matrix,
    # which linkage() rejects with ValueError; there is nothing to reorder.
    if len(df.columns) < 2:
        return df.columns

    condensed_distances = pdist(df.T, metric=metric)
    # The metric is already baked into the condensed distances; linkage()
    # ignores its own ``metric`` argument for condensed input, so it is not
    # passed again.
    linkage_matrix = linkage(condensed_distances, method=method)
    return df.columns[leaves_list(linkage_matrix)]
def row_labels(df, k):
    """Cluster the rows of ``df`` with mini-batch k-means.

    Fits ``sklearn.cluster.MiniBatchKMeans`` with ``k`` clusters on ``df`` and
    returns the fitted model's ``labels_`` attribute. No random seed is set,
    so results may vary between runs.
    """
    from sklearn.cluster import MiniBatchKMeans

    model = MiniBatchKMeans(k)
    # fit() returns the estimator itself, so the labels can be read directly.
    return model.fit(df).labels_
def labeled_matrix(df, k, metric="euclidean", method="single"):
    """Reorder the columns of ``df`` and tag each row with a cluster label.

    Columns are arranged in the order given by ``column_order(df, metric,
    method)``; row labels come from ``row_labels(df, k)``, computed on the
    original (not reordered) frame. The labels are appended to the existing
    row index as an extra level named ``"Label"``.

    Returns a new DataFrame; ``df[columns]`` selects into a separate frame
    before the label column is inserted.
    """
    ordered_columns = column_order(df, metric, method)
    cluster_labels = row_labels(df, k)

    reordered = df[ordered_columns]
    # Insert as a regular column first so set_index can move it into the index.
    reordered.insert(0, "Label", cluster_labels)
    return reordered.set_index("Label", append=True)
# Build the labeled matrix: 5 row clusters, columns ordered by single-linkage
# clustering on Euclidean distance.
df2 = labeled_matrix(
    df,
    k=5,
    metric="euclidean",
    method="single",
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment