Skip to content

Instantly share code, notes, and snippets.

@dmyersturnbull
Last active November 21, 2016 23:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dmyersturnbull/418a813f19ba5b6e64319436c8c48af6 to your computer and use it in GitHub Desktop.
Save dmyersturnbull/418a813f19ba5b6e64319436c8c48af6 to your computer and use it in GitHub Desktop.
Calculate tSNE or MDS+PCA and plot the results in Seaborn in a way that doesn't look terrible.
# Douglas Myers-Turnbull wrote this for the Kokel Lab, which has released it under the Apache Software License, Version 2.0
# See the license file here: https://gist.github.com/dmyersturnbull/bfa1c3371e7449db553aaa1e7cd3cac1
# The list of copyright owners is unknown
import pandas as pd
import seaborn as sns
from scipy.spatial.distance import squareform
from sklearn import manifold
from scipy.spatial.distance import pdist
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
from typing import Optional, Union, List, Dict, Callable
def _make_pc_df(data_df: pd.DataFrame, positions: np.ndarray) -> pd.DataFrame:
    """Wrap embedding coordinates in a DataFrame that shares ``data_df``'s index.

    Args:
        data_df: Frame the embedding was computed from. Only its index is used,
            so row *i* of ``positions`` must belong to row *i* of ``data_df``.
        positions: 2-D array with one row per row of ``data_df``. Columns 0 and 1
            are named 'PC1' and 'PC2'; any further columns keep integer labels.

    Returns:
        A DataFrame indexed like ``data_df`` holding only the coordinate columns,
        each divided by its own column mean.
    """
    # Build the result directly on the caller's index instead of merging with the
    # data and dropping columns again: the old `columns | index.names` union is not
    # supported by current pandas and broke on unnamed indexes.
    # np.asarray keeps pandas from re-aligning on labels if a DataFrame is passed.
    pc_df = pd.DataFrame(np.asarray(positions), index=data_df.index)
    pc_df = pc_df.rename(columns={0: 'PC1', 1: 'PC2'})
    # rescale because matplotlib's autoscale fails miserably for small values
    # NOTE(review): a column whose mean is 0 divides by zero here.
    return pc_df / pc_df.mean()
def _default_tsne() -> manifold.TSNE:
    """Build the t-SNE estimator that calc_tsne uses when the caller supplies none."""
    import inspect  # local import so this block does not depend on the file header

    params = dict(n_components=2,
                  perplexity=20.0,
                  early_exaggeration=4.0,
                  learning_rate=300.0,
                  n_iter_without_progress=100,
                  min_grad_norm=1e-07,
                  metric='euclidean',
                  init='random',
                  method='barnes_hut',
                  angle=0.1)
    # scikit-learn renamed TSNE's `n_iter` to `max_iter`; pass the iteration budget
    # under whichever name the installed version accepts.
    accepted = inspect.signature(manifold.TSNE.__init__).parameters
    params['max_iter' if 'max_iter' in accepted else 'n_iter'] = 5000
    return manifold.TSNE(**params)


def calc_tsne(data_df: pd.DataFrame,
              tsne: Optional[manifold.TSNE]=None,
              random_seed: Optional[int]=None) -> pd.DataFrame:
    """Run t-SNE on ``data_df`` and return the embedding as a DataFrame.

    This function doesn't really do anything; it's just a reminder of how to run tSNE.

    Args:
        data_df: DataFrame containing only the data to run through tSNE
            (other data in the index is ok).
        tsne: Estimator to fit. When omitted, a fresh one with this module's
            default settings is created on every call, so no estimator state is
            shared between calls.
        random_seed: Seed for the estimator's random state. ``None`` sets the
            estimator's ``random_state`` to ``None``; note that this also
            overwrites the ``random_state`` of an estimator passed in by the caller.

    Returns:
        DataFrame indexed like ``data_df`` with columns 'PC1' and 'PC2',
        as produced by ``_make_pc_df``.
    """
    if tsne is None:
        tsne = _default_tsne()
    tsne.random_state = None if random_seed is None else np.random.RandomState(seed=random_seed)
    positions = tsne.fit_transform(data_df.values)
    return _make_pc_df(data_df, positions)
def calc_mds_and_pca(S: pd.DataFrame,
                     is_metric: bool=True,
                     distance: Union[str, Callable[[np.ndarray], np.ndarray]]='euclidean',
                     random_seed: Optional[int]=None,
                     max_iter: int=3000,
                     convergence_tolerance: float=1e-9) -> pd.DataFrame:
    """Compute a 2-component MDS embedding of ``S`` and pass it through a 2-component PCA.

    Args:
        S: DataFrame whose values are embedded; its index is carried over to the result.
        is_metric: Forwarded to ``calc_mds_only`` (metric vs. non-metric MDS).
        distance: Forwarded to ``calc_mds_only`` (distance used to build the dissimilarity matrix).
        random_seed: Forwarded to ``calc_mds_only``.
        max_iter: Forwarded to ``calc_mds_only``.
        convergence_tolerance: Forwarded to ``calc_mds_only``.

    Returns:
        DataFrame indexed like ``S`` with columns 'PC1' and 'PC2',
        as produced by ``_make_pc_df``.
    """
    positions = calc_mds_only(S, 2, is_metric=is_metric, distance=distance, random_seed=random_seed,
                              max_iter=max_iter, convergence_tolerance=convergence_tolerance)
    pca = PCA(n_components=2)
    positions = pca.fit_transform(positions)
    # The input frame is named S in this function; the previous reference to
    # `data_df` was an undefined name and raised NameError.
    return _make_pc_df(S, positions)
def calc_mds_only(data_df: pd.DataFrame, n_components: int,
                  distance: Union[str, Callable[[np.ndarray], np.ndarray]]='euclidean',
                  is_metric: bool=True,
                  random_seed: Optional[int]=None,
                  max_iter: int=3000,
                  convergence_tolerance: float=1e-9) -> pd.DataFrame:
    """Embed the rows of ``data_df`` with MDS over a precomputed dissimilarity matrix.

    Args:
        data_df: DataFrame whose values are embedded; its index is carried over to the result.
        n_components: Number of MDS dimensions.
        distance: Metric handed to ``scipy.spatial.distance.pdist``.
        is_metric: Passed to MDS as ``metric``.
        random_seed: Seed for the MDS random state, or ``None`` to leave it unseeded.
        max_iter: Passed to MDS as ``max_iter``.
        convergence_tolerance: Passed to MDS as ``eps``.

    Returns:
        The embedding wrapped by ``_make_pc_df``.
    """
    if random_seed is None:
        rng = None
    else:
        rng = np.random.RandomState(seed=random_seed)
    # MDS itself only accepts dissimilarity='precomputed' or 'euclidean', so the
    # pairwise matrix is built here to allow any pdist metric.
    condensed = pdist(data_df.values, distance)
    pairwise = squareform(condensed)
    # Note: n_jobs>1 hangs indefinitely, so n_jobs is left at its default.
    embedder = manifold.MDS(
        n_components=n_components,
        metric=is_metric,
        max_iter=max_iter,
        eps=convergence_tolerance,
        random_state=rng,
        dissimilarity='precomputed',
    )
    return _make_pc_df(data_df, embedder.fit_transform(pairwise))
def plot_tsne(tsne_df: pd.DataFrame,
              colors: Optional[Dict[str, str]]=None, markers: Optional[List[str]]=None, sizes: Union[float, List[float]]=100, hue_column: str='class',
              size: float=10, aspect: float = 1,
              show_ticks: bool=False, style: str='whitegrid', font_scale: float=1.4, legend_font_scale: float=20) -> sns.FacetGrid:
    """Scatter-plot an embedding with seaborn's ``lmplot`` (regression fit disabled).

    Args:
        tsne_df: DataFrame with 'PC1', 'PC2' and ``hue_column`` as columns.
        colors: Passed to ``lmplot`` as ``palette``.
        markers: Marker styles for ``lmplot``; omitted from the call when ``None``.
        sizes: Point size(s), passed as the scatter keyword ``s``.
        hue_column: Column used to color the points.
        size: Facet height passed to ``lmplot``.
        aspect: Facet aspect ratio passed to ``lmplot``.
        show_ticks: When false, axis ticks are removed.
        style: Passed to ``sns.set_style``.
        font_scale: Passed to ``sns.set``.
        legend_font_scale: Font size applied to the figure legend.

    Returns:
        The FacetGrid created by ``lmplot``.

    Note:
        ``sns.set`` and ``sns.set_style`` are called, which changes seaborn's
        global plotting state.
    """
    import inspect  # local import so this block does not depend on the file header

    sns.set(font_scale=font_scale)
    sns.set_style(style)
    # 'PC2' is plotted on x and 'PC1' on y. Keywords are used because newer seaborn
    # no longer accepts x/y positionally.
    lmplot_kwargs = dict(x='PC2', y='PC1', data=tsne_df, hue=hue_column, fit_reg=False,
                         aspect=aspect, palette=colors, scatter_kws={'s': sizes})
    # Honor the caller's `size` (previously hard-coded to 10). seaborn renamed the
    # lmplot keyword from `size` to `height`; use whichever this version accepts.
    height_name = 'height' if 'height' in inspect.signature(sns.lmplot).parameters else 'size'
    lmplot_kwargs[height_name] = size
    if markers is not None:  # setting markers=None results in no markers
        lmplot_kwargs['markers'] = markers
    plot = sns.lmplot(**lmplot_kwargs)
    if not show_ticks:
        plot.ax.set_xticks([])
        plot.ax.set_yticks([])
    # Only restyle the legend when the figure actually has one.
    if plot.fig.legends:
        legend = plot.fig.legends[0]
        legend.prop.set_size(legend_font_scale)
        legend.set_title(None)
    return plot
@dmyersturnbull
Copy link
Author

Example usage

# Toy input: seven samples in two classes with a single measured column, 'one'.
records = {
    'id': ['a', 'b', 'c', 'd', 'e', 'f', 'g'],
    'class': ['class1', 'class1', 'class1', 'class2', 'class2', 'class2', 'class2'],
    'one': [170.3, 62.10, -135.235, -144.235, 60.235, 68.236, 19.4],
}
# 'id' and 'class' go into the index so that only 'one' is fed to t-SNE.
data_df = pd.DataFrame(records).set_index(['id', 'class'])
# reset_index() turns 'id' and 'class' back into columns; plot_tsne colors by 'class' by default.
tsne_df = calc_tsne(data_df, random_seed=2).reset_index()
plot_tsne(tsne_df)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment