Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
def plot_differential_enrichment(
enrichment_table,
enrichment_type,
data_type="ATAC-seq",
direction_dependent=True,
output_dir="results/differential_analysis_{data_type}/enrichments",
comp_variable="comparison_name",
output_prefix="differential_analysis",
rasterized=True,
barplots=True,
correlation_plots=True,
clustermap_metric='correlation',
top_n=5,
z_score=0,
cmap=None):
"""
Given a table of enrichment terms across several comparisons, produced
plots illustrating these enrichments in the various comparisons.
# TODO: add plotting of genomic region and chromatin state enrichment.
`comp_variable` is the column in the enrichment table that labels groups.
# TODO: split function in its smaller parts and call them appropriately.
:param enrichment_table: Data frame with enrichment results as produced by
``ngs_toolkit.general.differential_enrichment`` or
``ngs_toolkit.general.collect_differential_enrichment``.
:type enrichment_table: pandas.DataFrame
:param enrichment_type: One of 'lola', 'enrichr', 'motif', 'plot_differential_enrichment', great'.
:type enrichment_type: str
:param data_type: Data type. One of "ATAC-seq" and "RNA-seq". Defaults to "ATAC-seq".
:type data_type: str, optional
:param direction_dependent: Whether enrichments were made in a direction-dependent way
(up-regulated and down-regulated features separately).
This implies a column named "direction" exists". Defaults to True.
:type direction_dependent: bool, optional
:param output_dir: Directory to create output files. Defaults to "results/differential_analysis_{data_type}/enrichments".
:type output_dir: str, optional
:param comp_variable: Column defining which comparison enrichment terms belong to. Defaults to "comparison_name".
:type comp_variable: str, optional
:param output_prefix: Prefix to use when creating output files. Defaults to "differential_analysis".
:type output_prefix: str, optional
:param rasterized: Whether or not to rasterize heatmaps for efficient plotting. Defaults to True.
:type rasterized: bool, optional
:param barplots: Whether barplots with top enriched terms per comparison should be produced. Defaults to True.
:type barplots: bool, optional
:param correlation_plots: Whether correlation plots of comparisons across enriched terms should be produced. Defaults to True.
:type correlation_plots: bool, optional
:param clustermap_metric: Distance metric to use for clustermap clustering, Default to "correlation" (Pearson's).
:type clustermap_metric: str, optional
:param top_n: Top terms to be used to make barplots. Defaults to 5
:type top_n: number, optional
:param z_score: Which dimention/axis to perform Z-score transformation for. Numpy/Pandas conventions are used:
`0` is row-wise (in this case across comparisons) and `1` is column-wise (across terms). Defaults to 0.
:type z_score: number, optional
:param cmap: Colormap to use in heatmaps. Default None.
:type cmap: str, optional
"""
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
if enrichment_type not in ["homer_consensus"']:
raise AssertionError("`enrichment_type` must be 'homer_consensus'.")
if "{data_type}" in output_dir:
output_dir = output_dir.format(data_type=data_type)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
if z_score == 0:
z_score_label = "Row"
elif z_score == 1:
z_score_label = "Column"
elif z_score is None:
pass
else:
raise ValueError("Argument 'z_score' must be on of 0, 1 or None.")
enrichment_table = enrichment_table.copy()
if "direction" in enrichment_table.columns and direction_dependent:
enrichment_table[comp_variable] = enrichment_table[comp_variable].astype(
str) + " " + enrichment_table["direction"].astype(str)
if enrichment_type == "homer_consensus":
enrichment_table.loc[:, "log_p_value"] = -(enrichment_table["Log P-value"].astype(np.float64))
# Plot top_n terms of each comparison in barplots
top_n = min(top_n, enrichment_table.set_index("Motif Name").groupby(comp_variable)["log_p_value"].count().min() - 1)
top_data = enrichment_table.set_index("Motif Name").groupby(comp_variable)["log_p_value"].nlargest(top_n).reset_index()
if barplots:
n = len(enrichment_table[comp_variable].drop_duplicates())
n_side = int(np.ceil(np.sqrt(n)))
fig, axis = plt.subplots(n_side, n_side, figsize=(4 * n_side, n_side * max(5, 0.12 * top_n)), sharex=False, sharey=False)
axis = iter(axis.flatten())
for i, comp in enumerate(top_data[comp_variable].drop_duplicates().sort_values()):
df2 = top_data.loc[top_data[comp_variable] == comp, :]
ax = next(axis)
sns.barplot(
df2["log_p_value"], df2["Motif Name"], estimator=max,
orient="horizontal", ax=ax, color=sns.color_palette("colorblind")[0])
ax.set_title(comp)
for ax in axis:
ax.set_visible(False)
sns.despine(fig)
fig.savefig(
os.path.join(output_dir, output_prefix + ".homer_consensus.barplot.top_{}.svg".format(top_n)),
bbox_inches="tight", dpi=300)
# Significance vs fold enrichment over background
enrichment_table.loc[:, 'enrichment_over_background'] = (
enrichment_table['% of Target Sequences with Motif'] /
enrichment_table['% of Background Sequences with Motif'])
n = len(enrichment_table[comp_variable].drop_duplicates())
n_side = int(np.ceil(np.sqrt(n)))
fig, axis = plt.subplots(n_side, n_side, figsize=(3 * n_side, 3 * n_side), sharex=False, sharey=False)
axis = axis.flatten()
for i, comp in enumerate(enrichment_table[comp_variable].drop_duplicates().sort_values()):
enr = enrichment_table[enrichment_table[comp_variable] == comp]
enr.loc[:, 'Motif Name'] = enr['Motif Name'].str.replace(".*BestGuess:", "").str.replace(r"-ChIP-Seq.*", "")
enr.loc[:, 'combined'] = enr[['enrichment_over_background', 'log_p_value']].apply(zscore).mean(axis=1)
cs = axis[i].scatter(
enr['enrichment_over_background'],
enr['log_p_value'],
c=enr['combined'],
s=8, alpha=0.75)
# label top points
for j in enr['combined'].sort_values().tail(5).index:
axis[i].text(
enr.loc[j, 'enrichment_over_background'],
enr.loc[j, 'log_p_value'],
s=enr.loc[j, "Motif Name"], ha="right", fontsize=5)
axis[i].set_title(comp)
for ax in axis.reshape((n_side, n_side))[:, 0]:
ax.set_ylabel("-log10(p-value)")
for ax in axis.reshape((n_side, n_side))[-1, :]:
ax.set_xlabel("Enrichment over background")
sns.despine(fig)
fig.savefig(
os.path.join(output_dir, output_prefix + ".homer_consensus.scatterplot.svg"),
bbox_inches="tight", dpi=300)
# Plot heatmaps of terms for each comparison
if len(enrichment_table[comp_variable].drop_duplicates()) < 2:
return
for label, metric in [
('-log10(p-value) of enrichment\nin', 'log_p_value'),
('Enrichment over background of\n', 'enrichment_over_background')]:
# pivot table
motifs_pivot = pd.pivot_table(
enrichment_table, values=metric, columns="Motif Name", index=comp_variable).fillna(0)
# plot correlation
if correlation_plots:
g = sns.clustermap(motifs_pivot.T.corr(), cbar_kws={"label": "Correlation of enrichemnt\nof differential regions"})
g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90)
g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0)
g.fig.savefig(os.path.join(output_dir, output_prefix + ".homer_consensus.correlation.svg"), bbox_inches="tight", dpi=300)
top = enrichment_table.set_index('Motif Name').groupby(comp_variable)[metric].nlargest(top_n)
top_terms = top.index.get_level_values('Motif Name').unique()
# plot clustered heatmap
shape = motifs_pivot.loc[:, top_terms].shape
g = sns.clustermap(
motifs_pivot.loc[:, top_terms].T, figsize=(max(6, 0.12 * shape[0]), max(6, 0.12 * shape[1])),
xticklabels=True, yticklabels=True,
cbar_kws={"label": label + " differential regions"}, metric="correlation")
g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90, ha="right", fontsize="xx-small")
g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0, fontsize="xx-small")
g.fig.savefig(os.path.join(output_dir, output_prefix + ".homer_consensus.cluster_specific.{}.svg".format(metric)), bbox_inches="tight", dpi=300)
# plot clustered heatmap
shape = motifs_pivot.loc[:, top_terms].shape
if z_score is not None:
g = sns.clustermap(
motifs_pivot.loc[:, top_terms].T, figsize=(max(6, 0.12 * shape[0]), max(6, 0.12 * shape[1])),
xticklabels=True, yticklabels=True,
cmap="RdBu_r", center=0, z_score=z_score,
cbar_kws={"label": "{} Z-score of {} differential regions".format(z_score_label, label)}, metric="correlation")
g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90, ha="right", fontsize="xx-small")
g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0, fontsize="xx-small")
g.fig.savefig(
os.path.join(output_dir, output_prefix + ".homer_consensus.cluster_specific.{}.{}_z_score.svg".format(metric, z_score_label)),
bbox_inches="tight", dpi=300)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment