Created
December 13, 2018 09:45
-
-
Save afrendeiro/6ddee7f48ca0c6c8288c81c65e17f7cf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def plot_differential_enrichment( | |
enrichment_table, | |
enrichment_type, | |
data_type="ATAC-seq", | |
direction_dependent=True, | |
output_dir="results/differential_analysis_{data_type}/enrichments", | |
comp_variable="comparison_name", | |
output_prefix="differential_analysis", | |
rasterized=True, | |
barplots=True, | |
correlation_plots=True, | |
clustermap_metric='correlation', | |
top_n=5, | |
z_score=0, | |
cmap=None): | |
""" | |
Given a table of enrichment terms across several comparisons, produced | |
plots illustrating these enrichments in the various comparisons. | |
# TODO: add plotting of genomic region and chromatin state enrichment. | |
`comp_variable` is the column in the enrichment table that labels groups. | |
# TODO: split function in its smaller parts and call them appropriately. | |
:param enrichment_table: Data frame with enrichment results as produced by | |
``ngs_toolkit.general.differential_enrichment`` or | |
``ngs_toolkit.general.collect_differential_enrichment``. | |
:type enrichment_table: pandas.DataFrame | |
:param enrichment_type: One of 'lola', 'enrichr', 'motif', 'plot_differential_enrichment', great'. | |
:type enrichment_type: str | |
:param data_type: Data type. One of "ATAC-seq" and "RNA-seq". Defaults to "ATAC-seq". | |
:type data_type: str, optional | |
:param direction_dependent: Whether enrichments were made in a direction-dependent way | |
(up-regulated and down-regulated features separately). | |
This implies a column named "direction" exists". Defaults to True. | |
:type direction_dependent: bool, optional | |
:param output_dir: Directory to create output files. Defaults to "results/differential_analysis_{data_type}/enrichments". | |
:type output_dir: str, optional | |
:param comp_variable: Column defining which comparison enrichment terms belong to. Defaults to "comparison_name". | |
:type comp_variable: str, optional | |
:param output_prefix: Prefix to use when creating output files. Defaults to "differential_analysis". | |
:type output_prefix: str, optional | |
:param rasterized: Whether or not to rasterize heatmaps for efficient plotting. Defaults to True. | |
:type rasterized: bool, optional | |
:param barplots: Whether barplots with top enriched terms per comparison should be produced. Defaults to True. | |
:type barplots: bool, optional | |
:param correlation_plots: Whether correlation plots of comparisons across enriched terms should be produced. Defaults to True. | |
:type correlation_plots: bool, optional | |
:param clustermap_metric: Distance metric to use for clustermap clustering, Default to "correlation" (Pearson's). | |
:type clustermap_metric: str, optional | |
:param top_n: Top terms to be used to make barplots. Defaults to 5 | |
:type top_n: number, optional | |
:param z_score: Which dimention/axis to perform Z-score transformation for. Numpy/Pandas conventions are used: | |
`0` is row-wise (in this case across comparisons) and `1` is column-wise (across terms). Defaults to 0. | |
:type z_score: number, optional | |
:param cmap: Colormap to use in heatmaps. Default None. | |
:type cmap: str, optional | |
""" | |
import numpy as np | |
import matplotlib | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
from scipy.stats import zscore | |
if enrichment_type not in ["homer_consensus"']: | |
raise AssertionError("`enrichment_type` must be 'homer_consensus'.") | |
if "{data_type}" in output_dir: | |
output_dir = output_dir.format(data_type=data_type) | |
if not os.path.exists(output_dir): | |
os.makedirs(output_dir) | |
if z_score == 0: | |
z_score_label = "Row" | |
elif z_score == 1: | |
z_score_label = "Column" | |
elif z_score is None: | |
pass | |
else: | |
raise ValueError("Argument 'z_score' must be on of 0, 1 or None.") | |
enrichment_table = enrichment_table.copy() | |
if "direction" in enrichment_table.columns and direction_dependent: | |
enrichment_table[comp_variable] = enrichment_table[comp_variable].astype( | |
str) + " " + enrichment_table["direction"].astype(str) | |
if enrichment_type == "homer_consensus": | |
enrichment_table.loc[:, "log_p_value"] = -(enrichment_table["Log P-value"].astype(np.float64)) | |
# Plot top_n terms of each comparison in barplots | |
top_n = min(top_n, enrichment_table.set_index("Motif Name").groupby(comp_variable)["log_p_value"].count().min() - 1) | |
top_data = enrichment_table.set_index("Motif Name").groupby(comp_variable)["log_p_value"].nlargest(top_n).reset_index() | |
if barplots: | |
n = len(enrichment_table[comp_variable].drop_duplicates()) | |
n_side = int(np.ceil(np.sqrt(n))) | |
fig, axis = plt.subplots(n_side, n_side, figsize=(4 * n_side, n_side * max(5, 0.12 * top_n)), sharex=False, sharey=False) | |
axis = iter(axis.flatten()) | |
for i, comp in enumerate(top_data[comp_variable].drop_duplicates().sort_values()): | |
df2 = top_data.loc[top_data[comp_variable] == comp, :] | |
ax = next(axis) | |
sns.barplot( | |
df2["log_p_value"], df2["Motif Name"], estimator=max, | |
orient="horizontal", ax=ax, color=sns.color_palette("colorblind")[0]) | |
ax.set_title(comp) | |
for ax in axis: | |
ax.set_visible(False) | |
sns.despine(fig) | |
fig.savefig( | |
os.path.join(output_dir, output_prefix + ".homer_consensus.barplot.top_{}.svg".format(top_n)), | |
bbox_inches="tight", dpi=300) | |
# Significance vs fold enrichment over background | |
enrichment_table.loc[:, 'enrichment_over_background'] = ( | |
enrichment_table['% of Target Sequences with Motif'] / | |
enrichment_table['% of Background Sequences with Motif']) | |
n = len(enrichment_table[comp_variable].drop_duplicates()) | |
n_side = int(np.ceil(np.sqrt(n))) | |
fig, axis = plt.subplots(n_side, n_side, figsize=(3 * n_side, 3 * n_side), sharex=False, sharey=False) | |
axis = axis.flatten() | |
for i, comp in enumerate(enrichment_table[comp_variable].drop_duplicates().sort_values()): | |
enr = enrichment_table[enrichment_table[comp_variable] == comp] | |
enr.loc[:, 'Motif Name'] = enr['Motif Name'].str.replace(".*BestGuess:", "").str.replace(r"-ChIP-Seq.*", "") | |
enr.loc[:, 'combined'] = enr[['enrichment_over_background', 'log_p_value']].apply(zscore).mean(axis=1) | |
cs = axis[i].scatter( | |
enr['enrichment_over_background'], | |
enr['log_p_value'], | |
c=enr['combined'], | |
s=8, alpha=0.75) | |
# label top points | |
for j in enr['combined'].sort_values().tail(5).index: | |
axis[i].text( | |
enr.loc[j, 'enrichment_over_background'], | |
enr.loc[j, 'log_p_value'], | |
s=enr.loc[j, "Motif Name"], ha="right", fontsize=5) | |
axis[i].set_title(comp) | |
for ax in axis.reshape((n_side, n_side))[:, 0]: | |
ax.set_ylabel("-log10(p-value)") | |
for ax in axis.reshape((n_side, n_side))[-1, :]: | |
ax.set_xlabel("Enrichment over background") | |
sns.despine(fig) | |
fig.savefig( | |
os.path.join(output_dir, output_prefix + ".homer_consensus.scatterplot.svg"), | |
bbox_inches="tight", dpi=300) | |
# Plot heatmaps of terms for each comparison | |
if len(enrichment_table[comp_variable].drop_duplicates()) < 2: | |
return | |
for label, metric in [ | |
('-log10(p-value) of enrichment\nin', 'log_p_value'), | |
('Enrichment over background of\n', 'enrichment_over_background')]: | |
# pivot table | |
motifs_pivot = pd.pivot_table( | |
enrichment_table, values=metric, columns="Motif Name", index=comp_variable).fillna(0) | |
# plot correlation | |
if correlation_plots: | |
g = sns.clustermap(motifs_pivot.T.corr(), cbar_kws={"label": "Correlation of enrichemnt\nof differential regions"}) | |
g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90) | |
g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0) | |
g.fig.savefig(os.path.join(output_dir, output_prefix + ".homer_consensus.correlation.svg"), bbox_inches="tight", dpi=300) | |
top = enrichment_table.set_index('Motif Name').groupby(comp_variable)[metric].nlargest(top_n) | |
top_terms = top.index.get_level_values('Motif Name').unique() | |
# plot clustered heatmap | |
shape = motifs_pivot.loc[:, top_terms].shape | |
g = sns.clustermap( | |
motifs_pivot.loc[:, top_terms].T, figsize=(max(6, 0.12 * shape[0]), max(6, 0.12 * shape[1])), | |
xticklabels=True, yticklabels=True, | |
cbar_kws={"label": label + " differential regions"}, metric="correlation") | |
g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90, ha="right", fontsize="xx-small") | |
g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0, fontsize="xx-small") | |
g.fig.savefig(os.path.join(output_dir, output_prefix + ".homer_consensus.cluster_specific.{}.svg".format(metric)), bbox_inches="tight", dpi=300) | |
# plot clustered heatmap | |
shape = motifs_pivot.loc[:, top_terms].shape | |
if z_score is not None: | |
g = sns.clustermap( | |
motifs_pivot.loc[:, top_terms].T, figsize=(max(6, 0.12 * shape[0]), max(6, 0.12 * shape[1])), | |
xticklabels=True, yticklabels=True, | |
cmap="RdBu_r", center=0, z_score=z_score, | |
cbar_kws={"label": "{} Z-score of {} differential regions".format(z_score_label, label)}, metric="correlation") | |
g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90, ha="right", fontsize="xx-small") | |
g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0, fontsize="xx-small") | |
g.fig.savefig( | |
os.path.join(output_dir, output_prefix + ".homer_consensus.cluster_specific.{}.{}_z_score.svg".format(metric, z_score_label)), | |
bbox_inches="tight", dpi=300) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment