afrendeiro/plot_homer_consensus.py

## plot_homer_consensus.py

def plot_differential_enrichment(
        enrichment_table,
        enrichment_type,
        data_type="ATAC-seq",
        direction_dependent=True,
        output_dir="results/differential_analysis_{data_type}/enrichments",
        comp_variable="comparison_name",
        output_prefix="differential_analysis",
        rasterized=True,
        barplots=True,
        correlation_plots=True,
        clustermap_metric='correlation',
        top_n=5,
        z_score=0,
        cmap=None):
    """
    Given a table of enrichment terms across several comparisons, produced
    plots illustrating these enrichments in the various comparisons.

    # TODO: add plotting of genomic region and chromatin state enrichment.
    `comp_variable` is the column in the enrichment table that labels groups.

    # TODO: split function in its smaller parts and call them appropriately.

    :param enrichment_table: Data frame with enrichment results as produced by
                             ``ngs_toolkit.general.differential_enrichment`` or
                             ``ngs_toolkit.general.collect_differential_enrichment``.
    :type enrichment_table: pandas.DataFrame
    :param enrichment_type: One of 'lola', 'enrichr', 'motif', 'plot_differential_enrichment', great'.
    :type enrichment_type: str
    :param data_type: Data type. One of "ATAC-seq" and "RNA-seq". Defaults to "ATAC-seq".
    :type data_type: str, optional
    :param direction_dependent: Whether enrichments were made in a direction-dependent way
                                (up-regulated and down-regulated features separately).
                                This implies a column named "direction" exists". Defaults to True.
    :type direction_dependent: bool, optional
    :param output_dir: Directory to create output files. Defaults to "results/differential_analysis_{data_type}/enrichments".
    :type output_dir: str, optional
    :param comp_variable: Column defining which comparison enrichment terms belong to. Defaults to "comparison_name".
    :type comp_variable: str, optional
    :param output_prefix: Prefix to use when creating output files. Defaults to "differential_analysis".
    :type output_prefix: str, optional
    :param rasterized: Whether or not to rasterize heatmaps for efficient plotting. Defaults to True.
    :type rasterized: bool, optional
    :param barplots: Whether barplots with top enriched terms per comparison should be produced. Defaults to True.
    :type barplots: bool, optional
    :param correlation_plots: Whether correlation plots of comparisons across enriched terms should be produced. Defaults to True.
    :type correlation_plots: bool, optional
    :param clustermap_metric: Distance metric to use for clustermap clustering, Default to "correlation" (Pearson's).
    :type clustermap_metric: str, optional
    :param top_n: Top terms to be used to make barplots. Defaults to 5
    :type top_n: number, optional
    :param z_score: Which dimention/axis to perform Z-score transformation for. Numpy/Pandas conventions are used:
                    `0` is row-wise (in this case across comparisons) and `1` is column-wise (across terms). Defaults to 0.
    :type z_score: number, optional
    :param cmap: Colormap to use in heatmaps. Default None.
    :type cmap: str, optional
    """
    import numpy as np
    import matplotlib
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy.stats import zscore

    if enrichment_type not in ["homer_consensus"']:
        raise AssertionError("`enrichment_type` must be 'homer_consensus'.")

    if "{data_type}" in output_dir:
        output_dir = output_dir.format(data_type=data_type)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if z_score == 0:
        z_score_label = "Row"
    elif z_score == 1:
        z_score_label = "Column"
    elif z_score is None:
        pass
    else:
        raise ValueError("Argument 'z_score' must be on of 0, 1 or None.")

    enrichment_table = enrichment_table.copy()
    if "direction" in enrichment_table.columns and direction_dependent:
        enrichment_table[comp_variable] = enrichment_table[comp_variable].astype(
            str) + " " + enrichment_table["direction"].astype(str)

    if enrichment_type == "homer_consensus":
        enrichment_table.loc[:, "log_p_value"] = -(enrichment_table["Log P-value"].astype(np.float64))
        # Plot top_n terms of each comparison in barplots
        top_n = min(top_n, enrichment_table.set_index("Motif Name").groupby(comp_variable)["log_p_value"].count().min() - 1)
        top_data = enrichment_table.set_index("Motif Name").groupby(comp_variable)["log_p_value"].nlargest(top_n).reset_index()

        if barplots:
            n = len(enrichment_table[comp_variable].drop_duplicates())
            n_side = int(np.ceil(np.sqrt(n)))

            fig, axis = plt.subplots(n_side, n_side, figsize=(4 * n_side, n_side * max(5, 0.12 * top_n)), sharex=False, sharey=False)
            axis = iter(axis.flatten())
            for i, comp in enumerate(top_data[comp_variable].drop_duplicates().sort_values()):
                df2 = top_data.loc[top_data[comp_variable] == comp, :]
                ax = next(axis)
                sns.barplot(
                    df2["log_p_value"], df2["Motif Name"], estimator=max,
                    orient="horizontal", ax=ax, color=sns.color_palette("colorblind")[0])
                ax.set_title(comp)
            for ax in axis:
                ax.set_visible(False)
            sns.despine(fig)
            fig.savefig(
                os.path.join(output_dir, output_prefix + ".homer_consensus.barplot.top_{}.svg".format(top_n)),
                bbox_inches="tight", dpi=300)

        # Significance vs fold enrichment over background
        enrichment_table.loc[:, 'enrichment_over_background'] = (
            enrichment_table['% of Target Sequences with Motif'] /
            enrichment_table['% of Background Sequences with Motif'])

        n = len(enrichment_table[comp_variable].drop_duplicates())
        n_side = int(np.ceil(np.sqrt(n)))
        fig, axis = plt.subplots(n_side, n_side, figsize=(3 * n_side, 3 * n_side), sharex=False, sharey=False)
        axis = axis.flatten()
        for i, comp in enumerate(enrichment_table[comp_variable].drop_duplicates().sort_values()):
            enr = enrichment_table[enrichment_table[comp_variable] == comp]
            enr.loc[:, 'Motif Name'] = enr['Motif Name'].str.replace(".*BestGuess:", "").str.replace(r"-ChIP-Seq.*", "")

            enr.loc[:, 'combined'] = enr[['enrichment_over_background', 'log_p_value']].apply(zscore).mean(axis=1)
            cs = axis[i].scatter(
                enr['enrichment_over_background'],
                enr['log_p_value'],
                c=enr['combined'],
                s=8, alpha=0.75)

            # label top points
            for j in enr['combined'].sort_values().tail(5).index:
                axis[i].text(
                    enr.loc[j, 'enrichment_over_background'],
                    enr.loc[j, 'log_p_value'],
                    s=enr.loc[j, "Motif Name"], ha="right", fontsize=5)
            axis[i].set_title(comp)

        for ax in axis.reshape((n_side, n_side))[:, 0]:
            ax.set_ylabel("-log10(p-value)")
        for ax in axis.reshape((n_side, n_side))[-1, :]:
            ax.set_xlabel("Enrichment over background")
        sns.despine(fig)
        fig.savefig(
            os.path.join(output_dir, output_prefix + ".homer_consensus.scatterplot.svg"),
            bbox_inches="tight", dpi=300)

        # Plot heatmaps of terms for each comparison
        if len(enrichment_table[comp_variable].drop_duplicates()) < 2:
            return

        for label, metric in [
                ('-log10(p-value) of enrichment\nin', 'log_p_value'),
                ('Enrichment over background of\n', 'enrichment_over_background')]:
            # pivot table
            motifs_pivot = pd.pivot_table(
                enrichment_table, values=metric, columns="Motif Name", index=comp_variable).fillna(0)

            # plot correlation
            if correlation_plots:
                g = sns.clustermap(motifs_pivot.T.corr(), cbar_kws={"label": "Correlation of enrichemnt\nof differential regions"})
                g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90)
                g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0)
                g.fig.savefig(os.path.join(output_dir, output_prefix + ".homer_consensus.correlation.svg"), bbox_inches="tight", dpi=300)

            top = enrichment_table.set_index('Motif Name').groupby(comp_variable)[metric].nlargest(top_n)
            top_terms = top.index.get_level_values('Motif Name').unique()

            # plot clustered heatmap
            shape = motifs_pivot.loc[:, top_terms].shape
            g = sns.clustermap(
                motifs_pivot.loc[:, top_terms].T, figsize=(max(6, 0.12 * shape[0]), max(6, 0.12 * shape[1])),
                xticklabels=True, yticklabels=True,
                cbar_kws={"label": label + " differential regions"}, metric="correlation")
            g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90, ha="right", fontsize="xx-small")
            g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0, fontsize="xx-small")
            g.fig.savefig(os.path.join(output_dir, output_prefix + ".homer_consensus.cluster_specific.{}.svg".format(metric)), bbox_inches="tight", dpi=300)

            # plot clustered heatmap
            shape = motifs_pivot.loc[:, top_terms].shape
            if z_score is not None:
                g = sns.clustermap(
                    motifs_pivot.loc[:, top_terms].T, figsize=(max(6, 0.12 * shape[0]), max(6, 0.12 * shape[1])),
                    xticklabels=True, yticklabels=True,
                    cmap="RdBu_r", center=0, z_score=z_score,
                    cbar_kws={"label": "{} Z-score of {} differential regions".format(z_score_label, label)}, metric="correlation")
                g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90, ha="right", fontsize="xx-small")
                g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0, fontsize="xx-small")
                g.fig.savefig(
                    os.path.join(output_dir, output_prefix + ".homer_consensus.cluster_specific.{}.{}_z_score.svg".format(metric, z_score_label)),
                    bbox_inches="tight", dpi=300)

	def plot_differential_enrichment(
	enrichment_table,
	enrichment_type,
	data_type="ATAC-seq",
	direction_dependent=True,
	output_dir="results/differential_analysis_{data_type}/enrichments",
	comp_variable="comparison_name",
	output_prefix="differential_analysis",
	rasterized=True,
	barplots=True,
	correlation_plots=True,
	clustermap_metric='correlation',
	top_n=5,
	z_score=0,
	cmap=None):
	"""
	Given a table of enrichment terms across several comparisons, produced
	plots illustrating these enrichments in the various comparisons.

	# TODO: add plotting of genomic region and chromatin state enrichment.
	`comp_variable` is the column in the enrichment table that labels groups.

	# TODO: split function in its smaller parts and call them appropriately.

	:param enrichment_table: Data frame with enrichment results as produced by
	``ngs_toolkit.general.differential_enrichment`` or
	``ngs_toolkit.general.collect_differential_enrichment``.
	:type enrichment_table: pandas.DataFrame
	:param enrichment_type: One of 'lola', 'enrichr', 'motif', 'plot_differential_enrichment', great'.
	:type enrichment_type: str
	:param data_type: Data type. One of "ATAC-seq" and "RNA-seq". Defaults to "ATAC-seq".
	:type data_type: str, optional
	:param direction_dependent: Whether enrichments were made in a direction-dependent way
	(up-regulated and down-regulated features separately).
	This implies a column named "direction" exists". Defaults to True.
	:type direction_dependent: bool, optional
	:param output_dir: Directory to create output files. Defaults to "results/differential_analysis_{data_type}/enrichments".
	:type output_dir: str, optional
	:param comp_variable: Column defining which comparison enrichment terms belong to. Defaults to "comparison_name".
	:type comp_variable: str, optional
	:param output_prefix: Prefix to use when creating output files. Defaults to "differential_analysis".
	:type output_prefix: str, optional
	:param rasterized: Whether or not to rasterize heatmaps for efficient plotting. Defaults to True.
	:type rasterized: bool, optional
	:param barplots: Whether barplots with top enriched terms per comparison should be produced. Defaults to True.
	:type barplots: bool, optional
	:param correlation_plots: Whether correlation plots of comparisons across enriched terms should be produced. Defaults to True.
	:type correlation_plots: bool, optional
	:param clustermap_metric: Distance metric to use for clustermap clustering, Default to "correlation" (Pearson's).
	:type clustermap_metric: str, optional
	:param top_n: Top terms to be used to make barplots. Defaults to 5
	:type top_n: number, optional
	:param z_score: Which dimention/axis to perform Z-score transformation for. Numpy/Pandas conventions are used:
	`0` is row-wise (in this case across comparisons) and `1` is column-wise (across terms). Defaults to 0.
	:type z_score: number, optional
	:param cmap: Colormap to use in heatmaps. Default None.
	:type cmap: str, optional
	"""
	import numpy as np
	import matplotlib
	import matplotlib.pyplot as plt
	import seaborn as sns
	from scipy.stats import zscore

	if enrichment_type not in ["homer_consensus"']:
	raise AssertionError("`enrichment_type` must be 'homer_consensus'.")

	if "{data_type}" in output_dir:
	output_dir = output_dir.format(data_type=data_type)
	if not os.path.exists(output_dir):
	os.makedirs(output_dir)

	if z_score == 0:
	z_score_label = "Row"
	elif z_score == 1:
	z_score_label = "Column"
	elif z_score is None:
	pass
	else:
	raise ValueError("Argument 'z_score' must be on of 0, 1 or None.")

	enrichment_table = enrichment_table.copy()
	if "direction" in enrichment_table.columns and direction_dependent:
	enrichment_table[comp_variable] = enrichment_table[comp_variable].astype(
	str) + " " + enrichment_table["direction"].astype(str)

	if enrichment_type == "homer_consensus":
	enrichment_table.loc[:, "log_p_value"] = -(enrichment_table["Log P-value"].astype(np.float64))
	# Plot top_n terms of each comparison in barplots
	top_n = min(top_n, enrichment_table.set_index("Motif Name").groupby(comp_variable)["log_p_value"].count().min() - 1)
	top_data = enrichment_table.set_index("Motif Name").groupby(comp_variable)["log_p_value"].nlargest(top_n).reset_index()

	if barplots:
	n = len(enrichment_table[comp_variable].drop_duplicates())
	n_side = int(np.ceil(np.sqrt(n)))

	fig, axis = plt.subplots(n_side, n_side, figsize=(4 * n_side, n_side * max(5, 0.12 * top_n)), sharex=False, sharey=False)
	axis = iter(axis.flatten())
	for i, comp in enumerate(top_data[comp_variable].drop_duplicates().sort_values()):
	df2 = top_data.loc[top_data[comp_variable] == comp, :]
	ax = next(axis)
	sns.barplot(
	df2["log_p_value"], df2["Motif Name"], estimator=max,
	orient="horizontal", ax=ax, color=sns.color_palette("colorblind")[0])
	ax.set_title(comp)
	for ax in axis:
	ax.set_visible(False)
	sns.despine(fig)
	fig.savefig(
	os.path.join(output_dir, output_prefix + ".homer_consensus.barplot.top_{}.svg".format(top_n)),
	bbox_inches="tight", dpi=300)

	# Significance vs fold enrichment over background
	enrichment_table.loc[:, 'enrichment_over_background'] = (
	enrichment_table['% of Target Sequences with Motif'] /
	enrichment_table['% of Background Sequences with Motif'])

	n = len(enrichment_table[comp_variable].drop_duplicates())
	n_side = int(np.ceil(np.sqrt(n)))
	fig, axis = plt.subplots(n_side, n_side, figsize=(3 * n_side, 3 * n_side), sharex=False, sharey=False)
	axis = axis.flatten()
	for i, comp in enumerate(enrichment_table[comp_variable].drop_duplicates().sort_values()):
	enr = enrichment_table[enrichment_table[comp_variable] == comp]
	enr.loc[:, 'Motif Name'] = enr['Motif Name'].str.replace(".BestGuess:", "").str.replace(r"-ChIP-Seq.", "")

	enr.loc[:, 'combined'] = enr[['enrichment_over_background', 'log_p_value']].apply(zscore).mean(axis=1)
	cs = axis[i].scatter(
	enr['enrichment_over_background'],
	enr['log_p_value'],
	c=enr['combined'],
	s=8, alpha=0.75)

	# label top points
	for j in enr['combined'].sort_values().tail(5).index:
	axis[i].text(
	enr.loc[j, 'enrichment_over_background'],
	enr.loc[j, 'log_p_value'],
	s=enr.loc[j, "Motif Name"], ha="right", fontsize=5)
	axis[i].set_title(comp)

	for ax in axis.reshape((n_side, n_side))[:, 0]:
	ax.set_ylabel("-log10(p-value)")
	for ax in axis.reshape((n_side, n_side))[-1, :]:
	ax.set_xlabel("Enrichment over background")
	sns.despine(fig)
	fig.savefig(
	os.path.join(output_dir, output_prefix + ".homer_consensus.scatterplot.svg"),
	bbox_inches="tight", dpi=300)

	# Plot heatmaps of terms for each comparison
	if len(enrichment_table[comp_variable].drop_duplicates()) < 2:
	return

	for label, metric in [
	('-log10(p-value) of enrichment\nin', 'log_p_value'),
	('Enrichment over background of\n', 'enrichment_over_background')]:
	# pivot table
	motifs_pivot = pd.pivot_table(
	enrichment_table, values=metric, columns="Motif Name", index=comp_variable).fillna(0)

	# plot correlation
	if correlation_plots:
	g = sns.clustermap(motifs_pivot.T.corr(), cbar_kws={"label": "Correlation of enrichemnt\nof differential regions"})
	g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90)
	g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0)
	g.fig.savefig(os.path.join(output_dir, output_prefix + ".homer_consensus.correlation.svg"), bbox_inches="tight", dpi=300)

	top = enrichment_table.set_index('Motif Name').groupby(comp_variable)[metric].nlargest(top_n)
	top_terms = top.index.get_level_values('Motif Name').unique()

	# plot clustered heatmap
	shape = motifs_pivot.loc[:, top_terms].shape
	g = sns.clustermap(
	motifs_pivot.loc[:, top_terms].T, figsize=(max(6, 0.12 * shape[0]), max(6, 0.12 * shape[1])),
	xticklabels=True, yticklabels=True,
	cbar_kws={"label": label + " differential regions"}, metric="correlation")
	g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90, ha="right", fontsize="xx-small")
	g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0, fontsize="xx-small")
	g.fig.savefig(os.path.join(output_dir, output_prefix + ".homer_consensus.cluster_specific.{}.svg".format(metric)), bbox_inches="tight", dpi=300)

	# plot clustered heatmap
	shape = motifs_pivot.loc[:, top_terms].shape
	if z_score is not None:
	g = sns.clustermap(
	motifs_pivot.loc[:, top_terms].T, figsize=(max(6, 0.12 * shape[0]), max(6, 0.12 * shape[1])),
	xticklabels=True, yticklabels=True,
	cmap="RdBu_r", center=0, z_score=z_score,
	cbar_kws={"label": "{} Z-score of {} differential regions".format(z_score_label, label)}, metric="correlation")
	g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90, ha="right", fontsize="xx-small")
	g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0, fontsize="xx-small")
	g.fig.savefig(
	os.path.join(output_dir, output_prefix + ".homer_consensus.cluster_specific.{}.{}_z_score.svg".format(metric, z_score_label)),
	bbox_inches="tight", dpi=300)