Skip to content

Instantly share code, notes, and snippets.

@dblyon
Last active August 14, 2022 13:32
Show Gist options
  • Star 7 You must be signed in to star a gist
  • Fork 7 You must be signed in to fork a gist
  • Save dblyon/402a5fe1e1e211f34d5b230e0a4c93cc to your computer and use it in GitHub Desktop.
Save dblyon/402a5fe1e1e211f34d5b230e0a4c93cc to your computer and use it in GitHub Desktop.
Volcano plot: log2(ratio) vs. log10(p-value)
class Volcano(object):
"""
create a Volcano plot from log2(ratios) and corresponding -log10(p_values)
ToDo: take care of infinite ratios
e.g. usage
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
# when using jupyter notebook:
# %matplotlib inline
dict_ = {'p_val': {1: 0.208057931774703, 2: 0.063320586966883294, 3: 0.11424685505629198, 4: 0.46130291511303301, 5: 0.31662387800522196, 6: 0.35821379490648098, 7: 0.0559720700871537, 8: 0.048805611096553, 9: 0.27635717881946303, 336: 7.1364464137191392, 264: 0.50054528789819708, 530: 4.9027283450578603, 83: 4.4128053670050704, 565: 5.7615691096534203}, 'ratio': {1: -0.22443504333496, 2: 0.074678929646808001, 3: 0.27419026692708204, 4: 0.48245075770786605, 5: 0.274400329589845, 6: -0.53869597117106194, 7: 0.13976793819003699, 8: 0.20341746012369896, 9: 0.54393492804633292, 336: 4.2886840820312502, 264: -3.9391211668650299, 530: 4.5683480792575404, 83: 2.5429632663726802, 565: 4.04515149858263}, 's_val': {1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0, 336: 11.159643571086351, 264: 0.0, 530: 6.9983089690857891, 83: 1.0852958656755318, 565: 7.2227303405181722}, 'label': {1: 'CRYAB', 2: 'HDLBP', 3: 'TAF15', 4: 'GNAO1', 5: 'KHSRP', 6: 'HSPA4', 7: 'PABPC1;PABPC4', 8: 'MINOS1', 9: 'SPCS2', 336: 'APOL2', 264: 'WISP3', 530: 'WARS', 83: 'HLA-C', 565: 'GBP1'}}
df = pd.DataFrame(dict_)
volc = Volcano(df["ratio"], df["p_val"], df["label"], s_curve_x_axis_overplot=0.5, s_curve_y_axis_overplot=0.5)
fig = volc.get_fig()
for S-curve calculation see Jan Christian Refsgaard's PhD thesis page 90-91
from Perseus documentation:
S0: Artificial within groups variance (default: 0).
It controls the relative importance of t-test p-value and difference between means.
At s0=0 only the p-value matters, while at nonzero s0 also the difference of means
plays a role. See Tusher, Tibshirani and Chu (2001) PNAS 98, pp5116-21 for details.
"""
def __init__(self, ratio, p_val, label=None, s_curve_x_axis_overplot=0.5, s_curve_y_axis_overplot=0.5):
"""
careful use ratio not difference as in Perseus
ratio of 0.5 instead of difference -2
:param ratio: Pandas.Series or Numpy.Array or List of log2(ratios)
:param p_val: Pandas.Series or Numpy.Array or List of -log10(p-values)
:param label: Pandas.Series or Numpy.Array or ListOfString
"""
assert len(ratio) == len(p_val)
self.df = pd.DataFrame({"ratio": ratio, "p_val": p_val})
if label is not None:
self.df["label"] = label
self.s_curve_y_axis_overplot = s_curve_y_axis_overplot
self.p_val_cutoff = self.get_p_val_cutoff()
self.ratio_cutoff = self.get_ratio_cutoff()
self.df["s_val"] = self.df.apply(self.calc_s_from_row, axis=1)
self.ratio_for_s = pd.Series(np.linspace(self.df["ratio"].min() - s_curve_x_axis_overplot, self.df["ratio"].max() + s_curve_x_axis_overplot, num=1000))
self.p_for_s_larger_1 = self.ratio_for_s.apply(self.calc_p_for_s_equals_1)
def get_p_val_cutoff(self):
"""
p_val_cutoff = 0.05
pc = 3.5 + median(p_val(50% lowest log2_ratios)) --> is what Jan uses for whatever reason ???
-log10_pval of 2.0 --> pval of 0.01
"""
### hard coded cutoff of 1%
# return math.log(0.01, 10) * -1
### Jan's cutoff, but how to justify???
quant = self.df["ratio"].quantile(0.5)
return 2.0 + self.df.loc[self.df["ratio"] < quant, "p_val"].median()
def get_ratio_cutoff(self):
"""
log2_ratio_cutoff = 2.0
ratio_cutoff_high = 2 + median(ratio(50% lowest log10_p_values))
ratio_cutoff_low = 0.5 - median(ratio(50% lowest log10_p_values))
"""
### hard coded cutoff of 2 fold enrichment or depletion
# return math.log(0.5, 2), math.log(2, 2)
### Jan's cutoff, how to justify???
quant = self.df["p_val"].quantile(0.5)
median_ = self.df.loc[self.df["p_val"] < quant, "ratio"].median()
ratio_cutoff_high = 2.0 + median_
ratio_cutoff_low = -2.0 - median_
return ratio_cutoff_low, ratio_cutoff_high
def calc_s_from_row(self, row):
p_val = row["p_val"]
ratio = row["ratio"]
return self.calc_s(p_val, ratio)
def calc_s(self, p_val, ratio):
"""
so the algorithmn for finding stuff with s > 1 is:
discard stuff below the ratio_cutoff
discard stuff below the p-val cutoff
do the calcuation for the stuff above BOTH cutoffs and accept all with s > 1
s = (p_val - p_val_cutoff) * (ratio - ratio_cutoff)
:param p_val: Float(-log10 p-value)
:param ratio: Float(log2 ratio)
:return: Float
"""
ratio_cutoff_low, ratio_cutoff_high = self.ratio_cutoff
if ratio > 0:
ratio_delta = ratio - ratio_cutoff_high
if ratio_delta < 0:
return 0
elif ratio < 0:
ratio_delta = ratio - ratio_cutoff_low
if ratio_delta > 0:
return 0
ratio_delta = abs(ratio_delta)
p_val_delta = p_val - self.p_val_cutoff
if p_val_delta < 0:
return 0
return p_val_delta * ratio_delta
def calc_p_for_s_equals_1(self, ratio):
"""
:param ratio: Float(log2 ratio)
:return: Float
"""
ratio_cutoff_low, ratio_cutoff_high = self.ratio_cutoff
ratio_delta_high = ratio - ratio_cutoff_high
ratio_delta_low = ratio - ratio_cutoff_low
if ratio > ratio_cutoff_high:
return (1.0 / ratio_delta_high) + self.p_val_cutoff
elif ratio < ratio_cutoff_low:
return (1.0 / (ratio_delta_low * -1)) + self.p_val_cutoff
else:
return np.nan
def get_fig(self, title="Volcano plot", s_value_cutoff=1.0):
fig, ax1 = plt.subplots(figsize=(12, 12))
ax1.set_title(title, fontsize=22, fontweight="bold")
cond_p_val = self.p_for_s_larger_1 <= (self.df["p_val"].max() + self.s_curve_y_axis_overplot)
cond_pos = self.ratio_for_s > 0
x1 = self.ratio_for_s[cond_pos & cond_p_val]
y1 = self.p_for_s_larger_1[cond_pos & cond_p_val]
cond_neg = self.ratio_for_s < 0
x2 = self.ratio_for_s[cond_neg & cond_p_val]
y2 = self.p_for_s_larger_1[cond_neg & cond_p_val]
x3 = self.df["ratio"]
y3 = self.df["p_val"]
ax1.plot(x1, y1, 'r-', x2, y2, 'r-', x3, y3, 'b.', markersize=10) # , alpha=0.7)
ax1.set_xlabel('log2(ratio)', fontsize=20)
ax1.set_ylabel('-log10(p-value)', fontsize=20)
if "label" in self.df.columns:
cond = self.df["s_val"] >= s_value_cutoff
for index_, row in self.df[cond].iterrows():
label = row["label"]
x_coord = row["ratio"]
y_coord = row["p_val"]
ax1.annotate(label, xy=(x_coord, y_coord), xycoords='data', xytext=(5, 5),
textcoords='offset points', arrowprops=dict(arrowstyle="-"), fontsize=12)
return fig
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment