# In []:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats  # explicit submodule import so that `sp.stats` is always available
import seaborn as sns
# Three synthetic samples of 100 draws each from normal distributions with
# different means and standard deviations (np.random.normal(loc, scale, size)),
# so the columns start out with visibly different distributions.
# The generator is not seeded, so the values differ on every run.
A = np.random.normal(5, 1, 100)
B = np.random.normal(3, 1.5, 100)
C = np.random.normal(4, 2, 100)
# One column per sample. Building from a dict keeps each column name next to
# its data and avoids the stack-then-transpose round trip.
df = pd.DataFrame({"A": A, "B": B, "C": C})
df.head(10)  # notebook preview; has no visible effect when run as a script
# Kernel density estimate of each raw column, drawn on one set of axes, as the
# "before" picture for the normalization below.
df.plot(kind="kde", figsize=(8, 5), title="Before applying quantile normalization")
# Saving the "before" figure is disabled; uncomment to write it to disk.
# plt.savefig("before.png", transparent=False)
# In []:
# Quantile normalization
## 1. order the values of each column
# Work on the raw ndarray so that every lookup below is positional and does
# not depend on the DataFrame's index labels.
raw_values = df.to_numpy()
# sort_idx[i, col]: row position in df of the i-th smallest value of that column.
sort_idx = pd.DataFrame(raw_values.argsort(axis=0), columns=df.columns)
# Argsort of the sort order gives, for each original row, the rank of its
# value within the column (0 = smallest).
orig_idx = pd.DataFrame(sort_idx.to_numpy().argsort(axis=0), columns=df.columns)
# Each column sorted ascending independently of the others, so row i holds
# the i-th smallest value of every column.
df_sorted = pd.DataFrame(np.sort(raw_values, axis=0), columns=df.columns)
sorted_A = df_sorted["A"]
sorted_B = df_sorted["B"]
sorted_C = df_sorted["C"]
df_sorted.head(10)  # notebook preview
## 2. (w/o reference) calculate the median of each row
# Row i of df_sorted holds the i-th smallest value of every column, so the
# row-wise median is the target value shared by all columns at rank i.
medians = df_sorted.median(axis=1)
medians.head(10)  # notebook preview
## 2. (w/ reference) calculate quantiles of the reference distribution
## in this case, the 101-quantiles of the standard normal distribution
# 101 evenly spaced probabilities: 0, 1/101, ..., 100/101.
points = np.linspace(0, 1.0, 101, endpoint=False)
# Drop p = 0 before evaluating the inverse CDF: norm.ppf(0) is -inf and is
# never a usable target value. The remaining 100 cut points k/101
# (k = 1..100) are finite and strictly increasing, one per rank 0..99.
reference_quantile = sp.stats.norm.ppf(points[1:])
## 3 (w/o ref). replace original values with the calculated medians
# orig_idx holds the rank of every original value within its column and
# medians holds the target value for each rank. Both are converted to ndarrays
# so the lookup is purely positional and cannot be affected by index labels.
median_by_rank = medians.to_numpy()
normalized_A = pd.Series(median_by_rank[orig_idx["A"].to_numpy()])
normalized_B = pd.Series(median_by_rank[orig_idx["B"].to_numpy()])
normalized_C = pd.Series(median_by_rank[orig_idx["C"].to_numpy()])
# Every column is now a reordering of the same `medians` values.
df_normalized = pd.DataFrame({"A": normalized_A, "B": normalized_B, "C": normalized_C})
df_normalized.head(10)  # notebook preview
df_normalized.plot(kind="kde", figsize=(8, 5), title="After applying quantile normalization")
# savefig writes the current figure, i.e. the one created by the plot call above.
plt.savefig("after.png", transparent=False)
## 3 (w/ ref). replace original values with the reference quantiles
# Same rank lookup as the reference-free variant, but the target values come
# from reference_quantile instead of the per-rank medians. The ranks are
# passed as a plain integer ndarray for explicit positional indexing.
# NOTE: this rebinds normalized_A/B/C and df_normalized.
normalized_A = reference_quantile[orig_idx["A"].to_numpy()]
normalized_B = reference_quantile[orig_idx["B"].to_numpy()]
normalized_C = reference_quantile[orig_idx["C"].to_numpy()]
df_normalized = pd.DataFrame({"A": normalized_A, "B": normalized_B, "C": normalized_C})
df_normalized.head(10)  # notebook preview
df_normalized.plot(kind="kde", figsize=(8, 5), title="After applying quantile normalization")
# savefig writes the current figure, i.e. the one created by the plot call above.
plt.savefig("after_ref.png", transparent=False)
# In []:
# (GitHub page footer captured together with the gist; not part of the script.)
# Sign up for free
# to join this conversation on GitHub.
# Already have an account?
# Sign in to comment