Skip to content

Instantly share code, notes, and snippets.

@endrebak
Created January 15, 2020 10:02
Show Gist options
  • Save endrebak/4bc5f26e47728b4c3d7208010139cf52 to your computer and use it in GitHub Desktop.
Save endrebak/4bc5f26e47728b4c3d7208010139cf52 to your computer and use it in GitHub Desktop.
rule compute_kde:
input:
"{prefix}/data/{genome}/HMM_states/{statistic}/correlations_all.gz"
output:
"{prefix}/data/{genome}/HMM_states/{statistic}/cutoff.txt"
run:
f = input[0]
o = output[0]
from scipy.stats import gaussian_kde
import numpy as np
import pandas as pd
df = pd.read_table(f, sep="\t")
values = df.CorrelationSum.sort_values()
print("gaussian")
gk = gaussian_kde(values)
vals = np.linspace(values.min(), values.max(), 1000)
print("integrate_box_1d")
integrals = []
result = 0
previous = 0
for i, v in enumerate(vals):
result += gk.integrate_box_1d(previous, v)
integrals.append(result)
if i % 50 == 0:
print("----" * 5)
print("i", i)
# print("previous", previous)
print("result", result)
previous = v
res = np.array(integrals)
result = []
for cutoff in cutoffs:
print("Computing cutoff", cutoff)
cutoff_idx = len(res[res < cutoff])
cutoff_value = vals[cutoff_idx]
number = (values < cutoff_value).sum()
percentage = 100 * (number / len(values))
result.append({"Cutoff": cutoff, "CutoffValue": cutoff_value, "Number": number, "Percentage": percentage})
result = pd.DataFrame.from_dict(result)
print(result)
result.to_csv(o, sep="\t", index=False, float_format="%.3f")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment