Created
February 5, 2024 02:43
-
-
Save MarcinKonowalczyk/cd22fe65754b4940d9eed0af2b0ff34f to your computer and use it in GitHub Desktop.
Empirically determined how many subsequence samples it takes to sample entire sequence
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Test how many subsequences we want to sample from a sequence to, approximately, cover the entire sequence. | |
import numpy as np | |
# ====== | |
N = 1982 # sequence length | |
L = 79 # sequence length | |
# ====== | |
R = 100 # number of repetitions to average out randomness | |
M = 10 * N // L # number of subsequences to sample | |
n_zeros = np.zeros((M, R)) | |
for r in range(R): | |
seq = np.zeros(N) | |
for i in range(M): | |
si = np.random.randint(0, N - L + 1) | |
seq[si : si + L] += 1 | |
n_zeros[i, r] = np.sum(seq == 0) | |
x = np.arange(1, M + 1) | |
y = n_zeros.mean(axis=1) / N | |
dy = n_zeros.std(axis=1) / N | |
x = np.array([0] + list(x)) | |
y = np.array([1] + list(y)) | |
dy = np.array([0] + list(dy)) | |
# fit an exponential function to the data | |
from scipy.optimize import curve_fit | |
import inspect | |
import colorama | |
def f(x, a, b1, b2): | |
"""Exponential function going through (0, 1)""" | |
return a * np.exp(-b1 * x) + (1 - a) * np.exp(-b2 * x) | |
pnames = inspect.getfullargspec(f).args[1:] | |
popt, pcov = curve_fit(f, x, y) | |
popt = {name: value for name, value in zip(pnames, popt)} | |
yf = f(x, **popt) | |
print("Covariance matrix:") | |
print(pcov) | |
print("Fit parameters:") | |
print(popt) | |
b = max(popt["b1"], popt["b2"]) | |
print(f"1/b = {1/b:.2f} ({np.ceil(1/b)} subsequences)") | |
print(f"covered after 3 * 1/b = {3/b:.2f} subsequences: {1 - f(3 / b, **popt):.2f}") | |
print(f"covered after 5 * 1/b = {5/b:.2f} subsequences: {1 - f(5 / b, **popt):.2f}") | |
nl = N / L | |
print(f"N/L = {nl:.2f}") | |
print(f"covered after 3 * N/L = {3 * nl:.2f} subsequences: {1 - f(3 * nl, **popt):.2f}") | |
delta = abs(nl - 1 / b) / nl * 100 | |
print(f"delta = {delta:.2f}%") | |
if delta < 5: | |
# print("The two estimates are close to each other.") | |
print( | |
colorama.Fore.GREEN | |
+ "The two estimates are close to each other." | |
+ colorama.Style.RESET_ALL | |
) | |
else: | |
# print("The two estimates are not close to each other.") | |
print( | |
colorama.Fore.RED | |
+ "The two estimates are not close to each other." | |
+ colorama.Style.RESET_ALL | |
) | |
# plot the data | |
import matplotlib.pyplot as plt | |
plt.plot(x, y, "b-", label="data") | |
plt.plot(x, yf, "k--", label="fit") | |
# Add a shaded area to show the standard deviation | |
plt.fill_between(x, y - dy, y + dy, alpha=0.3) | |
plt.xlabel("Number of subsequences") | |
plt.ylabel("Fraction of sequence not covered") | |
plt.grid() | |
plt.show() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment