Skip to content

Instantly share code, notes, and snippets.

@MarcinKonowalczyk
Created February 5, 2024 02:43
Show Gist options
  • Save MarcinKonowalczyk/cd22fe65754b4940d9eed0af2b0ff34f to your computer and use it in GitHub Desktop.
Save MarcinKonowalczyk/cd22fe65754b4940d9eed0af2b0ff34f to your computer and use it in GitHub Desktop.
Empirically determined how many subsequence samples it takes to sample entire sequence
# Test how many subsequences we want to sample from a sequence to, approximately, cover the entire sequence.
import numpy as np
# ======
N = 1982 # sequence length
L = 79 # sequence length
# ======
R = 100 # number of repetitions to average out randomness
M = 10 * N // L # number of subsequences to sample
n_zeros = np.zeros((M, R))
for r in range(R):
seq = np.zeros(N)
for i in range(M):
si = np.random.randint(0, N - L + 1)
seq[si : si + L] += 1
n_zeros[i, r] = np.sum(seq == 0)
x = np.arange(1, M + 1)
y = n_zeros.mean(axis=1) / N
dy = n_zeros.std(axis=1) / N
x = np.array([0] + list(x))
y = np.array([1] + list(y))
dy = np.array([0] + list(dy))
# fit an exponential function to the data
from scipy.optimize import curve_fit
import inspect
import colorama
def f(x, a, b1, b2):
"""Exponential function going through (0, 1)"""
return a * np.exp(-b1 * x) + (1 - a) * np.exp(-b2 * x)
pnames = inspect.getfullargspec(f).args[1:]
popt, pcov = curve_fit(f, x, y)
popt = {name: value for name, value in zip(pnames, popt)}
yf = f(x, **popt)
print("Covariance matrix:")
print(pcov)
print("Fit parameters:")
print(popt)
b = max(popt["b1"], popt["b2"])
print(f"1/b = {1/b:.2f} ({np.ceil(1/b)} subsequences)")
print(f"covered after 3 * 1/b = {3/b:.2f} subsequences: {1 - f(3 / b, **popt):.2f}")
print(f"covered after 5 * 1/b = {5/b:.2f} subsequences: {1 - f(5 / b, **popt):.2f}")
nl = N / L
print(f"N/L = {nl:.2f}")
print(f"covered after 3 * N/L = {3 * nl:.2f} subsequences: {1 - f(3 * nl, **popt):.2f}")
delta = abs(nl - 1 / b) / nl * 100
print(f"delta = {delta:.2f}%")
if delta < 5:
# print("The two estimates are close to each other.")
print(
colorama.Fore.GREEN
+ "The two estimates are close to each other."
+ colorama.Style.RESET_ALL
)
else:
# print("The two estimates are not close to each other.")
print(
colorama.Fore.RED
+ "The two estimates are not close to each other."
+ colorama.Style.RESET_ALL
)
# plot the data
import matplotlib.pyplot as plt
plt.plot(x, y, "b-", label="data")
plt.plot(x, yf, "k--", label="fit")
# Add a shaded area to show the standard deviation
plt.fill_between(x, y - dy, y + dy, alpha=0.3)
plt.xlabel("Number of subsequences")
plt.ylabel("Fraction of sequence not covered")
plt.grid()
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment