MarcinKonowalczyk/sequence_sampling_experiment.py

## sequence_sampling_experiment.py
# Test how many subsequences we want to sample from a sequence to, approximately, cover the entire sequence.

import numpy as np

# ======
N = 1982  # sequence length
L = 79  # sequence length
# ======

R = 100  # number of repetitions to average out randomness
M = 10 * N // L  # number of subsequences to sample

n_zeros = np.zeros((M, R))

for r in range(R):
    seq = np.zeros(N)
    for i in range(M):
        si = np.random.randint(0, N - L + 1)
        seq[si : si + L] += 1
        n_zeros[i, r] = np.sum(seq == 0)


x = np.arange(1, M + 1)
y = n_zeros.mean(axis=1) / N
dy = n_zeros.std(axis=1) / N

x = np.array([0] + list(x))
y = np.array([1] + list(y))
dy = np.array([0] + list(dy))

# fit an exponential function to the data
from scipy.optimize import curve_fit
import inspect
import colorama


def f(x, a, b1, b2):
    """Exponential function going through (0, 1)"""
    return a * np.exp(-b1 * x) + (1 - a) * np.exp(-b2 * x)


pnames = inspect.getfullargspec(f).args[1:]
popt, pcov = curve_fit(f, x, y)

popt = {name: value for name, value in zip(pnames, popt)}
yf = f(x, **popt)

print("Covariance matrix:")
print(pcov)
print("Fit parameters:")
print(popt)

b = max(popt["b1"], popt["b2"])
print(f"1/b = {1/b:.2f} ({np.ceil(1/b)} subsequences)")
print(f"covered after 3 * 1/b = {3/b:.2f} subsequences: {1 - f(3 / b, **popt):.2f}")
print(f"covered after 5 * 1/b = {5/b:.2f} subsequences: {1 - f(5 / b, **popt):.2f}")

nl = N / L
print(f"N/L = {nl:.2f}")
print(f"covered after 3 * N/L = {3 * nl:.2f} subsequences: {1 - f(3 * nl, **popt):.2f}")

delta = abs(nl - 1 / b) / nl * 100
print(f"delta = {delta:.2f}%")
if delta < 5:
    # print("The two estimates are close to each other.")
    print(
        colorama.Fore.GREEN
        + "The two estimates are close to each other."
        + colorama.Style.RESET_ALL
    )
else:
    # print("The two estimates are not close to each other.")
    print(
        colorama.Fore.RED
        + "The two estimates are not close to each other."
        + colorama.Style.RESET_ALL
    )

# plot the data
import matplotlib.pyplot as plt

plt.plot(x, y, "b-", label="data")
plt.plot(x, yf, "k--", label="fit")

# Add a shaded area to show the standard deviation
plt.fill_between(x, y - dy, y + dy, alpha=0.3)

plt.xlabel("Number of subsequences")
plt.ylabel("Fraction of sequence not covered")
plt.grid()
plt.show()
	# Test how many subsequences we want to sample from a sequence to, approximately, cover the entire sequence.

	import numpy as np

	# ======
	N = 1982 # sequence length
	L = 79 # sequence length
	# ======

	R = 100 # number of repetitions to average out randomness
	M = 10 * N // L # number of subsequences to sample

	n_zeros = np.zeros((M, R))

	for r in range(R):
	seq = np.zeros(N)
	for i in range(M):
	si = np.random.randint(0, N - L + 1)
	seq[si : si + L] += 1
	n_zeros[i, r] = np.sum(seq == 0)


	x = np.arange(1, M + 1)
	y = n_zeros.mean(axis=1) / N
	dy = n_zeros.std(axis=1) / N

	x = np.array([0] + list(x))
	y = np.array([1] + list(y))
	dy = np.array([0] + list(dy))

	# fit an exponential function to the data
	from scipy.optimize import curve_fit
	import inspect
	import colorama


	def f(x, a, b1, b2):
	"""Exponential function going through (0, 1)"""
	return a * np.exp(-b1 * x) + (1 - a) * np.exp(-b2 * x)


	pnames = inspect.getfullargspec(f).args[1:]
	popt, pcov = curve_fit(f, x, y)

	popt = {name: value for name, value in zip(pnames, popt)}
	yf = f(x, **popt)

	print("Covariance matrix:")
	print(pcov)
	print("Fit parameters:")
	print(popt)

	b = max(popt["b1"], popt["b2"])
	print(f"1/b = {1/b:.2f} ({np.ceil(1/b)} subsequences)")
	print(f"covered after 3 * 1/b = {3/b:.2f} subsequences: {1 - f(3 / b, **popt):.2f}")
	print(f"covered after 5 * 1/b = {5/b:.2f} subsequences: {1 - f(5 / b, **popt):.2f}")

	nl = N / L
	print(f"N/L = {nl:.2f}")
	print(f"covered after 3 * N/L = {3 * nl:.2f} subsequences: {1 - f(3 * nl, **popt):.2f}")

	delta = abs(nl - 1 / b) / nl * 100
	print(f"delta = {delta:.2f}%")
	if delta < 5:
	# print("The two estimates are close to each other.")
	print(
	colorama.Fore.GREEN
	+ "The two estimates are close to each other."
	+ colorama.Style.RESET_ALL
	)
	else:
	# print("The two estimates are not close to each other.")
	print(
	colorama.Fore.RED
	+ "The two estimates are not close to each other."
	+ colorama.Style.RESET_ALL
	)

	# plot the data
	import matplotlib.pyplot as plt

	plt.plot(x, y, "b-", label="data")
	plt.plot(x, yf, "k--", label="fit")

	# Add a shaded area to show the standard deviation
	plt.fill_between(x, y - dy, y + dy, alpha=0.3)

	plt.xlabel("Number of subsequences")
	plt.ylabel("Fraction of sequence not covered")
	plt.grid()
	plt.show()