@behzadnouri
Created August 3, 2022
erasure recovery probabilities with fault rates
import numpy as np
from scipy.stats import binom
# Notes: with p == 1/3 of nodes faulty, a shred 2 hops away arrives with
# probability (1 - p)**2 == 0.44444444444444453.
# Probability of recovering a 32:64 batch at that rate:
#   1 - binom.cdf(k=31, n=64, p=0.44444444444444453) == 0.22060211153
# Splitting the 32 data shreds into 8-shred chunks gives 32 / 8 == 4 chunks,
# so each chunk must succeed with probability
#   exp(log(0.22060211153) / 4) == 0.6853342180737193
# Smallest chunk batches reaching the required per-chunk probability:
#   1:6   -> 0.97059880589**32
#   2:8   -> 0.93284912456**16
#   8:20  -> 0.73168863611**4
#   16:35 -> 0.50497585388**2
# Only chunk sizes 1, 2, 4, 8, 16 are needed.
# fault_rate = 1 / 3
# fault_rate = 1 - (1 - fault_rate)**2  # 2 hops away
# prob_recover = get_prob_recover(32, 64, fault_rate)
# Probability of recovering `data` many shreds in a batch of size
# `batch_size`, i.e. P(at least `data` of `batch_size` shreds arrive).
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.binom.html
def get_prob_recover(data, batch_size, fault_rate):
    return 1 - binom.cdf(k=data - 1, n=batch_size, p=1 - fault_rate)
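# Sanity check for the notes above (assumes the 2-hop, 1/3-faulty setup): the
# per-chunk probabilities raised to the number of chunks should each be at
# least the 32:64 recovery probability.
# >>> fault_rate = 1 - (1 - 1 / 3)**2
# >>> get_prob_recover(32, 64, fault_rate)      # 0.22060211153
# >>> get_prob_recover(1, 6, fault_rate)**32    # 0.97059880589**32 >= 0.2206
# >>> get_prob_recover(8, 20, fault_rate)**4    # 0.73168863611**4  >= 0.2206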
# # Smallest batch size which will recover `data` many shreds with
# # probability of at least `prob`.
# def get_batch_size(data, fault_rate, prob):
#     for batch_size in range(data, 256):
#         if get_prob_recover(data, batch_size, fault_rate) >= prob:
#             return batch_size
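# Example (an assumption about intended usage): at the 2-hop fault rate the
# smallest batch recovering 32 shreds with probability >= 0.22060211153 is 64,
# i.e. get_batch_size(32, 1 - (1 - 1 / 3)**2, 0.22060211153) == 64.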
# Smallest batch size per chunk such that the chunks' combined success
# probability matches that of recovering `data` shreds in a `batch_size`
# batch.
def get_chunk_batch_size(data, batch_size, chunk_size, fault_rate):
    prob_recover = get_prob_recover(data, batch_size, fault_rate)
    num_chunks = data / chunk_size
    # Individual success probability required of each chunk so that all
    # chunks together succeed with probability `prob_recover`.
    prob = np.exp(np.log(prob_recover) / num_chunks)
    for chunk_batch_size in range(chunk_size, 256):
        if get_prob_recover(chunk_size, chunk_batch_size, fault_rate) >= prob:
            return chunk_batch_size
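# Example: with 8-shred chunks at the 2-hop fault rate this reproduces the
# 8:20 figure from the notes above.
# get_chunk_batch_size(32, 64, 8, 1 - (1 - 1 / 3)**2)  # == 20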
# 0.10 is the lowest fault rate at which the 32:64 recovery probability is
# not numerically == 1.
# run_get_chunk_batch_size(32, 64, 0.10)
def run_get_chunk_batch_size(data, batch_size, fault_rate):
    print("{:0.3f}".format(fault_rate))
    for chunk_size in [1, 2, 4, 8, 16, 32]:
        chunk_batch_size = get_chunk_batch_size(data, batch_size, chunk_size,
                                                fault_rate)
        print("{:2}:{:2}".format(chunk_size, chunk_batch_size))
# Worst-case chunk batch size over fault rates swept from 0 up to the 2-hop
# transmit fault rate.
def find_chunk_batch_size(data, batch_size, chunk_size):
    node_fault_rate = 1 / 3
    transmit_fault_rate = 1 - (1 - node_fault_rate)**2  # 2 hops away
    out = 0
    for fault_rate in np.linspace(start=0, stop=transmit_fault_rate, num=1000):
        chunk_batch_size = get_chunk_batch_size(data, batch_size, chunk_size, fault_rate)
        if chunk_batch_size > out:
            # print(fault_rate)
            out = chunk_batch_size
    return out
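# Example: sweeping fault rates, 8-shred chunks need a batch of up to 30
# (see the 8:30 row in the table below).
# find_chunk_batch_size(32, 64, 8)  # == 30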
# 1:18
# 2:20
# 4:23
# 8:30
# 16:42
# 32:65
def run_find_chunk_batch_size(data, batch_size):
    # for chunk_size in [1, 2, 4, 8, 16, 32]:
    for chunk_size in range(1, 33):
        chunk_batch_size = find_chunk_batch_size(data, batch_size, chunk_size)
        print("{:2}:{:2}".format(chunk_size, chunk_batch_size))
# 0, 18, 20, 22, 23, 25, 27, 28, 30, 32, 33, 35, 36, 38, 39, 41, 42, 43, 45, 46,
# 48, 49, 51, 52, 53, 55, 56, 58, 59, 60, 62, 63, 64
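# The flat table above can be read as chunk_batch_size indexed by chunk_size
# (entry 0 is a placeholder), assuming it was produced along these lines:
# table = [0] + [find_chunk_batch_size(32, 64, c) for c in range(1, 33)]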
# Split n shreds into n // 32 nearly equal chunks (each chunk's size is the
# ceiling of what remains divided by the chunks left).
def batch(n):
    if n < 32:
        return [n]
    k = n // 32
    out = []
    while n > 0:
        out.append((n + k - 1) // k)
        n -= out[-1]
        k -= 1
    return out
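# Minimal demo driver (a sketch; the expected figures come from the notes and
# tables above):
if __name__ == "__main__":
    fault_rate = 1 - (1 - 1 / 3)**2  # 2 hops away, 1/3 faulty nodes
    print(get_prob_recover(32, 64, fault_rate))         # ~0.22060211153
    print(get_chunk_batch_size(32, 64, 8, fault_rate))  # 20
    print(find_chunk_batch_size(32, 64, 8))             # 30
    print(batch(31))                                    # [31]
    print(batch(64))                                    # [32, 32]
    print(batch(100))                                   # [34, 33, 33]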