-
-
Save behzadnouri/b9c7133f4a4d36394d318ebcacf3c734 to your computer and use it in GitHub Desktop.
erasure recovery probabilities with fault rates
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from scipy.stats import binom | |
# binom.cdf(k=32, n=64, p=0.44444444444444453) | |
# p == 1/3, faulty nodes | |
# 2 hops away = (1 - p)**2 = 0.44444444444444453 | |
# 32/64 = 0.22060211153 | |
# for 8 shreds: exp(log(0.22060211153) / 4) = 0.6853342180737193 | |
# 1:6 = 0.97059880589**32 | |
# 2:8 = 0.93284912456**16 | |
# 8:20 = 0.73168863611**4 | |
# 16:35 = 0.50497585388**2 | |
# only need: 1, 2, 4, 8, 16 | |
# fault_rate = 1 / 3 | |
# fault_rate = 1 - (1 - fault_rate)**2 # 2 hops away | |
# prob_recover = get_prob_recover(32, 64, fault_rate) | |
# Probability of recovering `data` many shreds in a batch of size `batch_size`.
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.binom.html
def get_prob_recover(data, batch_size, fault_rate):
    """Probability of receiving at least `data` shreds from a batch of
    `batch_size`, where each shred independently arrives with probability
    1 - fault_rate.

    Computed as P(X >= data) = 1 - P(X <= data - 1) for
    X ~ Binomial(batch_size, 1 - fault_rate).
    """
    arrival_rate = 1 - fault_rate
    prob_too_few = binom.cdf(k=data - 1, n=batch_size, p=arrival_rate)
    return 1 - prob_too_few
# # Smallest batch size which will recover data many shreds with probability of
# # at least `prob`.
# def get_batch_size(data, fault_rate, prob):
#     for batch_size in range(data, 256):
#         if get_prob_recover(data, batch_size, fault_rate) >= prob:
#             return batch_size
# Per-chunk batch size matching the success probability required of each
# individual chunk.
def get_chunk_batch_size(data, batch_size, chunk_size, fault_rate):
    """Smallest batch size for a chunk of `chunk_size` data shreds so that all
    chunks together are recovered at least as reliably as the full
    (data : batch_size) batch at the given fault rate.

    The target per-chunk probability is chosen so that, compounded over
    data / chunk_size chunks, it equals the full batch's recovery probability.
    Returns None when no batch size below 256 suffices.
    """
    full_batch_prob = get_prob_recover(data, batch_size, fault_rate)
    num_chunks = data / chunk_size
    # n-th root of the full-batch probability, via exp(log(p) / n).
    per_chunk_prob = np.exp(np.log(full_batch_prob) / num_chunks)
    return next(
        (size for size in range(chunk_size, 256)
         if get_prob_recover(chunk_size, size, fault_rate) >= per_chunk_prob),
        None,
    )
# 0.10 is the lowest probability at which the 32:64 result is not == 1.
# run_get_chunk_batch_size(32, 64, 0.10)
def run_get_chunk_batch_size(data, batch_size, fault_rate):
    """Print the per-chunk batch size for several chunk sizes at one fault rate."""
    print("{:0.3f}".format(fault_rate))
    for chunk_size in (1, 2, 4, 8, 16, 32):
        size = get_chunk_batch_size(data, batch_size, chunk_size, fault_rate)
        print("{:2}:{:2}".format(chunk_size, size))
def find_chunk_batch_size(data, batch_size, chunk_size):
    """Largest (worst-case) per-chunk batch size over fault rates swept from 0
    up to the two-hop transmit fault rate implied by 1/3 faulty nodes."""
    node_fault_rate = 1 / 3
    # A shred arrives only if both hops succeed, hence the squared term.
    transmit_fault_rate = 1 - (1 - node_fault_rate) ** 2
    worst = 0
    for rate in np.linspace(start=0, stop=transmit_fault_rate, num=1000):
        candidate = get_chunk_batch_size(data, batch_size, chunk_size, rate)
        if candidate > worst:
            # print(rate)
            worst = candidate
    return worst
# 1:18 | |
# 2:20 | |
# 4:23 | |
# 8:30 | |
# 16:42 | |
# 32:65 | |
def run_find_chunk_batch_size(data, batch_size):
    """Print the worst-case per-chunk batch size for every chunk size 1..32."""
    for chunk_size in range(1, 33):
        worst = find_chunk_batch_size(data, batch_size, chunk_size)
        print("{:2}:{:2}".format(chunk_size, worst))
# 0, 18, 20, 22, 23, 25, 27, 28, 30, 32, 33, 35, 36, 38, 39, 41, 42, 43, 45, 46, | |
# 48, 49, 51, 52, 53, 55, 56, 58, 59, 60, 62, 63, 64 | |
def batch(n):
    """Partition n shreds into n // 32 batches of as-equal-as-possible sizes
    (ceil division of what remains across the batches still to fill).

    If n < 32 the whole thing is a single batch of n.
    """
    if n < 32:
        return [n]
    num_batches = n // 32
    sizes = []
    remaining = n
    while remaining > 0:
        # -(-a // b) is ceil(a / b) for positive b.
        size = -(-remaining // num_batches)
        sizes.append(size)
        remaining -= size
        num_batches -= 1
    return sizes
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment