Scrubber cycle interval model/calculator
import math
from fractions import Fraction

# Model the probability that the cluster experiences data loss due to bitrot.
# Specifically, that every copy of any grid block is corrupted before the scrubber repairs them.
#
# Optimistic assumptions (see below):
# - Faults are independent (i.e. uncorrelated) in space and time. ¹
# - Faults are independent between replicas. ²
#
# Pessimistic assumptions:
# - There are only 3 (quorum_replication) copies of each sector.
# - Scrub randomization is ignored.
# - The simulated fault rate is much greater than a real disk's. (See `sector_faults_per_year`.)
# - Reads due to other workloads besides the scrubber (e.g. compaction) can also trigger repairs,
#   but they are not modelled.
#
# ¹: SSD faults are not independent (in either time or space).
#    See, for example:
#    - "An In-Depth Study of Correlated Failures in Production SSD-Based Data Centers"
#      (https://www.usenix.org/system/files/fast21-han.pdf)
#    - "Flash Reliability in Production: The Expected and the Unexpected"
#      (https://www.usenix.org/system/files/conference/fast16/fast16-papers-schroeder.pdf)
#    That being said, for the purposes of modelling scrubbing, it is a decent approximation,
#    assuming that *the disk faults between replicas are independent*.
#
# ²: To mitigate the risk of correlated errors in production, replicas could use different SSD models.
####################################################################################################
# Parameters

# The number of years that the test is "running".
# As the test runs longer, the probability that the cluster will experience data loss increases.
test_duration_years = 100

# The number of days between scrubs of a particular sector.
# Equivalently, the number of days to scrub the entire data file.
cycle_interval_days = 180

# The total size of the data file.
# Note that since this parameter is separate from the faults/year rate, increasing storage_size
# actually reduces the likelihood of data loss: the same number of faults is spread across more
# sectors, so any particular sector is less likely to be hit.
storage_size = 16 * (1024 ** 4)

# The expected (average) number of sector faults per year.
# I can't find any good, recent statistics for faults on SSDs.
#
# Most papers express the fault rate as "UBER" (uncorrectable bit errors per total bits read).
# But "Flash Reliability in Production: The Expected and the Unexpected" §5.1 finds that UBER's
# underlying assumption, that the number of uncorrectable errors is correlated with the number of
# bytes read, is false. (That paper only shares "fraction of drives affected by an error", which
# is too coarse.)
#
# Instead, the parameter is chosen conservatively greater than the "true" number by at least an
# order of magnitude.
sector_faults_per_year = 10_000
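# (With the parameters in this file, that is 10_000 faults per year across 2^32 sectors, i.e. a
# given sector is faulted roughly 2.3e-6 times per year.)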

# A block has multiple sectors. If any of a block's sectors are corrupt, then the block is corrupt.
# Increasing this increases the likelihood of data loss.
#
# (Intuitively, a single bitrot within 1GiB is more likely than a single bitrot within 1KiB.)
block_size = 512 * 1024

# The total number of copies of each sector.
# The cluster is recoverable if a sector's number of faults is less than `replicas_total`.
# Set to 3 rather than 6 since 3 is the quorum_replication.
replicas_total = 3

# The size of a single disk sector, in bytes.
sector_size = 4 * 1024

####################################################################################################
# Computation

block_sectors = block_size // sector_size
storage_sectors = storage_size // sector_size
storage_blocks = storage_size // block_size
test_duration_days = test_duration_years * 365
test_duration_cycles = math.ceil(test_duration_days / cycle_interval_days)
sector_faults_per_cycle = math.ceil(sector_faults_per_year * cycle_interval_days / 365)
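# With the default parameters above: block_sectors = 128, storage_sectors = 2^32,
# storage_blocks = 2^25, test_duration_cycles = ceil(36_500 / 180) = 203, and
# sector_faults_per_cycle = ceil(10_000 * 180 / 365) = 4_932.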

# P(a particular sector is uncorrupted for an entire cycle)
#
# If we inject a fault randomly into S sectors, then the probability that the fault misses one
# sector in particular is `(S-1)/S`.
# If we repeat this N times, then the chance that it misses that sector every time is `((S-1)/S)^N`.
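# For example, with S = 4 sectors and N = 2 faults, `P = (3/4)^2 = 9/16`.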
# (The two-argument Fraction constructor keeps the ratio exact, instead of routing it through a
# lossy float division.)
p_sector_healthy_per_cycle = Fraction(storage_sectors - 1, storage_sectors) ** sector_faults_per_cycle
print("p_sector_healthy_per_cycle", float(p_sector_healthy_per_cycle))

# P(a particular block is uncorrupted for an entire cycle)
# If any of the block's sectors is corrupted, then the whole block is corrupted.
p_block_healthy_per_cycle = p_sector_healthy_per_cycle ** block_sectors
print("p_block_healthy_per_cycle", float(p_block_healthy_per_cycle))

p_block_corrupt_per_cycle = Fraction(1) - p_block_healthy_per_cycle
print("p_block_corrupt_per_cycle", float(p_block_corrupt_per_cycle))

# P(a particular block is corrupted on all replicas in a cluster during a single cycle)
p_cluster_block_corrupt_per_cycle = p_block_corrupt_per_cycle ** replicas_total
print("p_cluster_block_corrupt_per_cycle", float(p_cluster_block_corrupt_per_cycle))

# P(a particular block is uncorrupted on at least one replica in the cluster during a single cycle)
p_cluster_block_healthy_per_cycle = Fraction(1) - p_cluster_block_corrupt_per_cycle
print("p_cluster_block_healthy_per_cycle", float(p_cluster_block_healthy_per_cycle))
# "Round" to the nearest float since otherwise the next computation takes ages.) | |
p_cluster_block_healthy_per_cycle = float(p_cluster_block_healthy_per_cycle) | |
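# Sanity check: the rounded value must still be a valid probability.
assert 0.0 < p_cluster_block_healthy_per_cycle <= 1.0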

# P(a particular block is uncorrupted on at least one replica in the cluster for all cycles)
# Note that each cycle can be considered independently, because we assume that if, at the end of
# the cycle, there is at least one healthy copy, then all of the corrupt copies are repaired.
p_cluster_block_healthy_per_span = p_cluster_block_healthy_per_cycle ** test_duration_cycles
print("p_cluster_block_healthy_per_span", float(p_cluster_block_healthy_per_span))

# P(each block is uncorrupted on at least one replica in the cluster for all cycles)
p_cluster_blocks_healthy_per_span = p_cluster_block_healthy_per_span ** storage_blocks
print("p_cluster_blocks_healthy_per_span", float(p_cluster_blocks_healthy_per_span))

# P(at some point during all cycles, a block is corrupt across all replicas)
# (At this point the value is a plain float, so a plain float subtraction suffices.)
p_cluster_blocks_corrupt_per_span = 1.0 - p_cluster_blocks_healthy_per_span
print(f"P(data loss) = {p_cluster_blocks_corrupt_per_span:e}")