sentientwaffle/scrub3.py

## scrub3.py
import math
from fractions import Fraction

# Model the probability that the cluster experiences data loss due to bitrot.
# Specifically, that every copy of any grid block is corrupted before the scrubber repairs them.
#
# Optimistic assumptions (see below):
# - Faults are independent (i.e. uncorrelated) in space and time. ¹
# - Faults are independent between replicas. ²
#
# Pessimistic assumptions:
# - There are only 3 (quorum_replication) copies of each sector.
# - Scrub randomization is ignored.
# - The simulated fault rate is much greater than a real disk's. (See `sector_faults_per_year`).
# - Reads due to other workloads besides the scrubber (e.g. compaction) can also trigger repairs,
#   but they are not modelled.
#
# ¹: SSD faults are not independent (in either time or space).
# See, for example:
# - "An In-Depth Study of Correlated Failures in Production SSD-Based Data Centers"
#   (https://www.usenix.org/system/files/fast21-han.pdf)
# - "Flash Reliability in Production: The Expected and the Unexpected"
#   (https://www.usenix.org/system/files/conference/fast16/fast16-papers-schroeder.pdf)
# That being said, for the purposes of modelling scrubbing, it is a decent approximation,
# assuming that *the disk faults between replicas are independent*.
#
# ²: To mitigate the risk of correlated errors in production, replicas could use different SSD models.

####################################################################################################
# Parameters

# The number of years that the test is "running".
# As the test runs longer, the probability that the cluster will experience data loss increases.
test_duration_years = 100

# The number of days between scrubs of a particular sector.
# Equivalently, the number of days to scrub the entire data file.
cycle_interval_days = 180

# The total size of the data file.
# Note that since this parameter is separate from the faults/year rate, increasing storage_size
# actually reduces the likelihood of data loss.
storage_size = 16 * (1024 ** 4)

# The expected (average) number of sector faults per year.
# I can't find any good, recent statistics for faults on SSDs.
#
# Most papers express the fault rate as "UBER" (uncorrectable bit errors per total bits read).
# But "Flash Reliability in Production: The Expected and the Unexpected" §5.1 finds that UBER's
# underlying assumption that the uncorrectable errors is correlated to the number of bytes read
# is false. (That paper only shares "fraction of drives affected by an error", which is too coarse.)
#
# Instead, the parameter is chosen conservatively greater than the "true" number by at least an
# order of magnitude.
sector_faults_per_year = 10_000

# A block has multiple sectors. If any of a block's sectors are corrupt, then the block is corrupt.
# Increasing this increases the likelihood of data loss.
#
# (Intuitively, a single bitrot within 1GiB is more likely than a single bitrot within 1KiB.)
block_size = 512 * 1024

# The total number of copies of each sector.
# The cluster is recoverable if a sector's number of faults is less than `replicas_total`.
# Set to 3 rather than 6 since 3 is the quorum_replication.
replicas_total = 3

sector_size = 4 * 1024

####################################################################################################
# Computation

block_sectors = block_size // sector_size
storage_sectors = storage_size // sector_size
storage_blocks = storage_size // block_size
test_duration_days = test_duration_years * 365
test_duration_cycles = math.ceil(test_duration_days / cycle_interval_days)
sector_faults_per_cycle = math.ceil(sector_faults_per_year * cycle_interval_days / 365)

# P(a particular sector is uncorrupted for an entire cycle)
#
# If we inject a fault randomly into S sectors, then the probability that the fault misses one
# sector in particular is `(S-1)/S`.
# If we repeat this N times, then the chance that it misses that sector every time is `((S-1)/S)^N`.
p_sector_healthy_per_cycle = Fraction((storage_sectors - 1) / storage_sectors) ** sector_faults_per_cycle
print("p_sector_healthy_per_cycle", float(p_sector_healthy_per_cycle))

# P(a particular block is uncorrupted for an entire cycle)
# If any of the block's sectors is corrupted, then the whole block is corrupted.
p_block_healthy_per_cycle = p_sector_healthy_per_cycle ** block_sectors
print("p_block_healthy_per_cycle", float(p_block_healthy_per_cycle))

p_block_corrupt_per_cycle = Fraction(1.0) - p_block_healthy_per_cycle
print("p_block_corrupt_per_cycle", float(p_block_corrupt_per_cycle))
# P(a particular block is corrupted on all replicas in a cluster during a single cycle)
p_cluster_block_corrupt_per_cycle = p_block_corrupt_per_cycle ** replicas_total
print("p_cluster_block_corrupt_per_cycle", float(p_cluster_block_corrupt_per_cycle))
# P(a particular block is uncorrupted on at least one replica in the cluster during a single cycle)
p_cluster_block_healthy_per_cycle = Fraction(1.0) - p_cluster_block_corrupt_per_cycle
print("p_cluster_block_healthy_per_cycle", float(p_cluster_block_healthy_per_cycle))

# "Round" to the nearest float since otherwise the next computation takes ages.)
p_cluster_block_healthy_per_cycle = float(p_cluster_block_healthy_per_cycle)

# P(a particular block is uncorrupted on at least one replica in the cluster for all cycles)
# Note that each cycle can be considered independently because we assume that if is at the end of
# the cycle there is at least one healthy copy, then all of the corrupt copies are repaired.
p_cluster_block_healthy_per_span = p_cluster_block_healthy_per_cycle ** test_duration_cycles
print("p_cluster_block_healthy_per_span", float(p_cluster_block_healthy_per_span))

# P(each block is uncorrupted on at least one replica in the cluster for all cycles)
p_cluster_blocks_healthy_per_span = p_cluster_block_healthy_per_span ** storage_blocks
print("p_cluster_blocks_healthy_per_span", float(p_cluster_blocks_healthy_per_span))

# P(at some point during all cycles, a block is corrupt across all replicas)
p_cluster_blocks_corrupt_per_span = Fraction(1.0) - p_cluster_blocks_healthy_per_span

print(f"P(data loss) = {p_cluster_blocks_corrupt_per_span:e}")
	import math
	from fractions import Fraction

	# Model the probability that the cluster experiences data loss due to bitrot.
	# Specifically, that every copy of any grid block is corrupted before the scrubber repairs them.
	#
	# Optimistic assumptions (see below):
	# - Faults are independent (i.e. uncorrelated) in space and time. ¹
	# - Faults are independent between replicas. ²
	#
	# Pessimistic assumptions:
	# - There are only 3 (quorum_replication) copies of each sector.
	# - Scrub randomization is ignored.
	# - The simulated fault rate is much greater than a real disk's. (See `sector_faults_per_year`).
	# - Reads due to other workloads besides the scrubber (e.g. compaction) can also trigger repairs,
	# but they are not modelled.
	#
	# ¹: SSD faults are not independent (in either time or space).
	# See, for example:
	# - "An In-Depth Study of Correlated Failures in Production SSD-Based Data Centers"
	# (https://www.usenix.org/system/files/fast21-han.pdf)
	# - "Flash Reliability in Production: The Expected and the Unexpected"
	# (https://www.usenix.org/system/files/conference/fast16/fast16-papers-schroeder.pdf)
	# That being said, for the purposes of modelling scrubbing, it is a decent approximation,
	# assuming that the disk faults between replicas are independent.
	#
	# ²: To mitigate the risk of correlated errors in production, replicas could use different SSD models.

	####################################################################################################
	# Parameters

	# The number of years that the test is "running".
	# As the test runs longer, the probability that the cluster will experience data loss increases.
	test_duration_years = 100

	# The number of days between scrubs of a particular sector.
	# Equivalently, the number of days to scrub the entire data file.
	cycle_interval_days = 180

	# The total size of the data file.
	# Note that since this parameter is separate from the faults/year rate, increasing storage_size
	# actually reduces the likelihood of data loss.
	storage_size = 16 * (1024 ** 4)

	# The expected (average) number of sector faults per year.
	# I can't find any good, recent statistics for faults on SSDs.
	#
	# Most papers express the fault rate as "UBER" (uncorrectable bit errors per total bits read).
	# But "Flash Reliability in Production: The Expected and the Unexpected" §5.1 finds that UBER's
	# underlying assumption that the uncorrectable errors is correlated to the number of bytes read
	# is false. (That paper only shares "fraction of drives affected by an error", which is too coarse.)
	#
	# Instead, the parameter is chosen conservatively greater than the "true" number by at least an
	# order of magnitude.
	sector_faults_per_year = 10_000

	# A block has multiple sectors. If any of a block's sectors are corrupt, then the block is corrupt.
	# Increasing this increases the likelihood of data loss.
	#
	# (Intuitively, a single bitrot within 1GiB is more likely than a single bitrot within 1KiB.)
	block_size = 512 * 1024

	# The total number of copies of each sector.
	# The cluster is recoverable if a sector's number of faults is less than `replicas_total`.
	# Set to 3 rather than 6 since 3 is the quorum_replication.
	replicas_total = 3

	sector_size = 4 * 1024

	####################################################################################################
	# Computation

	block_sectors = block_size // sector_size
	storage_sectors = storage_size // sector_size
	storage_blocks = storage_size // block_size
	test_duration_days = test_duration_years * 365
	test_duration_cycles = math.ceil(test_duration_days / cycle_interval_days)
	sector_faults_per_cycle = math.ceil(sector_faults_per_year * cycle_interval_days / 365)

	# P(a particular sector is uncorrupted for an entire cycle)
	#
	# If we inject a fault randomly into S sectors, then the probability that the fault misses one
	# sector in particular is `(S-1)/S`.
	# If we repeat this N times, then the chance that it misses that sector every time is `((S-1)/S)^N`.
	p_sector_healthy_per_cycle = Fraction((storage_sectors - 1) / storage_sectors) ** sector_faults_per_cycle
	print("p_sector_healthy_per_cycle", float(p_sector_healthy_per_cycle))

	# P(a particular block is uncorrupted for an entire cycle)
	# If any of the block's sectors is corrupted, then the whole block is corrupted.
	p_block_healthy_per_cycle = p_sector_healthy_per_cycle ** block_sectors
	print("p_block_healthy_per_cycle", float(p_block_healthy_per_cycle))

	p_block_corrupt_per_cycle = Fraction(1.0) - p_block_healthy_per_cycle
	print("p_block_corrupt_per_cycle", float(p_block_corrupt_per_cycle))
	# P(a particular block is corrupted on all replicas in a cluster during a single cycle)
	p_cluster_block_corrupt_per_cycle = p_block_corrupt_per_cycle ** replicas_total
	print("p_cluster_block_corrupt_per_cycle", float(p_cluster_block_corrupt_per_cycle))
	# P(a particular block is uncorrupted on at least one replica in the cluster during a single cycle)
	p_cluster_block_healthy_per_cycle = Fraction(1.0) - p_cluster_block_corrupt_per_cycle
	print("p_cluster_block_healthy_per_cycle", float(p_cluster_block_healthy_per_cycle))

	# "Round" to the nearest float since otherwise the next computation takes ages.)
	p_cluster_block_healthy_per_cycle = float(p_cluster_block_healthy_per_cycle)

	# P(a particular block is uncorrupted on at least one replica in the cluster for all cycles)
	# Note that each cycle can be considered independently because we assume that if is at the end of
	# the cycle there is at least one healthy copy, then all of the corrupt copies are repaired.
	p_cluster_block_healthy_per_span = p_cluster_block_healthy_per_cycle ** test_duration_cycles
	print("p_cluster_block_healthy_per_span", float(p_cluster_block_healthy_per_span))

	# P(each block is uncorrupted on at least one replica in the cluster for all cycles)
	p_cluster_blocks_healthy_per_span = p_cluster_block_healthy_per_span ** storage_blocks
	print("p_cluster_blocks_healthy_per_span", float(p_cluster_blocks_healthy_per_span))

	# P(at some point during all cycles, a block is corrupt across all replicas)
	p_cluster_blocks_corrupt_per_span = Fraction(1.0) - p_cluster_blocks_healthy_per_span

	print(f"P(data loss) = {p_cluster_blocks_corrupt_per_span:e}")