|
from dataclasses import dataclass |
|
from functools import partial |
|
import hashlib |
|
import math |
|
from os import write |
|
import random |
|
import string |
|
|
|
# See below BlockDevice for FileStore skeleton |
|
|
|
def get_random_string(length: int) -> str:
    """Return a random string of *length* lowercase ASCII letters."""
    letters = [random.choice(string.ascii_lowercase) for _ in range(length)]
    return "".join(letters)
|
|
|
@dataclass
class BlockDevice:
    """Simulated block device whose reads and writes may corrupt blocks.

    Fields:
        block_size: Number of characters each block holds.
        blocks: Backing storage, one fixed-size string per block.
        corruption_rate: Probability (0..1) that any single read or
            write corrupts the touched block.
    """

    block_size: int
    blocks: list[str]
    corruption_rate: float

    @classmethod
    def get_new_block_device(
        cls, block_count=1024 * 1024, block_size=8, corruption_rate=0.01
    ):
        """Create a new block device with the desired parameters.

        Alternate constructor that fills every block with random data,
        mimicking an unformatted device.  (Made a classmethod so
        subclasses construct instances of themselves.)

        :param block_count: Number of blocks on the device.
        :param block_size: Characters per block.
        :param corruption_rate: Per-operation corruption probability.
        :returns: A freshly initialized block device.
        """
        blocks = [get_random_string(block_size) for _ in range(block_count)]
        return cls(
            block_size=block_size, blocks=blocks, corruption_rate=corruption_rate
        )

    @property
    def block_count(self):
        """How many blocks does this device contain."""
        return len(self.blocks)

    def read(self, index: int) -> str:
        """Read the block at the specified index.

        This method may corrupt the underlying block in a consistent
        manner, such that the original data is unretrievable in the
        future, as well.

        :param index: Index of the block to read.
        :returns: Content of the block.
        :raises IndexError: If ``index`` is outside the device.
        """
        uncorrupted = self.blocks[index]
        return self._maybe_corrupt(uncorrupted, index)

    def write(self, index: int, content: str) -> str:
        """Write the specified content to the specified block index.

        This method may corrupt the data before persisting it to the
        underlying block; however, it will return the content that
        was actually written.

        :: Note: The input content will be truncated to the block_size
        of this block device.

        :param index: Block index to write into.
        :param content: Content to be written, only uses the first block_size
            characters.

        :returns: What content was written to the block device
        :raises IndexError: If ``index`` is outside the device.
        """
        truncated = content[: self.block_size]
        current_content = self.blocks[index]
        # A short write keeps the tail of the block's previous content in
        # place, like a real device leaving untouched bytes as-is.
        new_content = truncated + current_content[len(truncated) :]
        return self._maybe_corrupt(new_content, index)

    def _maybe_corrupt(self, content: str, index: int) -> str:
        """Persist *content* to block *index*, possibly corrupting it.

        With probability ``corruption_rate`` the stored (and returned)
        data is replaced by random characters; otherwise *content* is
        stored verbatim.  Either way the block is updated.
        """
        should_corrupt = random.uniform(0, 1) < self.corruption_rate
        if should_corrupt:
            maybe_corrupted = get_random_string(self.block_size)
        else:
            maybe_corrupted = content
        self.blocks[index] = maybe_corrupted
        return maybe_corrupted
|
|
|
|
|
# Create a block device with 1024 blocks that can each fit an
# 8-character string; every read/write has a 1% chance of corruption.
block_device = BlockDevice.get_new_block_device(
    block_count=1024, block_size=8, corruption_rate=0.01)

# Write content to block 0.
written = block_device.write(index=0, content='12345678')
print(f"Writing '12345678' to the block device: {written}")

# Read content from block 0.
read_back = block_device.read(index=0)
print(f"Reading from the block device: {read_back}")

# If content length exceeds the block size, the content is truncated.
overlong = block_device.write(index=0, content='123456789')
print(f"Writing '123456789' to the block device: {overlong}")
|
########################## |
|
# Begin Implementing Below |
|
########################## |
|
|
|
class FileStore:
    """A small file store layered on top of an unreliable BlockDevice.

    Durability scheme: every chunk of a file is written twice -- to a
    primary block and to a backup block at the next index -- and an md5
    checksum is recorded per block.  Reads verify the checksum, fall
    back to the backup copy on corruption, and repair the primary from
    the backup when possible.
    """

    def __init__(self, failure_rate=0.01, retries=3) -> None:
        """Create a store backed by a fresh block device.

        :param failure_rate: Corruption probability forwarded to the
            underlying block device (applies to every read and write).
        :param retries: Maximum write attempts before declaring failure.
        """
        self.blockdevice = BlockDevice.get_new_block_device(corruption_rate=failure_rate)
        # File map contains the filename and the primary block indexes
        # that hold the file contents, in order.
        self.filemap = {}
        # partials maps block index -> meaningful length, for final
        # chunks that did not fully take up the entire block_size
        # (the remainder of such a block is junk data).
        self.partials = {}
        # Next unused block index.  Blocks are never reclaimed (delete
        # only drops metadata), so this grows monotonically.
        self.next_free_block = 0
        # Dictionary containing an md5 hash for each block to check for
        # data corruption.
        self.checksums = {}
        self.retries = retries

    def add_checksum(self, block_index: int, content: str):
        """Record and return the md5 hex digest of *content* for a block.

        The stored digest lets later reads quickly detect corruption.

        :param block_index: Block the content is (about to be) stored in.
        :param content: Exact string whose integrity must be verifiable.
        :returns: The hex digest saved in ``self.checksums``.
        """
        checksum = hashlib.md5(content.encode()).hexdigest()
        self.checksums[block_index] = checksum
        return checksum

    def attempt_write(self, block_index: int, content: str) -> str:
        """Write *content* to a block, retrying until the checksum matches.

        :param block_index: Block index to write into.
        :param content: Content to persist (at most block_size chars).
        :returns: The verified content that was actually written.
        :raises Exception: If the write still fails after ``self.retries``
            attempts.
        """
        write_check = None
        checksum = self.add_checksum(block_index, content)
        result = ""
        retry = 0
        while write_check != checksum and retry != self.retries:
            # The device may corrupt the write; verify what landed,
            # trimmed to the length we actually care about.
            result = self.blockdevice.write(block_index, content)[:len(content)]
            write_check = hashlib.md5(result.encode()).hexdigest()
            retry += 1

        if retry == self.retries and write_check != checksum:
            raise Exception(f"Write failed for block {block_index}, retries exceeded")
        return result

    def attempt_read(self, block_index: int) -> str:
        """Read a chunk, falling back to its backup block on corruption.

        If the primary copy fails verification (and is therefore now
        corrupt on the device), the backup at ``block_index + 1`` is
        tried and, on success, used to repair the primary.

        :param block_index: Primary block index of the chunk.
        :returns: The verified chunk content (truncated for partials).
        :raises Exception: If both the primary and the backup copy fail
            verification -- permanent data loss.
        """
        primary_index = block_index
        read_check = None
        checksum = self.checksums[primary_index]
        # Partial-length metadata is keyed by the primary index and
        # applies to the backup too (both hold the same chunk).
        is_partial = primary_index in self.partials
        tries = 0
        result = ""
        need_rewrite = False
        # We only get two shots at reading a chunk: the main block and
        # then the backup block.
        # TODO - make more copies of the block so tries can be more than 2
        while read_check != checksum and tries != 2:
            result = self.blockdevice.read(block_index)
            if is_partial:
                # Drop the junk tail beyond the chunk's real length.
                result = result[:self.partials[primary_index]]
            read_check = hashlib.md5(result.encode()).hexdigest()
            if read_check != checksum:
                # Read failed, and the block is now corrupt. Try the backup.
                block_index += 1
                need_rewrite = True
            tries += 1

        if read_check != checksum:
            # Report the primary index: the loop above advanced
            # ``block_index`` past the backup while failing.
            raise Exception(f"Read has failed and data loss has occurred for block {primary_index}")
        # Repair the primary with the data recovered from the backup.
        if need_rewrite:
            self.attempt_write(primary_index, result)
        return result

    def put(self, file_name: str, file_content: str) -> str:
        """Create a file with the specified name and content.

        The content is split into block_size chunks; each chunk is
        written to a primary block and mirrored to the adjacent backup
        block.

        :param file_name: Name to store the content under.
        :param file_content: Full file body.
        :returns: The content as actually persisted (equals the input
            when every write verifies).
        :raises ValueError: If the device lacks space for the file.
        """
        block_size = self.blockdevice.block_size
        # Calculate the number of chunks this file_content will require.
        chunks = math.ceil(len(file_content) / block_size)

        # We consume two blocks per chunk, one as the primary and one as
        # the backup.
        # TODO - replace with a parity scheme that doesn't waste half the blocks
        # Each chunk consumes exactly two blocks, so compare blocks
        # consumed against the device size.  (The previous check added a
        # chunk count to a raw block index and compared it to half the
        # device, spuriously rejecting files that would have fit.)
        if self.next_free_block + 2 * chunks > self.blockdevice.block_count:
            raise ValueError("Insufficient space")

        # Creates a new file with the specified name and content.
        # NOTE(review): re-putting an existing name abandons its old
        # blocks and their metadata; blocks are never reclaimed anyway,
        # but a reclamation pass would fix both.
        result = ""
        self.filemap[file_name] = []

        for i in range(chunks):
            if i == chunks - 1:
                # Last chunk: grab the tail of file_content, which can
                # be shorter than a full block (block_size).
                content = file_content[(chunks - 1) * block_size:]
                # If the chunk is smaller than block_size we need to
                # track its length to determine where junk data begins.
                if len(content) < block_size:
                    self.partials[self.next_free_block] = len(content)
            else:
                # Grab the i'th chunk of file_content, block_size long.
                content = file_content[i * block_size:(i + 1) * block_size]

            result = result + self.attempt_write(self.next_free_block, content)

            # Write a backup copy to the adjacent block.
            self.attempt_write(self.next_free_block + 1, content)

            # Record the primary block in the file map, then advance
            # past the primary/backup pair.
            self.filemap[file_name].append(self.next_free_block)
            self.next_free_block += 2
        return result

    def get(self, file_name: str) -> str:
        """Return the full content of a previously stored file.

        :param file_name: Name previously passed to :meth:`put`.
        :returns: The reassembled file content.
        :raises ValueError: If the file name is unknown.
        :raises Exception: If a chunk suffered unrecoverable corruption.
        """
        if file_name not in self.filemap:
            raise ValueError("Invalid file_name")
        # Reassemble the file from its verified chunks, in order.
        return "".join(
            self.attempt_read(block) for block in self.filemap[file_name]
        )

    def delete(self, file_name: str) -> bool:
        """Delete a file's metadata (blocks themselves are not reclaimed).

        :param file_name: Name of the file to delete.
        :returns: True on success.
        :raises ValueError: If the file name is unknown (matches
            :meth:`get`; previously a bare KeyError escaped here).
        """
        if file_name not in self.filemap:
            raise ValueError("Invalid file_name")
        # For every primary block that the file used to use:
        for block in self.filemap[file_name]:
            # Delete the checksum metadata for the primary block...
            del self.checksums[block]
            # ...and for its backup, which was previously leaked.
            self.checksums.pop(block + 1, None)
            # Delete the partial metadata for the block.
            if block in self.partials:
                del self.partials[block]
        del self.filemap[file_name]
        return True
|
|
|
import pytest |
|
import unittest |
|
|
|
class BlockDeviceTest(unittest.TestCase):
    """Exercise the raw BlockDevice read/write interface.

    NOTE: the device keeps its 1% corruption rate here, so these
    assertions are probabilistic by design.
    """

    def setUp(self) -> None:
        self.block_device = BlockDevice.get_new_block_device(
            block_count=1024, block_size=8, corruption_rate=0.01)

    def test_write(self):
        # A write normally echoes back exactly what was stored.
        written = self.block_device.write(0, '1234abcd')
        self.assertEqual(written, '1234abcd')

    def test_read(self):
        # Every block, corrupted or not, is exactly block_size long.
        content = self.block_device.read(1)
        self.assertEqual(len(content), 8)
|
|
|
class FileStoreTest(unittest.TestCase): |
|
|
|
def setUp(self) -> None: |
|
self.file_store = FileStore() |
|
|
|
def test_put(self): |
|
self.assertEqual( |
|
self.file_store.put( |
|
"hello.txt", |
|
'lorem ipsum dolor sit amet' |
|
), |
|
"lorem ipsum dolor sit amet" |
|
) |
|
|
|
@pytest.mark.skip(reason="CPU intensive") |
|
def test_put_multi(self): |
|
self.assertEqual( |
|
self.file_store.put( |
|
"test1.txt", |
|
'foobar12365799' |
|
), |
|
"foobar12365799" |
|
) |
|
self.assertEqual( |
|
self.file_store.put( |
|
"test2.txt", |
|
'foobar987654213' |
|
), |
|
"foobar987654213" |
|
) |
|
|
|
|
|
def test_get(self): |
|
self.file_store.put( |
|
"hello.txt", |
|
'lorem ipsum dolor sit amet' |
|
) |
|
self.assertEqual( |
|
self.file_store.get("hello.txt"), |
|
"lorem ipsum dolor sit amet" |
|
) |
|
|
|
def test_delete(self): |
|
self.file_store.put( |
|
"hello.txt", |
|
'lorem ipsum dolor sit amet' |
|
) |
|
self.assertEqual( |
|
self.file_store.get("hello.txt"), |
|
"lorem ipsum dolor sit amet" |
|
) |
|
self.assertTrue(self.file_store.delete("hello.txt")) |
|
with self.assertRaises(ValueError): |
|
self.file_store.get("hello.txt") |
|
|
|
|
|
class Cheating(FileStoreTest):
    """Re-run the FileStore suite with corruption disabled entirely."""

    def setUp(self) -> None:
        self.file_store = FileStore(failure_rate=0.0)
|
|
|
class VeryUnreliable(FileStoreTest):
    """Re-run the FileStore suite against a 30% corruption rate."""

    def setUp(self) -> None:
        self.file_store = FileStore(failure_rate=0.3)
|
|
|
pytest.main(['--verbose','-s', __file__]) |