|
from dataclasses import dataclass |
|
from functools import partial |
|
import hashlib |
|
import math |
|
from os import write |
|
import random |
|
import string |
|
|
|
# See below BlockDevice for FileStore skeleton |
|
|
|
def get_random_string(length: int) -> str:
    """Return a random string of *length* lowercase ASCII letters."""
    letters = [random.choice(string.ascii_lowercase) for _ in range(length)]
    return "".join(letters)
|
|
|
@dataclass
class BlockDevice:
    """Simulated block device whose reads and writes may corrupt blocks.

    Fields:
        block_size: Number of characters each block holds.
        blocks: Backing storage, one fixed-size string per block.
        corruption_rate: Probability (0..1) that any single read or
            write corrupts the touched block.
    """

    block_size: int
    blocks: list[str]
    corruption_rate: float

    @classmethod
    def get_new_block_device(
        cls, block_count=1024 * 1024, block_size=8, corruption_rate=0.01
    ):
        """Create a new block device with the desired parameters.

        Alternate constructor that fills every block with random data,
        mimicking an unformatted device.  (Made a classmethod so
        subclasses construct instances of themselves.)

        :param block_count: Number of blocks on the device.
        :param block_size: Characters per block.
        :param corruption_rate: Per-operation corruption probability.
        :returns: A freshly initialized block device.
        """
        blocks = [get_random_string(block_size) for _ in range(block_count)]
        return cls(
            block_size=block_size, blocks=blocks, corruption_rate=corruption_rate
        )

    @property
    def block_count(self):
        """How many blocks does this device contain."""
        return len(self.blocks)

    def read(self, index: int) -> str:
        """Read the block at the specified index.

        This method may corrupt the underlying block in a consistent
        manner, such that the original data is unretrievable in the
        future, as well.

        :param index: Index of the block to read.
        :returns: Content of the block.
        :raises IndexError: If ``index`` is outside the device.
        """
        uncorrupted = self.blocks[index]
        return self._maybe_corrupt(uncorrupted, index)

    def write(self, index: int, content: str) -> str:
        """Write the specified content to the specified block index.

        This method may corrupt the data before persisting it to the
        underlying block; however, it will return the content that
        was actually written.

        :: Note: The input content will be truncated to the block_size
        of this block device.

        :param index: Block index to write into.
        :param content: Content to be written, only uses the first block_size
            characters.

        :returns: What content was written to the block device
        :raises IndexError: If ``index`` is outside the device.
        """
        truncated = content[: self.block_size]
        current_content = self.blocks[index]
        # A short write keeps the tail of the block's previous content in
        # place, like a real device leaving untouched bytes as-is.
        new_content = truncated + current_content[len(truncated) :]
        return self._maybe_corrupt(new_content, index)

    def _maybe_corrupt(self, content: str, index: int) -> str:
        """Persist *content* to block *index*, possibly corrupting it.

        With probability ``corruption_rate`` the stored (and returned)
        data is replaced by random characters; otherwise *content* is
        stored verbatim.  Either way the block is updated.
        """
        should_corrupt = random.uniform(0, 1) < self.corruption_rate
        if should_corrupt:
            maybe_corrupted = get_random_string(self.block_size)
        else:
            maybe_corrupted = content
        self.blocks[index] = maybe_corrupted
        return maybe_corrupted
|
|
|
|
|
# Create a block device with 1024 blocks that can each fit an
# 8-character string; every read/write has a 1% chance of corruption.
block_device = BlockDevice.get_new_block_device(
    block_count=1024, block_size=8, corruption_rate=0.01)

# Write content to block 0.
written = block_device.write(index=0, content='12345678')
print(f"Writing '12345678' to the block device: {written}")

# Read content from block 0.
read_back = block_device.read(index=0)
print(f"Reading from the block device: {read_back}")

# If content length exceeds the block size, the content is truncated.
overlong = block_device.write(index=0, content='123456789')
print(f"Writing '123456789' to the block device: {overlong}")
|
########################## |
|
# Begin Implementing Below |
|
########################## |
|
|
|
class FileStore:
    """A small file store layered on top of an unreliable BlockDevice.

    Durability scheme: every chunk of a file is written twice -- to a
    primary block and to a backup block at the next index -- and an md5
    checksum is recorded per block.  Reads verify the checksum, fall
    back to the backup copy on corruption, and repair the primary from
    the backup when possible.
    """

    def __init__(self, failure_rate=0.01, retries=3) -> None:
        """Create a store backed by a fresh block device.

        :param failure_rate: Corruption probability forwarded to the
            underlying block device (applies to every read and write).
        :param retries: Maximum write attempts before declaring failure.
        """
        self.blockdevice = BlockDevice.get_new_block_device(corruption_rate=failure_rate)
        # File map contains the filename and the primary block indexes
        # that hold the file contents, in order.
        self.filemap = {}
        # partials maps block index -> meaningful length, for final
        # chunks that did not fully take up the entire block_size
        # (the remainder of such a block is junk data).
        self.partials = {}
        # Next unused block index.  Blocks are never reclaimed (delete
        # only drops metadata), so this grows monotonically.
        self.next_free_block = 0
        # Dictionary containing an md5 hash for each block to check for
        # data corruption.
        self.checksums = {}
        self.retries = retries

    def add_checksum(self, block_index: int, content: str):
        """Record and return the md5 hex digest of *content* for a block.

        The stored digest lets later reads quickly detect corruption.

        :param block_index: Block the content is (about to be) stored in.
        :param content: Exact string whose integrity must be verifiable.
        :returns: The hex digest saved in ``self.checksums``.
        """
        checksum = hashlib.md5(content.encode()).hexdigest()
        self.checksums[block_index] = checksum
        return checksum

    def attempt_write(self, block_index: int, content: str) -> str:
        """Write *content* to a block, retrying until the checksum matches.

        :param block_index: Block index to write into.
        :param content: Content to persist (at most block_size chars).
        :returns: The verified content that was actually written.
        :raises Exception: If the write still fails after ``self.retries``
            attempts.
        """
        write_check = None
        checksum = self.add_checksum(block_index, content)
        result = ""
        retry = 0
        while write_check != checksum and retry != self.retries:
            # The device may corrupt the write; verify what landed,
            # trimmed to the length we actually care about.
            result = self.blockdevice.write(block_index, content)[:len(content)]
            write_check = hashlib.md5(result.encode()).hexdigest()
            retry += 1

        if retry == self.retries and write_check != checksum:
            raise Exception(f"Write failed for block {block_index}, retries exceeded")
        return result

    def attempt_read(self, block_index: int) -> str:
        """Read a chunk, falling back to its backup block on corruption.

        If the primary copy fails verification (and is therefore now
        corrupt on the device), the backup at ``block_index + 1`` is
        tried and, on success, used to repair the primary.

        :param block_index: Primary block index of the chunk.
        :returns: The verified chunk content (truncated for partials).
        :raises Exception: If both the primary and the backup copy fail
            verification -- permanent data loss.
        """
        primary_index = block_index
        read_check = None
        checksum = self.checksums[primary_index]
        # Partial-length metadata is keyed by the primary index and
        # applies to the backup too (both hold the same chunk).
        is_partial = primary_index in self.partials
        tries = 0
        result = ""
        need_rewrite = False
        # We only get two shots at reading a chunk: the main block and
        # then the backup block.
        # TODO - make more copies of the block so tries can be more than 2
        while read_check != checksum and tries != 2:
            result = self.blockdevice.read(block_index)
            if is_partial:
                # Drop the junk tail beyond the chunk's real length.
                result = result[:self.partials[primary_index]]
            read_check = hashlib.md5(result.encode()).hexdigest()
            if read_check != checksum:
                # Read failed, and the block is now corrupt. Try the backup.
                block_index += 1
                need_rewrite = True
            tries += 1

        if read_check != checksum:
            # Report the primary index: the loop above advanced
            # ``block_index`` past the backup while failing.
            raise Exception(f"Read has failed and data loss has occurred for block {primary_index}")
        # Repair the primary with the data recovered from the backup.
        if need_rewrite:
            self.attempt_write(primary_index, result)
        return result

    def put(self, file_name: str, file_content: str) -> str:
        """Create a file with the specified name and content.

        The content is split into block_size chunks; each chunk is
        written to a primary block and mirrored to the adjacent backup
        block.

        :param file_name: Name to store the content under.
        :param file_content: Full file body.
        :returns: The content as actually persisted (equals the input
            when every write verifies).
        :raises ValueError: If the device lacks space for the file.
        """
        block_size = self.blockdevice.block_size
        # Calculate the number of chunks this file_content will require.
        chunks = math.ceil(len(file_content) / block_size)

        # We consume two blocks per chunk, one as the primary and one as
        # the backup.
        # TODO - replace with a parity scheme that doesn't waste half the blocks
        # Each chunk consumes exactly two blocks, so compare blocks
        # consumed against the device size.  (The previous check added a
        # chunk count to a raw block index and compared it to half the
        # device, spuriously rejecting files that would have fit.)
        if self.next_free_block + 2 * chunks > self.blockdevice.block_count:
            raise ValueError("Insufficient space")

        # Creates a new file with the specified name and content.
        # NOTE(review): re-putting an existing name abandons its old
        # blocks and their metadata; blocks are never reclaimed anyway,
        # but a reclamation pass would fix both.
        result = ""
        self.filemap[file_name] = []

        for i in range(chunks):
            if i == chunks - 1:
                # Last chunk: grab the tail of file_content, which can
                # be shorter than a full block (block_size).
                content = file_content[(chunks - 1) * block_size:]
                # If the chunk is smaller than block_size we need to
                # track its length to determine where junk data begins.
                if len(content) < block_size:
                    self.partials[self.next_free_block] = len(content)
            else:
                # Grab the i'th chunk of file_content, block_size long.
                content = file_content[i * block_size:(i + 1) * block_size]

            result = result + self.attempt_write(self.next_free_block, content)

            # Write a backup copy to the adjacent block.
            self.attempt_write(self.next_free_block + 1, content)

            # Record the primary block in the file map, then advance
            # past the primary/backup pair.
            self.filemap[file_name].append(self.next_free_block)
            self.next_free_block += 2
        return result

    def get(self, file_name: str) -> str:
        """Return the full content of a previously stored file.

        :param file_name: Name previously passed to :meth:`put`.
        :returns: The reassembled file content.
        :raises ValueError: If the file name is unknown.
        :raises Exception: If a chunk suffered unrecoverable corruption.
        """
        if file_name not in self.filemap:
            raise ValueError("Invalid file_name")
        # Reassemble the file from its verified chunks, in order.
        return "".join(
            self.attempt_read(block) for block in self.filemap[file_name]
        )

    def delete(self, file_name: str) -> bool:
        """Delete a file's metadata (blocks themselves are not reclaimed).

        :param file_name: Name of the file to delete.
        :returns: True on success.
        :raises ValueError: If the file name is unknown (matches
            :meth:`get`; previously a bare KeyError escaped here).
        """
        if file_name not in self.filemap:
            raise ValueError("Invalid file_name")
        # For every primary block that the file used to use:
        for block in self.filemap[file_name]:
            # Delete the checksum metadata for the primary block...
            del self.checksums[block]
            # ...and for its backup, which was previously leaked.
            self.checksums.pop(block + 1, None)
            # Delete the partial metadata for the block.
            if block in self.partials:
                del self.partials[block]
        del self.filemap[file_name]
        return True
|
|
|
import pytest |
|
import unittest |
|
|
|
class BlockDeviceTest(unittest.TestCase):
    """Exercise the raw BlockDevice read/write interface.

    NOTE: the device keeps its 1% corruption rate here, so these
    assertions are probabilistic by design.
    """

    def setUp(self) -> None:
        self.block_device = BlockDevice.get_new_block_device(
            block_count=1024, block_size=8, corruption_rate=0.01)

    def test_write(self):
        # A write normally echoes back exactly what was stored.
        written = self.block_device.write(0, '1234abcd')
        self.assertEqual(written, '1234abcd')

    def test_read(self):
        # Every block, corrupted or not, is exactly block_size long.
        content = self.block_device.read(1)
        self.assertEqual(len(content), 8)
|
|
|
class FileStoreTest(unittest.TestCase): |
|
|
|
def setUp(self) -> None: |
|
self.file_store = FileStore() |
|
|
|
def test_put(self): |
|
self.assertEqual( |
|
self.file_store.put( |
|
"hello.txt", |
|
'lorem ipsum dolor sit amet' |
|
), |
|
"lorem ipsum dolor sit amet" |
|
) |
|
|
|
@pytest.mark.skip(reason="CPU intensive") |
|
def test_put_multi(self): |
|
self.assertEqual( |
|
self.file_store.put( |
|
"test1.txt", |
|
'foobar12365799' |
|
), |
|
"foobar12365799" |
|
) |
|
self.assertEqual( |
|
self.file_store.put( |
|
"test2.txt", |
|
'foobar987654213' |
|
), |
|
"foobar987654213" |
|
) |
|
|
|
|
|
def test_get(self): |
|
self.file_store.put( |
|
"hello.txt", |
|
'lorem ipsum dolor sit amet' |
|
) |
|
self.assertEqual( |
|
self.file_store.get("hello.txt"), |
|
"lorem ipsum dolor sit amet" |
|
) |
|
|
|
def test_delete(self): |
|
self.file_store.put( |
|
"hello.txt", |
|
'lorem ipsum dolor sit amet' |
|
) |
|
self.assertEqual( |
|
self.file_store.get("hello.txt"), |
|
"lorem ipsum dolor sit amet" |
|
) |
|
self.assertTrue(self.file_store.delete("hello.txt")) |
|
with self.assertRaises(ValueError): |
|
self.file_store.get("hello.txt") |
|
|
|
|
|
class Cheating(FileStoreTest):
    """Re-run the FileStore suite with corruption disabled entirely."""

    def setUp(self) -> None:
        self.file_store = FileStore(failure_rate=0.0)
|
|
|
class VeryUnreliable(FileStoreTest):
    """Re-run the FileStore suite against a 30% corruption rate."""

    def setUp(self) -> None:
        self.file_store = FileStore(failure_rate=0.3)
|
|
|
pytest.main(['--verbose','-s', __file__]) |