# Gist by @whalesalad, created November 30, 2019.
# Chunked file storage on top of memcached: a storage service module,
# followed by the helper module it imports.
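# Note: the `service.memcached` module is not included in the gist. A minimal
# sketch of what it might look like, assuming the pymemcache library (both
# the library choice and the server address are assumptions, not part of the
# original):
#
#     # service/memcached.py
#     from pymemcache.client.base import Client
#
#     # Module-level client shared by the storage service below.
#     client = Client(("localhost", 11211))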
import io
import json

from service.memcached import client as memcached
from service.storage import utils

ONE_MEGABYTE = 1000 * 1000
CHUNK_SIZE = ONE_MEGABYTE
MAX_FILE_SIZE = 50 * ONE_MEGABYTE
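# A 1,000,000-byte chunk is presumably sized so that each chunk (plus key and
# item overhead) fits under memcached's default item size limit of 1 MiB
# (1,048,576 bytes).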
class FileTooBigException(Exception):
    pass


class FileAlreadyExists(Exception):
    pass


class FileDoesNotExist(Exception):
    pass


class DataCorruptionException(Exception):
    pass
def _get_metadata(filename):
    data = memcached.get(utils.key_for_file_metadata(filename))

    if not data:
        raise FileDoesNotExist(f"{filename} could not be found.")

    return json.loads(data)
def exists(filename):
    try:
        _get_metadata(filename)
        return True
    except FileDoesNotExist:
        return False
def store(filename, file):
    metadata = utils.generate_metadata(file, chunk_size=CHUNK_SIZE)

    # If we're too big, barf.
    if metadata['size'] > MAX_FILE_SIZE:
        raise FileTooBigException("The file is too big. The maximum file size is 50MB.")

    if exists(filename):
        raise FileAlreadyExists(f"A file with the filename {filename} already exists.")

    # 1. store the metadata for the file
    memcached.set(utils.key_for_file_metadata(filename), json.dumps(metadata))

    # 2. store each chunk of the file
    for idx, chunk in enumerate(utils.read_in_chunks(file, CHUNK_SIZE)):
        memcached.set(utils.key_for_file_chunk(filename, idx), chunk)

    return metadata
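# Example usage (hypothetical filename and payload, for illustration only):
#
#     with open("app.png", "rb") as f:
#         meta = store("app.png", f)
#     # meta -> {'size': ..., 'md5': '...', 'num_chunks': ...}
#
# Note that the exists()/set() sequence above is not atomic, so two
# concurrent writers could race on the same filename.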
def retrieve(filename):
    metadata = _get_metadata(filename)
    num_chunks = metadata.get('num_chunks')

    # TODO: check that all chunks exist to handle the data corruption case.
    out = io.BytesIO()

    for chunk_idx in range(num_chunks):
        # TODO: introduce error handling for when a chunk does not exist
        data = memcached.get(utils.key_for_file_chunk(filename, chunk_idx))
        out.write(data)

    out.seek(0)

    # If our manifest metadata differs from what we actually grabbed... die.
    stored = metadata.get('md5')
    retrieved = utils.get_md5(out)

    if stored != retrieved:
        raise DataCorruptionException(f"The retrieved data for {filename} does not match the manifest checksum.")

    return out, metadata
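# Example usage (continuing the hypothetical example above):
#
#     buf, meta = retrieve("app.png")
#     data = buf.read()  # reassembled bytes; len(data) == meta['size']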
# --- helper module (imported above as `utils` from service.storage) ---

import hashlib
import math
def read_in_chunks(file, chunk_size):
    while True:
        data = file.read(chunk_size)
        if not data:
            break
        yield data
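# For example (assuming `import io`), read_in_chunks yields fixed-size pieces
# with a possibly shorter final piece:
#
#     >>> list(read_in_chunks(io.BytesIO(b"abcde"), 2))
#     [b'ab', b'cd', b'e']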
def get_md5(file):
    """
    Given a file object, return an MD5 hex digest of its contents.
    """
    # Hash from the start of the file regardless of the current position,
    # and rewind when done so callers can read the file again.
    file.seek(0)
    md5 = hashlib.md5()
    for chunk in read_in_chunks(file, 4096):
        md5.update(chunk)
    file.seek(0)
    return md5.hexdigest()
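# For example, the digest of a short in-memory file:
#
#     >>> get_md5(io.BytesIO(b"hello"))
#     '5d41402abc4b2a76b9719d911017c592'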
def get_size(file):
    """
    Given a file object, return its total size in bytes.
    """
    file.seek(0, 2)  # seek relative to the end of the file
    size = file.tell()
    file.seek(0)
    return size
def get_num_chunks(size, chunk_size):
    return math.ceil(size / chunk_size)
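# For example, a 2,500,000-byte file split into 1,000,000-byte chunks needs a
# third, partial chunk:
#
#     >>> get_num_chunks(2_500_000, 1_000_000)
#     3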
def generate_metadata(file, chunk_size):
    """
    Return size, MD5 hash and chunk information to store alongside
    the file in memcached.
    """
    size = get_size(file)
    return {
        'size': size,
        'md5': get_md5(file),
        'num_chunks': get_num_chunks(size, chunk_size),
    }
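# For the 2,500,000-byte example above, this would produce something like
# (digest elided):
#
#     {'size': 2500000, 'md5': '<hex digest>', 'num_chunks': 3}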
def hashed_filename(filename):
    return hashlib.md5(filename.encode('utf-8')).hexdigest()


def key_for_file_metadata(filename):
    return f"{hashed_filename(filename)}-metadata"


def key_for_file_chunk(filename, chunk_idx):
    return f"{hashed_filename(filename)}-part-{chunk_idx}"