Skip to content

Instantly share code, notes, and snippets.

@honestbleeps
Created June 22, 2016 18:36
Show Gist options
  • Save honestbleeps/fd515a11db0e7b369a5a98b9f01f8d53 to your computer and use it in GitHub Desktop.
Save honestbleeps/fd515a11db0e7b369a5a98b9f01f8d53 to your computer and use it in GitHub Desktop.
# here's the hot mess I whipped up during our code test!
# obvious things I'd change / improve:
# - no hard-coding of "chunk.*" - use the filename as a prefix
# - no writing to disk - store directly to memcache
# - store the # of chunks and an md5 hash of the file in a meta key or in the first binary chunk, rather than in an info.txt
# to save a roundtrip to memcached on each request
# - write a "getChunkAt" like function, to get chunk #36, for example, in case it has fallen off the cache
from pymemcache.client.base import Client
import glob
class CacheBigFile:
    """Split a large file into fixed-size chunks on disk, then push each
    chunk into memcached keyed by its chunk filename.

    Chunk files are written to the current working directory as
    "chunk.1", "chunk.2", ... alongside an "info.txt" metadata file.
    """

    @staticmethod
    def splitFile(inputFile, chunkSize):
        """Split *inputFile* into pieces of at most *chunkSize* bytes.

        Writes "chunk.N" files (1-based) plus an "info.txt" metadata file
        of the form "<inputFile>,chunk.,<noOfChunks>,<chunkSize>".

        Returns the list of chunk filenames that were written.

        Declared @staticmethod so instance calls no longer pass the
        instance as inputFile (the original had no `self` parameter);
        class-level calls are unchanged.
        """
        # Read the whole file as binary; `with` guarantees the handle is
        # closed (the original leaked handles on any exception).
        with open(inputFile, 'rb') as f:
            data = f.read()

        totalBytes = len(data)

        # Ceiling division for the chunk count. The original used `/`,
        # which is float division on Python 3 and wrote e.g. "3.5" into
        # the metadata file.
        noOfChunks = totalBytes // chunkSize
        if totalBytes % chunkSize:
            noOfChunks += 1

        # Metadata: source name, chunk-name prefix, chunk count, chunk
        # size. The original emitted a stray 'i' after the count
        # ("...3i,...") — a typo, fixed here.
        with open('info.txt', 'w') as f:
            f.write(inputFile + ',' + 'chunk.,' + str(noOfChunks) + ',' + str(chunkSize))

        chunkNames = []
        # Iterate chunk start offsets. The original used
        # range(0, bytes + 1, chunkSize), whose `+ 1` produced one extra
        # EMPTY chunk whenever the file size was an exact multiple of
        # chunkSize; stopping at totalBytes (exclusive) is correct.
        for chunkCount, start in enumerate(range(0, totalBytes, chunkSize), start=1):
            chunkName = "chunk.%s" % chunkCount
            chunkNames.append(chunkName)
            with open(chunkName, 'wb') as f:
                f.write(data[start:start + chunkSize])
        return chunkNames

    def getFiles(self):
        """Return the chunk filenames found in the current directory."""
        return glob.glob("chunk.*")

    def writeCache(self):
        """Store each chunk file's bytes into memcached, keyed by filename.

        Connects to a memcached instance on localhost:11211; requires the
        server to be running.
        """
        client = Client(('localhost', 11211))
        for fileName in self.getFiles():
            # Open in binary mode and pass the file's BYTES. The original
            # opened in text mode (corrupts/fails on binary data) and
            # passed the file object itself to client.set instead of its
            # contents.
            with open(fileName, 'rb') as f:
                client.set(fileName, f.read())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment