Skip to content

Instantly share code, notes, and snippets.

@leth
Forked from chipx86/streaming-tar.py
Last active August 12, 2016 14:24
Show Gist options
  • Save leth/6adb9d30f2fdcb8802532a87dfbeff77 to your computer and use it in GitHub Desktop.
Save leth/6adb9d30f2fdcb8802532a87dfbeff77 to your computer and use it in GitHub Desktop.
Sample code to build a tar archive chunk-by-chunk and stream it out as it is built.
#!/usr/bin/env python
#
# Building a tar file chunk-by-chunk.
#
# This is a quick bit of sample code for streaming data to a tar file,
# building it piece-by-piece. The tarfile is built on-the-fly and streamed
# back out. This is useful for web applications that need to dynamically
# build a tar file without swamping the server.
from io import BytesIO
from os import walk
from tarfile import TarFile, NUL, BLOCKSIZE
from os.path import (
abspath, join as path_join, sep as path_sep, split as path_split
)
class FileStream:
    """Write-only, in-memory sink that can be handed to ``TarFile``.

    Written bytes accumulate in a ``BytesIO`` until ``read_all()`` drains
    them. ``tell()`` reports the total number of bytes ever written, not
    the size of the pending buffer, so the position keeps growing across
    drains.
    """

    def __init__(self):
        # Bytes written since the last drain.
        self.buffer = BytesIO()
        # Running total of bytes written over the stream's lifetime.
        self.offset = 0

    def write(self, s):
        """Append ``s`` to the pending buffer and return its length.

        Returning the byte count follows the convention of binary
        file objects, whose ``write()`` reports how much was written.
        """
        count = len(s)
        self.buffer.write(s)
        self.offset += count
        return count

    def tell(self):
        """Return the total number of bytes written so far."""
        return self.offset

    def close(self):
        """Close the pending buffer."""
        self.buffer.close()

    def read_all(self):
        """Return every pending byte and start a fresh, empty buffer.

        The buffer is replaced even if reading it raises, so the stream
        always ends up with a new ``BytesIO``.
        """
        try:
            return self.buffer.getvalue()
        finally:
            self.buffer.close()
            self.buffer = BytesIO()
class StreamingTar:
    """Iterable producing a tar archive of a directory tree in byte chunks.

    Iterating an instance walks ``directory`` and yields the archive
    piece by piece, so the whole archive is never held in memory at
    once. Concatenating every yielded chunk gives the complete tar file.
    Entries are stored under the directory's own name (its last path
    component), whatever form the path was given in.

    Args:
        directory: Path of the directory to archive.
        file_chunk_size: Maximum number of bytes read from a file at a
            time. A non-positive value reads each file's payload in a
            single call.
    """

    def __init__(self, directory, file_chunk_size=8192):
        self._directory = directory
        self._file_chunk_size = file_chunk_size

    @staticmethod
    def _write_header(tar, tarinfo):
        """Write only the header block(s) for ``tarinfo`` into ``tar``.

        This performs the header half of ``TarFile.addfile()`` by hand.
        Calling ``addfile()`` without a file object cannot be relied on
        for non-empty regular files: newer Python releases reject it.
        """
        header = tarinfo.tobuf(tar.format, tar.encoding, tar.errors)
        tar.fileobj.write(header)
        tar.offset += len(header)
        tar.members.append(tarinfo)

    @staticmethod
    def _stream_file_into_tar(tarinfo, tar, fh, buf_size):
        """Copy the payload of ``tarinfo`` from ``fh`` into the archive.

        Exactly ``tarinfo.size`` bytes are copied, because that is the
        size already recorded in the header; a file that grew after it
        was stat'ed must not spill extra bytes into the archive. The
        payload is then padded with NULs to a whole number of blocks and
        ``tar.offset`` is advanced.

        Yields ``None`` after every write so the caller can drain the
        output buffer between chunks.

        Raises:
            OSError: If ``fh`` ends before ``tarinfo.size`` bytes were
                read (for example, the file shrank after the header was
                built).
        """
        out = tar.fileobj
        remaining = tarinfo.size
        while remaining > 0:
            # Never read past the size promised in the header.
            if buf_size > 0:
                to_read = min(buf_size, remaining)
            else:
                to_read = remaining
            chunk = fh.read(to_read)
            if not chunk:
                raise OSError('unexpected end of data')
            out.write(chunk)
            remaining -= len(chunk)
            yield
        blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
        if remainder > 0:
            out.write(NUL * (BLOCKSIZE - remainder))
            blocks += 1
        tar.offset += blocks * BLOCKSIZE
        yield

    def __iter__(self):
        """Yield the archive as a sequence of non-empty byte chunks."""
        out = FileStream()
        tar = TarFile(fileobj=out, mode='w')
        top = self._directory
        # abspath() normalises trailing separators and relative paths, so
        # the archive root is always the directory's real last component.
        name = path_split(abspath(top))[1]
        try:
            tar.add(name=top, arcname=name, recursive=False)
            root_len = None
            for path, dirs, files in walk(top):
                if root_len is None:
                    # The first path walk() reports is the top directory;
                    # everything after it is that path plus a suffix.
                    root_len = len(path)
                rel = path[root_len:].lstrip(path_sep)
                arcpath = path_join(name, rel) if rel else name
                # Add files
                for f in files:
                    filepath = path_join(path, f)
                    with open(filepath, 'rb') as fh:
                        info = tar.gettarinfo(
                            name=filepath, arcname=path_join(arcpath, f),
                            fileobj=fh)
                        if info is None:
                            # Type tarfile cannot represent; skip it.
                            continue
                        if info.isreg():
                            self._write_header(tar, info)
                            for _ in self._stream_file_into_tar(
                                    info, tar, fh, self._file_chunk_size):
                                chunk = out.read_all()
                                if chunk:
                                    yield chunk
                        else:
                            # E.g. a hard link to a file already stored:
                            # the entry is header-only, so streaming the
                            # content would corrupt the archive.
                            tar.addfile(info)
                # Add directories
                for d in dirs:
                    tar.add(
                        name=path_join(path, d),
                        arcname=path_join(arcpath, d),
                        recursive=False)
                chunk = out.read_all()
                if chunk:
                    yield chunk
            # Closing the archive writes the end-of-archive blocks.
            tar.close()
            yield out.read_all()
        finally:
            # Also reached when iteration fails or is abandoned early;
            # TarFile.close() does nothing if the archive is already closed.
            tar.close()
            out.close()
if __name__ == '__main__':
    # Demo: archive ./foobar into out.tar, writing each chunk to disk as
    # soon as the streamer produces it.
    archive = StreamingTar(abspath('foobar'))
    with open('out.tar', 'wb') as sink:
        for piece in archive:
            sink.write(piece)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment