Skip to content

Instantly share code, notes, and snippets.

@abbbi
Created November 4, 2021 11:30
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save abbbi/b4b07efd133cdc5f86c0da01a030e76a to your computer and use it in GitHub Desktop.
Save abbbi/b4b07efd133cdc5f86c0da01a030e76a to your computer and use it in GitHub Desktop.
streaming_tar.py
#!/usr/bin/env python3
#
# Building a tar file chunk-by-chunk.
#
# This is a quick bit of sample code for streaming data to a tar file,
# building it piece-by-piece. The tarfile is built on-the-fly and streamed
# back out. This is useful for web applications that need to dynamically
# build a tar file without swamping the server.
import os
import sys
import tarfile
from io import BytesIO
class FileStream(object):
def __init__(self):
self.buffer = BytesIO()
self.offset = 0
def write(self, s):
self.buffer.write(s)
self.offset += len(s)
def tell(self):
return self.offset
def close(self):
self.buffer.close()
def pop(self):
s = self.buffer.getvalue()
self.buffer.close()
self.buffer = BytesIO()
return s
def stream_build_tar(in_filename, streaming_fp):
tar = tarfile.TarFile.open(out_filename, 'w|gz', streaming_fp)
stat = os.stat(in_filename)
tar_info = tarfile.TarInfo(in_filename)
# Note that you can get this information from the storage backend,
# but it's valid for either to raise a NotImplementedError, so it's
# important to check.
#
# Things like the mode or ownership won't be available.
tar_info.mtime = stat.st_mtime
tar_info.size = stat.st_size
# Note that we don't pass a fileobj, so we don't write any data
# through addfile. We'll do this ourselves.
tar.addfile(tar_info)
yield
with open(in_filename, 'rb') as in_fp:
total_size = 0
while True:
s = in_fp.read(BLOCK_SIZE)
if len(s) > 0:
tar.fileobj.write(s)
yield
if len(s) < BLOCK_SIZE:
blocks, remainder = divmod(tar_info.size, tarfile.BLOCKSIZE)
if remainder > 0:
tar.fileobj.write(tarfile.NUL *
(tarfile.BLOCKSIZE - remainder))
yield
blocks += 1
tar.offset += blocks * tarfile.BLOCKSIZE
break
tar.close()
yield
BLOCK_SIZE = 4096
if len(sys.argv) != 3:
print('Usage: %s in_filename out_filename' % sys.argv[0])
sys.exit(1)
in_filename = sys.argv[1]
out_filename = sys.argv[2]
streaming_fp = FileStream()
with open(out_filename, 'wb') as out_fp:
for i in stream_build_tar(in_filename, streaming_fp):
s = streaming_fp.pop()
if len(s) > 0:
print('Writing %d bytes...' % len(s))
out_fp.write(s)
out_fp.flush()
print('Wrote tar file to %s' % out_filename)
@memogarcia
Copy link

Anytime you tar a file with this script it takes around 30x the time as normal tar and also the checksums between a normal tar command and this python script are different

@abbbi
Copy link
Author

abbbi commented Nov 10, 2021

Anytime you tar a file with this script it takes around 30x the time as normal tar and also the checksums between a normal tar command and this python script are different

might well be, this gist just exists as i wanted to be able to use the script from the original gist with python3, no logical changes to the script otherwise.

https://gist.github.com/chipx86/9598b1e4a9a1a7831054

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment