Skip to content

Instantly share code, notes, and snippets.

@helenst
Created October 31, 2019 09:24
Show Gist options
  • Save helenst/1d6bbbc8bf30df1a3e14a638ae3121e6 to your computer and use it in GitHub Desktop.
Save helenst/1d6bbbc8bf30df1a3e14a638ae3121e6 to your computer and use it in GitHub Desktop.
Include directory entries in the tarfile and allow a top-level directory to be specified
diff --git a/python_client/src/wellcome_storage_service/downloader.py b/python_client/src/wellcome_storage_service/downloader.py
index 6a78c76b..fc10f56e 100644
--- a/python_client/src/wellcome_storage_service/downloader.py
+++ b/python_client/src/wellcome_storage_service/downloader.py
@@ -3,6 +3,7 @@
import abc
import os
import tarfile
+import time
try:
from collections.abc import ABC
@@ -45,7 +46,7 @@ def download_bag(storage_manifest, out_dir):
)
-def download_compressed_bag(storage_manifest, out_path):
+def download_compressed_bag(storage_manifest, out_path, top_level_directory=''):
"""
Download all the files in a bag to a compressed archive.
@@ -58,17 +59,33 @@ def download_compressed_bag(storage_manifest, out_path):
provider = _choose_provider(location)
with tarfile.open(out_path, "w:gz") as tf:
+ # Keeps track of which directories have been added to the tar file
+ dirnames = set()
for manifest_file in _all_files(storage_manifest):
fileobj = provider.get_fileobj(
location=location, manifest_file=manifest_file
)
- tarinfo = tarfile.TarInfo(name=manifest_file["name"])
+ name_in_tar = os.path.join(top_level_directory, manifest_file["name"])
+
+ # Ensure all parent directories exist in tar
+ name = name_in_tar
+ while name:
+ name = os.path.dirname(name)
+ if name and name not in dirnames:
+ tarinfo = tarfile.TarInfo(name)
+ tarinfo.type = tarfile.DIRTYPE
+ tarinfo.mode = 0o755
+ tarinfo.mtime = time.time()
+ tf.addfile(tarinfo=tarinfo)
+ dirnames.add(name)
+
+ tarinfo = tarfile.TarInfo(name=name_in_tar)
tarinfo.size = manifest_file["size"]
+ tarinfo.mtime = time.time()
tf.addfile(tarinfo=tarinfo, fileobj=fileobj)
-
class AbstractProvider(object):
"""
Abstract class for a downloader.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment