Skip to content

Instantly share code, notes, and snippets.

@victorchall
Created June 30, 2024 02:26
Show Gist options
  • Save victorchall/697c031771e77f820e4d7030097b472a to your computer and use it in GitHub Desktop.
Save victorchall/697c031771e77f820e4d7030097b472a to your computer and use it in GitHub Desktop.
Python script that packs a dataset of matching .webp/.txt/.json files into TAR archives of roughly 1 GB each
import os
import tarfile
import json
from collections import defaultdict
def group_files(root_dir):
    """Collect .webp/.txt/.json files under root_dir, grouped by base name.

    The tree is walked recursively. Files named ``urls.txt`` or
    ``caption_cog_params.txt`` are ignored, as is any file whose name does
    not end in ``.webp``, ``.txt`` or ``.json``. Files that share a base
    name (file name minus its last extension) land in the same group, even
    when they live in different subdirectories.

    Directories and files are visited in sorted order so the resulting
    group order is the same on every run and on every filesystem.

    Args:
        root_dir: Directory to scan.

    Returns:
        A ``defaultdict(list)`` mapping each base name to the list of full
        paths of the files that carry it.
    """
    skipped_names = {"urls.txt", "caption_cog_params.txt"}
    file_groups = defaultdict(list)
    for subdir, dirs, files in os.walk(root_dir):
        # os.walk yields entries in arbitrary filesystem order; sorting
        # ``dirs`` in place also fixes the order in which it descends.
        dirs.sort()
        for file in sorted(files):
            if file in skipped_names:
                continue
            if file.endswith(('.webp', '.txt', '.json')):
                basename = os.path.splitext(file)[0]
                file_groups[basename].append(os.path.join(subdir, file))
    return file_groups
def create_tar_files(file_groups, output_dir, max_size=1024*1024*1024):  # 1 GB
    """Pack file groups into sequentially numbered TAR archives.

    Archives are named ``00001.tar``, ``00002.tar``, ... inside
    ``output_dir``. Groups are added in iteration order; a new archive is
    started whenever adding the next group would push the running total of
    file sizes past ``max_size``. A group is never split across archives,
    so a single group larger than ``max_size`` gets an archive of its own.
    Only file contents are counted toward the limit, not TAR headers or
    padding.

    Args:
        file_groups: Mapping of group name to a list of file paths.
        output_dir: Directory that receives the archives; created if it
            does not exist yet.
        max_size: Maximum summed file size per archive, in bytes.

    Returns:
        None.
    """
    os.makedirs(output_dir, exist_ok=True)
    current_tar = None
    current_size = 0
    tar_count = 1
    try:
        for files in file_groups.values():
            group_size = sum(os.path.getsize(f) for f in files)
            if current_tar is None or current_size + group_size > max_size:
                if current_tar is not None:
                    current_tar.close()
                    # Reset so the ``finally`` clause does not touch a
                    # closed archive if opening the next one fails.
                    current_tar = None
                tar_filename = os.path.join(output_dir, f'{tar_count:05d}.tar')
                current_tar = tarfile.open(tar_filename, 'w')
                current_size = 0
                tar_count += 1
            for file in files:
                # Members are stored flat, by file name only.
                current_tar.add(file, arcname=os.path.basename(file))
            current_size += group_size
    finally:
        # Close the last archive even when a file cannot be read, so the
        # handle is not leaked.
        if current_tar is not None:
            current_tar.close()
def check_filegroups(file_groups):
    """Verify that every group contains a .txt, a .webp and a .json file.

    On success, prints a confirmation line and the number of groups.

    Args:
        file_groups: Mapping of group name to a list of file paths.

    Raises:
        AssertionError: If a group lacks a path ending in one of the
            required extensions. The message names the missing extension
            and the group.
    """
    required_extensions = (".txt", ".webp", ".json")
    for group, files in file_groups.items():
        for extension in required_extensions:
            if not any(path.endswith(extension) for path in files):
                # Raised explicitly rather than via ``assert`` so the check
                # still runs when Python is started with -O.
                raise AssertionError(f"Missing {extension} for {group}")
    print("all file groups checked out ok")
    print(f"total: {len(file_groups)}")
def main():
    """Prompt for the input and output directories and build the archives.

    Reads both paths from standard input, groups the files found under the
    root directory, checks that every group is complete, then writes the
    TAR archives into the output directory.

    Raises:
        SystemExit: If the root directory does not exist or is not a
            directory.
    """
    root_dir = input("Enter the root directory path: ")
    output_dir = input("Enter the output directory path for TAR files: ")
    # os.walk silently yields nothing for a missing directory, which would
    # otherwise end in a misleading success message with no archives.
    if not os.path.isdir(root_dir):
        raise SystemExit(f"Root directory not found: {root_dir}")
    os.makedirs(output_dir, exist_ok=True)
    file_groups = group_files(root_dir)
    check_filegroups(file_groups)
    create_tar_files(file_groups, output_dir)
    print("TAR files have been created successfully.")
# Run the interactive workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment