Skip to content

Instantly share code, notes, and snippets.

@rwightman
Last active February 18, 2024 20:14
Show Gist options
  • Save rwightman/ca11ca662134ea404d9513ebee7a1577 to your computer and use it in GitHub Desktop.
Save rwightman/ca11ca662134ea404d9513ebee7a1577 to your computer and use it in GitHub Desktop.
import math
import os
from collections import defaultdict
from pathlib import Path

# Fast transfers using a Rust library, `pip install hf-transfer`.
# The flag is set BEFORE `huggingface_hub` is imported: the library appears to
# read this environment variable into a constant at import time, so setting it
# after the import would leave the fast-transfer path disabled.
# NOTE(review): confirm against the installed huggingface_hub version.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from huggingface_hub import CommitOperationAdd, preupload_lfs_files, create_commit  # noqa: E402
# Split names looked for among the '-'-separated parts of each shard filename.
splits = ['train', 'validation', 'test']
# Local directory holding the `*.tar` shards to upload.
path = Path('/data/in22k-wds')


def group_shards_by_split(shard_entries, split_names):
    """Group shard entries by the split named in their filename parts.

    Args:
        shard_entries: Re-iterable sequence (e.g. a list) of
            ``(path, name_parts)`` tuples, where ``name_parts`` is the shard
            filename split on ``'-'``. It is iterated once per split name, so
            a one-shot generator must not be passed.
        split_names: Split names to look for, in the desired key order.

    Returns:
        A ``defaultdict(list)`` mapping each split name that matched at least
        one entry to its entries, sorted ascending. Splits with no matching
        entry get no key. An entry whose parts contain several split names is
        listed under each of them.
    """
    grouped = defaultdict(list)
    for split in split_names:
        for entry in shard_entries:
            # Exact match on a whole filename part, not a substring match.
            if split in entry[1]:
                grouped[split].append(entry)
    # Sort each split's shards so they are processed in a deterministic order.
    for entries in grouped.values():
        entries.sort()
    return grouped


files = path.glob('*.tar')
# Pair every shard path with its filename split on '-'.
split_files = [(f, f.name.split('-')) for f in files]
per_split = group_shards_by_split(split_files, splits)
# Number of shard files bundled into a single commit.
num_files_per_commit = 100
# Target dataset repository on the Hugging Face Hub.
repo_id = 'timm/imagenet-22k-wds'

for s, file_list in per_split.items():
    # Total commits for this split; used only in the commit messages.
    num_commits = int(math.ceil(len(file_list) / num_files_per_commit))
    batch_starts = range(0, len(file_list), num_files_per_commit)
    for commits, start in enumerate(batch_starts):
        operations = []
        # Each entry is a (shard_path, filename_parts) tuple; only the path is used here.
        for shard_path, _name_parts in file_list[start:start + num_files_per_commit]:
            addition = CommitOperationAdd(
                path_in_repo=shard_path.name,
                path_or_fileobj=shard_path,
            )
            # Upload the shard on its own before the commit that references it is created.
            preupload_lfs_files(repo_id, additions=[addition], repo_type='dataset')
            operations.append(addition)
        # One commit per batch; the last batch may hold fewer files than a full one.
        create_commit(
            repo_id,
            operations=operations,
            commit_message=f"commit {commits + 1} of {num_commits} for split {s}.",
            repo_type='dataset',
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment