Skip to content

Instantly share code, notes, and snippets.

@pszemraj
Created January 16, 2024 02:53
Show Gist options
  • Save pszemraj/81c3a3c795d5e8db2ac2b3aa16ee496c to your computer and use it in GitHub Desktop.
Save pszemraj/81c3a3c795d5e8db2ac2b3aa16ee496c to your computer and use it in GitHub Desktop.
Upload a folder to the Hugging Face Hub, plus other utilities
import argparse
import logging
import time
from datetime import datetime
from pathlib import Path
from typing import Optional
from huggingface_hub import upload_folder
from watchdog.events import PatternMatchingEventHandler
from watchdog.observers import Observer
def get_timestamp():
    """Return the current local time as text, e.g. ``Jan-16 02:53:07``.

    The layout is abbreviated month name, day of month, then
    ``HH:MM:SS`` (24-hour clock).
    """
    now = datetime.now()
    return format(now, "%b-%d %H:%M:%S")
def validate_inputs(
    repo_id: str, folder_path: Path, path_in_repo: Optional[str]
) -> None:
    """Check the user-supplied inputs and raise on the first problem found.

    Args:
        repo_id: Hub repository identifier; must contain a ``/``.
        folder_path: Local folder; must exist and be a directory.
        path_in_repo: Optional destination inside the repository. When
            non-empty it must neither start nor end with ``/``.

    Raises:
        ValueError: If ``repo_id`` contains no ``/``, or ``path_in_repo``
            starts or ends with ``/``.
        FileNotFoundError: If ``folder_path`` is missing or not a directory.
    """
    has_namespace = "/" in repo_id
    if not has_namespace:
        raise ValueError(
            "Invalid repo_id format. It should be in 'username/repository' format."
        )

    is_existing_dir = folder_path.exists() and folder_path.is_dir()
    if not is_existing_dir:
        raise FileNotFoundError(
            f"The folder path '{folder_path}' does not exist or is not a directory."
        )

    # None and "" both mean "no sub-path", so there is nothing left to check.
    if not path_in_repo:
        return
    if path_in_repo.startswith("/") or path_in_repo.endswith("/"):
        raise ValueError("path_in_repo should not start or end with '/'.")
def upload_to_huggingface(
    repo_id: str, folder_path: Path, path_in_repo: Optional[str]
) -> None:
    """Upload ``folder_path`` to ``repo_id`` via ``upload_folder`` and log the result.

    The ignore pattern ``*.pt*`` is passed along with a timestamped commit
    message. Any exception is logged at ERROR level and swallowed, so the
    caller keeps running after a failed upload.

    Args:
        repo_id: Target repository identifier.
        folder_path: Local folder to upload.
        path_in_repo: Optional destination path inside the repository.
    """
    try:
        upload_kwargs = {
            "repo_id": repo_id,
            "folder_path": str(folder_path),
            "path_in_repo": path_in_repo,
            "ignore_patterns": "*.pt*",
            "commit_message": f"Automated upload: directory change @ {get_timestamp()}",
        }
        upload_folder(**upload_kwargs)
        logging.info("Upload completed successfully.")
    except Exception as err:
        logging.error(f"An error occurred during upload: {err}")
class ChangeHandler(PatternMatchingEventHandler):
    """Watchdog event handler that re-uploads the watched folder on changes.

    Uploads are rate-limited: once an upload has been started, further events
    are ignored until more than ``delay`` seconds have elapsed.
    """

    # Event type names that only signal read access to a file. Reading is not
    # a change, and the upload itself reads every file, so treating these as
    # triggers could make each upload schedule the next one.
    _READ_ONLY_EVENT_TYPES = frozenset({"opened", "closed_no_write"})

    def __init__(
        self,
        repo_id: str,
        folder_path: Path,
        path_in_repo: Optional[str],
        exclude_substring: str,
        delay: float = 15.0,
    ) -> None:
        """Store the upload target and configure event filtering.

        Args:
            repo_id: Target repository identifier.
            folder_path: Local folder that is uploaded on change.
            path_in_repo: Optional destination path inside the repository.
            exclude_substring: Paths containing this text never trigger an
                upload; an empty string disables the filter.
            delay: Minimum number of seconds between two uploads.
        """
        self.repo_id = repo_id
        self.folder_path = folder_path
        self.path_in_repo = path_in_repo
        self.exclude_substring = exclude_substring
        # Monotonic-clock reading taken when the last upload started.
        # -inf means "never uploaded", so the first qualifying event always
        # passes the delay check regardless of the clock's starting value.
        self.last_upload_time = float("-inf")
        self.delay = delay
        ignore_patterns = [f"*{exclude_substring}*"] if exclude_substring else None
        super().__init__(ignore_patterns=ignore_patterns)

    def should_upload(self, event_path: str) -> bool:
        """Return True if an event on ``event_path`` should start an upload.

        An upload is due when the path does not contain the excluded
        substring and more than ``delay`` seconds have passed since the last
        upload started.
        """
        if self.exclude_substring and self.exclude_substring in event_path:
            return False
        # time.monotonic() is immune to wall-clock adjustments, which would
        # otherwise stretch or shorten the interval measured here.
        return time.monotonic() - self.last_upload_time > self.delay

    def on_any_event(self, event) -> None:
        """Upload the folder in response to ``event`` when one is due."""
        if getattr(event, "event_type", None) in self._READ_ONLY_EVENT_TYPES:
            return
        if self.should_upload(event.src_path):
            # Stamp before uploading so events arriving meanwhile are
            # measured against the start of this upload.
            self.last_upload_time = time.monotonic()
            upload_to_huggingface(self.repo_id, self.folder_path, self.path_in_repo)
def main() -> None:
    """Parse the command line, then watch the folder and upload until Ctrl+C.

    The inputs are validated before logging is configured and the observer
    is started. The main thread then idles; a KeyboardInterrupt stops the
    observer and waits for its thread to finish.
    """
    cli = argparse.ArgumentParser(
        description="Monitor a folder and upload to Hugging Face Hub on changes."
    )

    # (flags, add_argument keyword options), registered in this order.
    argument_table = (
        (
            ("repo_id",),
            {
                "type": str,
                "help": "Repository ID on Hugging Face (e.g., 'username/repo_name')",
            },
        ),
        (
            ("folder_path",),
            {"type": Path, "help": "Path to the folder to be monitored"},
        ),
        (
            ("-p", "--path_in_repo"),
            {
                "type": str,
                "default": None,
                "help": "Path in the repository where the folder will be uploaded (default: None)",
            },
        ),
        (
            ("-ex", "--exclude-substring"),
            {
                "type": str,
                "default": "",
                "help": "Substring to exclude files/directories from triggering uploads (default: '')",
            },
        ),
        (
            ("-f", "--check_freq"),
            {
                "type": int,
                "default": 30,
                "help": "Frequency (in seconds) to check for changes (default: 30)",
            },
        ),
    )
    for flags, options in argument_table:
        cli.add_argument(*flags, **options)

    opts = cli.parse_args()
    validate_inputs(opts.repo_id, opts.folder_path, opts.path_in_repo)

    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)

    handler = ChangeHandler(
        repo_id=opts.repo_id,
        folder_path=opts.folder_path,
        path_in_repo=opts.path_in_repo,
        exclude_substring=opts.exclude_substring,
        delay=opts.check_freq,
    )
    watcher = Observer()
    watcher.schedule(handler, path=str(opts.folder_path), recursive=True)
    watcher.start()
    logging.info(f"Monitoring folder:\t{opts.folder_path}")

    try:
        # Idle forever; the observer delivers events from its own thread.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        watcher.stop()
        watcher.join()
        logging.info("Stopping monitoring")
# Start the folder watcher only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
"""
Upload a local folder to the Hugging Face Hub.

Requires: pip install huggingface-hub
"""
import argparse
import logging
import sys
from pathlib import Path
from huggingface_hub import upload_folder
# Configure the root logger at import time: INFO level and above, with each
# record rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
def validate_inputs(repo_id, folder_path, path_in_repo):
    """Reject malformed inputs before an upload is attempted.

    Checks run in order and the first failure raises.

    Args:
        repo_id: Repository identifier; must contain a ``/``.
        folder_path: ``pathlib.Path`` that must be an existing directory.
        path_in_repo: Optional destination inside the repository. When
            non-empty it must not start or end with ``/``.

    Raises:
        ValueError: For a ``repo_id`` without ``/`` or a ``path_in_repo``
            that starts or ends with ``/``.
        FileNotFoundError: If ``folder_path`` is missing or not a directory.
    """
    # 1) repository identifier
    if "/" not in repo_id:
        message = "Invalid repo_id format. It should be in 'username/repository' format."
        raise ValueError(message)

    # 2) local folder
    if not (folder_path.exists() and folder_path.is_dir()):
        raise FileNotFoundError(
            f"The folder path '{folder_path}' does not exist or is not a directory."
        )

    # 3) optional sub-path: stripping slashes changes the text exactly when
    #    it starts or ends with one
    if path_in_repo and path_in_repo.strip("/") != path_in_repo:
        raise ValueError("path_in_repo should not start or end with '/'.")
def main():
    """Parse the command line and upload the given folder to the Hub once.

    The inputs are validated first. The resolved folder is then passed to
    ``upload_folder`` with the ignore pattern ``*.pt*``. If anything in the
    upload step raises, the error is logged and the process exits with
    status 1.
    """
    cli = argparse.ArgumentParser(description="Upload a folder to Hugging Face Hub.")
    cli.add_argument(
        "repo_id",
        help="The repository ID on Hugging Face (e.g., 'username/repo_name')",
        type=str,
    )
    cli.add_argument(
        "folder_path",
        help="Path to the folder to be uploaded",
        type=Path,
    )
    cli.add_argument(
        "--path_in_repo",
        help="Path in the repository where the folder will be uploaded (defaults to None)",
        type=str,
        default=None,
    )
    opts = cli.parse_args()

    repo_id = opts.repo_id
    local_dir = opts.folder_path
    remote_subdir = opts.path_in_repo
    validate_inputs(repo_id, local_dir, remote_subdir)

    try:
        resolved_dir = str(local_dir.resolve())
        logging.info(f"Starting upload of folder {resolved_dir} to repo {repo_id}")
        upload_folder(
            repo_id=repo_id,
            folder_path=resolved_dir,
            path_in_repo=remote_subdir,
            ignore_patterns="*.pt*",  # ignore optimizers etc
            commit_message="manual upload with upload_folder.py",
        )
        logging.info("Upload completed successfully.")
    except Exception as err:
        logging.error(f"An error occurred: {err}")
        sys.exit(1)

    logging.info(f"Done! pushed to:\t{repo_id}")
# Perform the upload only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment