Created
February 9, 2025 13:33
-
-
Save jaju/ca82b29000d88689a67bc0aa7f7e0e0d to your computer and use it in GitHub Desktop.
Shell helpers for quick, lightweight navigation and listing of Hugging Face models and datasets. macOS-specific settings are in place: if an external volume is mounted at "/Volumes/External" and contains a huggingface directory, that directory takes priority and is set as HF_HOME.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Collection of routines that will be sourced in a shell to manage huggingface settings and environment variables
# Expects huggingface_hub python dependency installed, python executable as "python" (and not python3), and fzf
# (hf_list_dataset additionally pipes its output through yq).

# Absolute directory of this file, which is where hf_shell_fns.py is looked up.
# BASH_SOURCE[0] is used when available because, in a sourced bash file, $0 is
# the shell's own name rather than this file; zsh falls back to $0.
# The path is quoted (spaces) and made absolute so that the helpers keep
# working after the caller changes directory.
_EXEDIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]:-$0}")" >/dev/null 2>&1 && pwd)

# Python snippet prepended to every `python -c` call below: puts _EXEDIR on
# sys.path and imports the helper module under the short name `hf`.
_PRELUDE="import sys; sys.path.insert(0, '$_EXEDIR'); import hf_shell_fns as hf"
# Print the value returned by hf.hf_home() from the hf_shell_fns Python helper.
hf_home() {
    python -c "${_PRELUDE}; print(hf.hf_home())"
}
# Print the HF_HUB_CACHE value as seen by the huggingface hub library
# (via hf.cache_dir()). HF_HOME is honored when it is set.
hf_hub_cache() {
    python -c "${_PRELUDE}; print(hf.cache_dir())"
}
# Print the repo ids of all cached models, one per line (via hf.list_models()).
hf_list_models() {
    python -c "${_PRELUDE}; print(hf.list_models())"
}
# Print the repo ids of all cached datasets, one per line (via hf.list_datasets()).
hf_list_datasets() {
    python -c "${_PRELUDE}; print(hf.list_datasets())"
}
# Print a short summary of a cached model's config.json.
# Usage: hf_list_model [repo_id]
#   repo_id  model repo id; when omitted, one is picked interactively with fzf
#            from the output of hf_list_models.
# Returns fzf's exit status if the picker is cancelled or nothing is selected.
function hf_list_model() {
    # `local` keeps the selection from leaking into the calling shell.
    local model="$1"
    if [ -z "$model" ]; then
        # Stop here when fzf is cancelled instead of querying an empty repo id.
        model=$(hf_list_models | fzf) || return
    fi
    # The repo id is handed over via argv (the prelude already imports sys)
    # rather than spliced into the Python source, so quotes or backslashes in
    # the argument cannot break or alter the executed code.
    python -c "$_PRELUDE; print(hf.list_model(sys.argv[1]))" "$model"
}
# Print the YAML front matter of a cached dataset's README.md, formatted by yq.
# Usage: hf_list_dataset [repo_id]
#   repo_id  dataset repo id; when omitted, one is picked interactively with
#            fzf from the output of hf_list_datasets.
# Returns fzf's exit status if the picker is cancelled or nothing is selected.
function hf_list_dataset() {
    # `local` keeps the selection from leaking into the calling shell.
    local dataset="$1"
    if [ -z "$dataset" ]; then
        # Stop here when fzf is cancelled instead of querying an empty repo id.
        dataset=$(hf_list_datasets | fzf) || return
    fi
    # The repo id is handed over via argv (the prelude already imports sys)
    # rather than spliced into the Python source, so quotes or backslashes in
    # the argument cannot break or alter the executed code.
    python -c "$_PRELUDE; print(hf.list_dataset(sys.argv[1]))" "$dataset" | yq .
}
# When the external volume is mounted and carries a huggingface directory,
# point HF_HOME at that directory.
EXTERNAL_DEVICE_MOUNT_POINT="/Volumes/External"

hf_reset() {
    # Leave the environment untouched unless <mount point>/huggingface exists.
    [ -d "${EXTERNAL_DEVICE_MOUNT_POINT}/huggingface" ] || return 0
    export HF_HOME="${EXTERNAL_DEVICE_MOUNT_POINT}/huggingface"
    echo "HF_HOME set to ${HF_HOME}"
}

# Run once when this file is sourced.
hf_reset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import json | |
import re | |
import pathlib | |
from pathlib import Path | |
from textwrap import dedent | |
from huggingface_hub import scan_cache_dir | |
from huggingface_hub.constants import HF_HOME, HF_HUB_CACHE | |
def hf_home():
    """Return the ``HF_HOME`` constant imported from ``huggingface_hub.constants``."""
    home = HF_HOME
    return home
def cache_dir():
    """Return the ``HF_HUB_CACHE`` constant imported from ``huggingface_hub.constants``."""
    hub_cache = HF_HUB_CACHE
    return hub_cache
def list_models():
    """Return the repo ids of all cached models, one per line.

    The cache at ``HF_HUB_CACHE`` is scanned with ``scan_cache_dir`` and only
    repos whose ``repo_type`` is ``'model'`` are kept.

    Returns:
        A newline-joined string of repo ids in sorted order (empty string
        when no model is cached).
    """
    cached_repos = scan_cache_dir(HF_HUB_CACHE).repos
    # The scan result is an unordered collection; sorting keeps the listing
    # stable between runs, which matters when it is fed to an interactive picker.
    model_ids = sorted(r.repo_id for r in cached_repos if r.repo_type == 'model')
    return "\n".join(model_ids)
def list_datasets():
    """Return the repo ids of all cached datasets, one per line.

    The cache at ``HF_HUB_CACHE`` is scanned with ``scan_cache_dir`` and only
    repos whose ``repo_type`` is ``'dataset'`` are kept.

    Returns:
        A newline-joined string of repo ids in sorted order (empty string
        when no dataset is cached).
    """
    cached_repos = scan_cache_dir(HF_HUB_CACHE).repos
    # The scan result is an unordered collection; sorting keeps the listing
    # stable between runs, which matters when it is fed to an interactive picker.
    dataset_ids = sorted(r.repo_id for r in cached_repos if r.repo_type == 'dataset')
    return "\n".join(dataset_ids)
def list_model(repo_id):
    """Summarise the ``config.json`` of a cached model repo.

    Args:
        repo_id: Model repo id, e.g. ``"org/name"``; ``/`` is mapped to ``--``
            to build the ``models--...`` cache directory name.

    Returns:
        A multi-line ``label - value`` summary string, or ``None`` (after
        printing a message) when no snapshot contains a ``config.json``.
    """
    repo_cache_path = Path(HF_HUB_CACHE) / f'models--{repo_id.replace("/", "--")}' / 'snapshots'
    # Most recently modified snapshot first, so the newest config wins.
    snapshots = sorted(repo_cache_path.glob('*'), key=lambda p: p.stat().st_mtime, reverse=True)
    config_file = next((p / 'config.json' for p in snapshots if (p / 'config.json').exists()), None)
    if config_file is None:
        print(f'Config file not found for {repo_id}')
        return None

    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(config_file, 'r', encoding='utf-8') as f:
        config = json.load(f)

    # Hugging Face configs store the layer count as `num_hidden_layers`; the
    # bare `hidden_layers` key is kept as a fallback for configs that use it.
    hidden_layers = config.get('num_hidden_layers', config.get('hidden_layers', 'N/A'))
    # Fall back to `quantization_config` when `quantization` is absent
    # (presumably the transformers-style spelling — confirm against real configs).
    quantization = config.get('quantization', config.get('quantization_config', 'N/A'))
    # `or []` also covers an explicit `"architectures": null`.
    architectures = list(config.get('architectures') or [])

    summary_lines = [
        f"model - {repo_id}",
        f"model_type - {config.get('model_type', 'N/A')}",
        f"config - {config_file}",
        f"hidden_layers - {hidden_layers}",
        f"torch_dtype - {config.get('torch_dtype', 'N/A')}",
        f"quantization - {quantization}",
        f"architectures - {architectures}",
    ]
    return "\n".join(summary_lines).strip()
def get_yaml_front_matter(file_path):
    """Extract the YAML front matter from a Markdown file.

    The front matter is the text between an opening ``---`` line at the very
    start of the file and the next line consisting only of ``---``.

    Args:
        file_path: Path (``str`` or ``pathlib.Path``) of the file to read.

    Returns:
        The front matter text without the ``---`` delimiter lines, or ``None``
        when the file does not start with a front matter block.
    """
    readme_path = pathlib.Path(file_path)
    # 'utf-8-sig' drops a leading byte-order mark, which would otherwise
    # prevent the opening delimiter from matching at position 0.
    with open(readme_path, 'r', encoding='utf-8-sig') as f:
        content = f.read()
    # Both delimiters must occupy a whole line (trailing blanks tolerated), so
    # a body line that merely starts with '---' does not end the block early.
    match = re.match(r'---[ \t]*\n(.*?)\n---[ \t]*(?:\n|\Z)', content, re.DOTALL)
    return match.group(1) if match else None
def list_dataset(repo_id):
    """Return the YAML front matter of a cached dataset's README.md.

    Snapshots under the repo's ``datasets--...`` cache directory are visited
    from most to least recently modified; the first one that contains a
    ``README.md`` is used. When none does, a message is printed and ``None``
    is returned.
    """
    folder_name = 'datasets--' + repo_id.replace("/", "--")
    snapshot_root = Path(HF_HUB_CACHE) / folder_name / 'snapshots'

    # Newest snapshot first.
    candidates = list(snapshot_root.glob('*'))
    candidates.sort(key=lambda entry: entry.stat().st_mtime, reverse=True)

    for snapshot in candidates:
        readme = snapshot / 'README.md'
        if readme.exists():
            return get_yaml_front_matter(readme)

    print(f'README file not found for {repo_id}')
    return None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment