Skip to content

Instantly share code, notes, and snippets.

Created February 9, 2025 13:33
Show Gist options
  • Save jaju/ca82b29000d88689a67bc0aa7f7e0e0d to your computer and use it in GitHub Desktop.
Save jaju/ca82b29000d88689a67bc0aa7f7e0e0d to your computer and use it in GitHub Desktop.
Shell helpers for quick, lightweight navigation and listing of Hugging Face models and datasets. Includes macOS-specific settings: if an external volume is mounted at "/Volumes/External" and contains a huggingface directory, it takes priority and is used as HF_HOME.
# Collection of routines that will be sourced in a shell to manage huggingface settings and environment variables
# Requires the huggingface_hub Python package, a Python executable named "python" (not "python3"), fzf, and yq (used by hf_list_dataset)
# Directory containing this script; hf_shell_fns.py is imported from here.
# When sourced in bash, $0 is the shell name rather than this file, so prefer
# BASH_SOURCE and fall back to $0 (which is the sourced file in zsh).
_EXEDIR=$(dirname "${BASH_SOURCE[0]:-$0}")
# Python prelude shared by every helper: puts $_EXEDIR on sys.path and
# imports the companion module under the short name "hf".
_PRELUDE="import sys; sys.path.insert(0, '$_EXEDIR'); import hf_shell_fns as hf"
# Print the HF_HOME value reported by the hf_shell_fns Python module.
function hf_home() {
    python -c "$_PRELUDE; print(hf.hf_home())"
}
# Function to print the HF_HUB_CACHE variable from huggingface hub library.
# This honors the HF_HOME variable if it is set.
function hf_hub_cache() {
    python -c "$_PRELUDE; print(hf.cache_dir())"
}
# Print the repo ids of all cached models, one per line.
function hf_list_models() {
    python -c "$_PRELUDE; print(hf.list_models())"
}
# Print the repo ids of all cached datasets, one per line.
function hf_list_datasets() {
    python -c "$_PRELUDE; print(hf.list_datasets())"
}
# Show a short summary of one cached model.
# Usage: hf_list_model [repo_id]
# With no argument (and no $model already set by the caller) the model is
# picked interactively with fzf from the output of hf_list_models.
function hf_list_model() {
    # Keep the selection local so a previous pick does not silently stick
    # around and suppress the fzf prompt on the next call.
    local model="${1:-$model}"
    if [ -z "$model" ]; then
        model=$(hf_list_models | fzf)
    fi
    # fzf was cancelled or nothing is cached: nothing to describe.
    [ -n "$model" ] || return 1
    # Pass the repo id through argv instead of splicing it into the Python
    # source, so quotes in the value cannot break or alter the snippet.
    python -c "$_PRELUDE; print(hf.list_model(sys.argv[1]))" "$model"
}
# Show the YAML metadata of one cached dataset, pretty-printed by yq.
# Usage: hf_list_dataset [repo_id]
# With no argument (and no $dataset already set by the caller) the dataset is
# picked interactively with fzf from the output of hf_list_datasets.
function hf_list_dataset() {
    # Keep the selection local so a previous pick does not silently stick
    # around and suppress the fzf prompt on the next call.
    local dataset="${1:-$dataset}"
    if [ -z "$dataset" ]; then
        dataset=$(hf_list_datasets | fzf)
    fi
    # fzf was cancelled or nothing is cached: nothing to describe.
    [ -n "$dataset" ] || return 1
    # Pass the repo id through argv instead of splicing it into the Python
    # source, so quotes in the value cannot break or alter the snippet.
    python -c "$_PRELUDE; print(hf.list_dataset(sys.argv[1]))" "$dataset" | yq .
}
# If the external device is mounted, and it has the huggingface directory, set the HF_HOME to the external device
function hf_reset() {
    # NOTE(review): parts of this function were missing from the source this
    # was recovered from; the export below is reconstructed from the comment
    # above and the gist description -- confirm against the original.
    local mount_point="${EXTERNAL_DEVICE_MOUNT_POINT:-/Volumes/External}"
    if [ -d "$mount_point/huggingface" ]; then
        export HF_HOME="$mount_point/huggingface"
        echo "HF_HOME set to $HF_HOME"
    fi
}
import sys
import json
import re
import pathlib
from pathlib import Path
from textwrap import dedent
from huggingface_hub import scan_cache_dir
from huggingface_hub.constants import HF_HOME, HF_HUB_CACHE
def hf_home():
    """Return the ``HF_HOME`` constant imported from ``huggingface_hub.constants``."""
    home = HF_HOME
    return home
def cache_dir():
    """Return the ``HF_HUB_CACHE`` constant imported from ``huggingface_hub.constants``."""
    return HF_HUB_CACHE
def list_models():
    """Return the repo ids of all cached model repos, one per line."""
    cache_info = scan_cache_dir(HF_HUB_CACHE)
    model_ids = []
    for repo in cache_info.repos:
        if repo.repo_type == 'model':
            model_ids.append(repo.repo_id)
    return "\n".join(model_ids)
def list_datasets():
    """Return the repo ids of all cached dataset repos, one per line."""
    cache_info = scan_cache_dir(HF_HUB_CACHE)
    dataset_ids = []
    for repo in cache_info.repos:
        if repo.repo_type == 'dataset':
            dataset_ids.append(repo.repo_id)
    return "\n".join(dataset_ids)
def list_model(repo_id):
    """Summarize a cached model from the ``config.json`` of its newest snapshot.

    Snapshot directories under ``models--<org>--<name>/snapshots`` in the hub
    cache are tried from most to least recently modified; the first one that
    contains a ``config.json`` is used.

    Args:
        repo_id: Model repo id, e.g. ``"org/name"``.

    Returns:
        A multi-line text summary, or ``None`` (after printing a message)
        when no snapshot contains a ``config.json``.
    """
    repo_cache_path = Path(HF_HUB_CACHE) / f'models--{repo_id.replace("/", "--")}' / 'snapshots'
    snapshots = sorted(repo_cache_path.glob('*'), key=lambda x: x.stat().st_mtime, reverse=True)
    config_file = next((p / 'config.json' for p in snapshots if (p / 'config.json').exists()), None)
    if not config_file:
        print(f'Config file not found for {repo_id}')
        return None
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(config_file, 'r', encoding='utf-8') as f:
        config = json.load(f)
    # Transformers configs store the layer count as "num_hidden_layers";
    # the bare "hidden_layers" key is kept as a fallback.
    hidden_layers = config.get('num_hidden_layers', config.get('hidden_layers', 'N/A'))
    # Some configs use "quantization", Transformers uses "quantization_config".
    quantization = config.get('quantization', config.get('quantization_config', 'N/A'))
    # "architectures" may be absent or null in config.json.
    architectures = list(config.get('architectures') or [])
    return dedent(f"""
        model - {repo_id}
        model_type - {config.get('model_type', 'N/A')}
        config - {config_file}
        hidden_layers - {hidden_layers}
        torch_dtype - {config.get('torch_dtype', 'N/A')}
        quantization - {quantization}
        architectures - {architectures}
        """)
def get_yaml_front_matter(file_path):
    """Return the YAML front matter of a text file as raw text.

    The front matter is the text between a ``---`` line that opens the file
    and the next ``---`` marker.

    Args:
        file_path: Path (``str`` or path-like) of the file to read as UTF-8.

    Returns:
        The text between the two markers, without the markers themselves,
        or ``None`` if the file does not start with a front-matter block.
    """
    readme_path = pathlib.Path(file_path)
    with open(readme_path, 'r', encoding='utf-8') as f:
        content = f.read()
    # re.match anchors at the start of the file; DOTALL lets the non-greedy
    # group span multiple lines up to the first closing marker.
    match = re.match(r'^---\n(.*?)\n---', content, re.DOTALL)
    if match:
        return match.group(1)
    return None
def list_dataset(repo_id):
    """Return the YAML front matter of a cached dataset's ``README.md``.

    Snapshot directories under ``datasets--<org>--<name>/snapshots`` in the
    hub cache are tried from most to least recently modified; the first one
    that contains a ``README.md`` is used.

    Args:
        repo_id: Dataset repo id, e.g. ``"org/name"``.

    Returns:
        The raw front-matter text as returned by ``get_yaml_front_matter``
        (``None`` if the README has none), or ``None`` (after printing a
        message) when no snapshot contains a ``README.md``.
    """
    repo_cache_path = Path(HF_HUB_CACHE) / f'datasets--{repo_id.replace("/", "--")}' / 'snapshots'
    snapshots = sorted(repo_cache_path.glob('*'), key=lambda x: x.stat().st_mtime, reverse=True)
    # An empty filename here would resolve to the snapshot directory itself,
    # which "exists" but cannot be opened as a file; name the README explicitly.
    readme_file = next((p / 'README.md' for p in snapshots if (p / 'README.md').exists()), None)
    if not readme_file:
        print(f'README file not found for {repo_id}')
        return None
    return get_yaml_front_matter(readme_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment