Skip to content

Instantly share code, notes, and snippets.

@non-static
Created March 14, 2022 04:51
Show Gist options
  • Save non-static/b372aaae9b793337c25446d8fc3e332e to your computer and use it in GitHub Desktop.
Save non-static/b372aaae9b793337c25446d8fc3e332e to your computer and use it in GitHub Desktop.
Given a HuggingFace model repository file-listing link like this: "https://huggingface.co/bert-base-uncased/tree/main", download all files in the list concurrently, using Python asyncio.
import os
import sys
import requests
import asyncio
import aiohttp
import aiofile
from bs4 import BeautifulSoup
# Base URL of the HuggingFace site; prepended to the hrefs scraped from the listing page.
HOST = "https://huggingface.co"
def get_file_url_list(source_url):
    """Scrape a HuggingFace file-listing page and return the file URLs.

    Args:
        source_url: URL of the page to scrape, e.g.
            "https://huggingface.co/bert-base-uncased/tree/main".

    Returns:
        A list of URLs, in page order, each built by prefixing ``HOST`` to
        the ``href`` of an anchor whose class attribute matches the selector
        below. Anchors without an ``href`` are skipped.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Only a User-Agent is sent: Access-Control-* headers are CORS *response*
    # headers and have no effect when placed on a request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }
    # ``headers`` must be passed by keyword: the second positional parameter
    # of requests.get() is ``params``, which would put the dict in the query
    # string instead of the request headers.
    response = requests.get(source_url, headers=headers, timeout=30)
    # Fail loudly on 4xx/5xx rather than parsing an error page into an
    # empty (and misleading) result.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    # NOTE: this selector is tied to the site's current markup; if the page
    # layout changes, the function returns an empty list.
    all_links = soup.find_all("a", class_="col-span-4 md:col-span-2 text-right truncate leading-6 font-mono text-gray-400 text-xs xl:pr-10")
    return [f"{HOST}{link['href']}" for link in all_links if link.get('href')]
def save_files(file_url_list, target_folder):
    """Download every URL in ``file_url_list`` concurrently into ``target_folder``.

    Each file is saved under the last path segment of its URL. Individual
    download failures do not abort the other downloads; each failure is
    reported on stderr once all downloads have finished.

    Args:
        file_url_list: Iterable of absolute file URLs to download.
        target_folder: Directory to write into; created if it does not exist.
    """
    os.makedirs(target_folder, exist_ok=True)
    file_url_list = list(file_url_list)
    if not file_url_list:
        # Nothing to schedule; avoids starting an event loop for no work.
        return

    # Stream to disk in 1 MiB pieces so large files are never held fully
    # in memory.
    chunk_size = 1 << 20

    async def download_file(session, url):
        """Stream one URL to ``target_folder`` using the shared session."""
        print(f"Current task: {url}")
        file_name = url.split('/')[-1]
        print(f"File name: {file_name}")
        path = os.path.join(target_folder, file_name)
        async with session.get(url) as resp:
            # Explicit check instead of ``assert``: assertions are stripped
            # when Python runs with -O, which would let error pages be saved.
            if resp.status != 200:
                raise RuntimeError(f"Unexpected HTTP status {resp.status} for {url}")
            print(f"Download started {url}")
            try:
                async with aiofile.async_open(path, "wb") as afp:
                    print(f"Saving {file_name} ...")
                    async for chunk in resp.content.iter_chunked(chunk_size):
                        await afp.write(chunk)
            except BaseException:
                # Do not leave a truncated file behind that looks complete.
                try:
                    os.remove(path)
                except OSError:
                    pass
                raise

    async def download_all():
        """Run all downloads over one session and collect their outcomes."""
        # One session for all requests so connections are pooled and reused.
        async with aiohttp.ClientSession() as session:
            return await asyncio.gather(
                *(download_file(session, url) for url in file_url_list),
                return_exceptions=True,
            )

    # asyncio.run() creates and tears down its own loop, so this function can
    # be called more than once and does not close the default event loop.
    results = asyncio.run(download_all())
    for url, result in zip(file_url_list, results):
        if isinstance(result, BaseException):
            print(f"Download failed {url}: {result!r}", file=sys.stderr)
if __name__ == '__main__':
    # Usage: <script> <listing-page-url> <target-folder>
    # Exit with a usage message instead of an IndexError traceback when an
    # argument is missing; extra arguments are still ignored.
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} <huggingface-tree-url> <target-folder>")
    download_list = get_file_url_list(sys.argv[1])
    print(download_list)
    save_files(download_list, sys.argv[2])
    print("done!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment