Skip to content

Instantly share code, notes, and snippets.

@shangeth
Created March 6, 2024 02:27
Show Gist options
  • Save shangeth/3aedd9d883f94fe0c2bf2d7405628ea8 to your computer and use it in GitHub Desktop.
Save shangeth/3aedd9d883f94fe0c2bf2d7405628ea8 to your computer and use it in GitHub Desktop.
This Python script automates downloading and extracting .tar files from the Common Voice dataset on Hugging Face, using a Hugging Face token for authorization. It creates directories based on set types (e.g., "test"), downloads specified .tar files, extracts their contents, and cleans up by removing the .tar files post-extraction. Ideal for res…
import requests
import os
import tarfile
# Hugging Face token. Taken from the HF_TOKEN environment variable when it is
# set, so the secret does not have to be written into this file; otherwise
# the placeholder below must be replaced by hand.
hf_token = os.environ.get("HF_TOKEN") or "<HF_TOKEN_HERE>"
headers = {"Authorization": f"Bearer {hf_token}"}
# Directory to save and extract files
# NOTE: `set` shadows the builtin of the same name; the name is kept because
# the download loop further down in this script reads it.
set = "test"  # train|test|dev
n_files = 1  # how many en_{set}_<i>.tar archives to fetch: train=27|test=1|dev=1
save_dir = f"/root/shangeth/t0/mm-llm/data/CommonVoice/data/{set}"
# exist_ok=True replaces the check-then-create pair, which could fail if the
# directory appeared between the check and the call.
os.makedirs(save_dir, exist_ok=True)
# Base URL for the files
base_url = f"https://huggingface.co/datasets/mozilla-foundation/common_voice_16_1/resolve/main/audio/en/{set}/"
# Function to download and extract files
def download_and_extract(file_name):
    """Download one archive from ``base_url`` and unpack it into ``save_dir``.

    The archive is streamed to ``save_dir/file_name``, extracted into
    ``save_dir``, and the local ``.tar`` file is removed afterwards -- also
    when the download or the extraction fails, so no partial archive is left
    behind.

    Args:
        file_name: Name of the archive. It is appended to ``base_url`` to
            build the URL and joined with ``save_dir`` to build the local
            path.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection errors or timeouts.
        tarfile.TarError: If the downloaded file cannot be read or safely
            extracted as a tar archive.
    """
    url = f"{base_url}{file_name}"
    local_filename = os.path.join(save_dir, file_name)
    try:
        # Download the file. The (connect, read) timeout keeps a stalled
        # connection from blocking the script forever.
        with requests.get(
            url, headers=headers, stream=True, timeout=(10, 60)
        ) as r:
            r.raise_for_status()
            with open(local_filename, "wb") as f:
                # 1 MiB chunks: far fewer write calls than 8 KiB for
                # multi-gigabyte archives, still a small memory footprint.
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        # Extract the tar file
        with tarfile.open(local_filename) as tar:
            if hasattr(tarfile, "data_filter"):
                # The archive comes from the network: the "data" filter
                # rejects members that would land outside save_dir
                # (absolute paths, "..", escaping links).
                tar.extractall(path=save_dir, filter="data")
            else:
                # Interpreter without extraction filters: fall back to the
                # unfiltered behaviour.
                tar.extractall(path=save_dir)
    finally:
        # Remove the tar file after extraction (or after a failure).
        try:
            os.remove(local_filename)
        except FileNotFoundError:
            # The request failed before the file was ever created.
            pass
# Fetch every archive of the selected split; indices run 0 .. n_files - 1.
archive_names = [f"en_{set}_{i}.tar" for i in range(n_files)]
for file_name in archive_names:
    print(f"Downloading and extracting {file_name}...")
    download_and_extract(file_name)

print("All files downloaded and extracted.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment