shangeth/download_common_voice_16.py

## download_common_voice_16.py
import requests
import os
import tarfile

# Hugging Face token: taken from the HF_TOKEN environment variable when it is
# set, so the secret does not have to be written into this file; otherwise the
# placeholder below must be replaced by hand before running.
hf_token = os.environ.get("HF_TOKEN", "<HF_TOKEN_HERE>")
headers = {"Authorization": f"Bearer {hf_token}"}

# Directory to save and extract files

# NOTE: `set` shadows the builtin of the same name; the name is kept because
# the rest of the script reads it.
set = "test" # train|test|dev
n_files = 1 # train=27|test=1|dev=1
save_dir = f"/root/shangeth/t0/mm-llm/data/CommonVoice/data/{set}"
# exist_ok=True creates the directory tree only when it is missing, without
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(save_dir, exist_ok=True)

# Base URL for the files
base_url = f"https://huggingface.co/datasets/mozilla-foundation/common_voice_16_1/resolve/main/audio/en/{set}/"

# Function to download and extract files
def download_and_extract(file_name):
    """Download one tar archive and unpack it into ``save_dir``.

    The archive is fetched from ``base_url + file_name`` using the
    module-level ``headers``, streamed to ``save_dir/file_name``, extracted
    into ``save_dir`` and then deleted.  The temporary tar file is removed
    even when the download or the extraction fails, so an aborted run does
    not leave a truncated archive behind.

    Args:
        file_name: Name of the tar file, relative to ``base_url``.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If connecting or reading stalls past the timeout.
    """
    url = f"{base_url}{file_name}"
    local_filename = os.path.join(save_dir, file_name)
    try:
        # Download the file.  The (connect, read) timeout keeps a stalled
        # connection from blocking the script forever; the read timeout is
        # applied per socket read, not to the whole transfer.
        with requests.get(url, headers=headers, stream=True, timeout=(10, 60)) as r:
            r.raise_for_status()
            with open(local_filename, 'wb') as f:
                # 1 MiB chunks: far fewer Python-level iterations than 8 KiB
                # while still keeping memory use small.
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        # Extract the tar file
        with tarfile.open(local_filename) as tar:
            if hasattr(tarfile, "data_filter"):
                # The "data" filter refuses members that would be written
                # outside save_dir (absolute paths, "..", escaping links).
                tar.extractall(path=save_dir, filter="data")
            else:
                # Older Python without extraction filters: legacy behaviour.
                tar.extractall(path=save_dir)
    finally:
        # Remove the tar file after extraction (or after a failure).
        if os.path.exists(local_filename):
            os.remove(local_filename)

# Fetch every archive of the selected split, indices 0 .. n_files - 1, in order
archive_names = (f"en_{set}_{shard_index}.tar" for shard_index in range(n_files))
for file_name in archive_names:
    print(f"Downloading and extracting {file_name}...")
    download_and_extract(file_name)

print("All files downloaded and extracted.")
	import requests
	import os
	import tarfile

	# Hugging Face token: taken from the HF_TOKEN environment variable when it
	# is set, so the secret does not have to be written into this file;
	# otherwise the placeholder below must be replaced by hand before running.
	hf_token = os.environ.get("HF_TOKEN", "<HF_TOKEN_HERE>")
	headers = {"Authorization": f"Bearer {hf_token}"}

	# Directory to save and extract files

	# NOTE: `set` shadows the builtin of the same name; the name is kept
	# because the rest of the script reads it.
	set = "test" # train|test|dev
	n_files = 1 # train=27|test=1|dev=1
	save_dir = f"/root/shangeth/t0/mm-llm/data/CommonVoice/data/{set}"
	# exist_ok=True creates the directory tree only when it is missing,
	# without the check-then-create race of a separate os.path.exists() test.
	os.makedirs(save_dir, exist_ok=True)

	# Base URL for the files
	base_url = f"https://huggingface.co/datasets/mozilla-foundation/common_voice_16_1/resolve/main/audio/en/{set}/"

	# Function to download and extract files
	def download_and_extract(file_name):
		"""Download one tar archive and unpack it into ``save_dir``.

		The archive is fetched from ``base_url + file_name`` using the
		module-level ``headers``, streamed to ``save_dir/file_name``,
		extracted into ``save_dir`` and then deleted.  The temporary tar
		file is removed even when the download or the extraction fails.

		Args:
			file_name: Name of the tar file, relative to ``base_url``.

		Raises:
			requests.HTTPError: If the server responds with an error status.
			requests.Timeout: If connecting or reading stalls past the timeout.
		"""
		url = f"{base_url}{file_name}"
		local_filename = os.path.join(save_dir, file_name)
		try:
			# Download the file.  The (connect, read) timeout keeps a stalled
			# connection from blocking the script forever.
			with requests.get(url, headers=headers, stream=True, timeout=(10, 60)) as r:
				r.raise_for_status()
				with open(local_filename, 'wb') as f:
					for chunk in r.iter_content(chunk_size=1024 * 1024):
						f.write(chunk)
			# Extract the tar file
			with tarfile.open(local_filename) as tar:
				if hasattr(tarfile, "data_filter"):
					# The "data" filter refuses members that would be
					# written outside save_dir.
					tar.extractall(path=save_dir, filter="data")
				else:
					# Older Python without extraction filters.
					tar.extractall(path=save_dir)
		finally:
			# Remove the tar file after extraction (or after a failure).
			if os.path.exists(local_filename):
				os.remove(local_filename)

	# Loop through the file range: indices 0 .. n_files - 1
	for i in range(n_files):
		file_name = f"en_{set}_{i}.tar"
		print(f"Downloading and extracting {file_name}...")
		download_and_extract(file_name)

	print("All files downloaded and extracted.")