Skip to content

Instantly share code, notes, and snippets.

@daniel-dona
Created April 25, 2024 14:54
Show Gist options
  • Save daniel-dona/e1bce1d8ab01284d019d087664127cba to your computer and use it in GitHub Desktop.
Save daniel-dona/e1bce1d8ab01284d019d087664127cba to your computer and use it in GitHub Desktop.
Obtain signed download URL for Common Voice corpus
import urllib
import urllib.parse

import requests
# Base URL and endpoint paths of the Common Voice HTTP API queried below.
api_url = "https://commonvoice.mozilla.org/api/v1"
data_endpoint = "bucket/dataset"
langs_endpoint = "datasets/languages"
downloader_endpoint = "downloaders"

# Language code and release directory name of the corpus to look up.
lang = "es"
dataset = "cv-corpus-17.0-2024-03-15"

# Seconds to wait on each HTTP call; requests has no default timeout, so
# without this an unresponsive server would block the script forever.
request_timeout = 30

# Fetch the list of dataset entries published for the chosen language.
datasets = requests.get(
    f"{api_url}/{langs_endpoint}/{lang}",
    timeout=request_timeout,
)
# Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
datasets.raise_for_status()

for entry in datasets.json():
    # Only the entry whose release directory matches the requested corpus
    # is of interest; skip everything else.
    if entry["release_dir"] != dataset:
        continue

    print("Found dataset: \n\t", entry)

    # The server-side path is a template containing a "{locale}" placeholder.
    download_path = entry["download_path"].replace("{locale}", lang)
    print("Download server path: \n\t", download_path)

    email = input("Provide an email to download the dataset:")

    # Register the downloader's email for this dataset entry.
    # NOTE(review): the original discarded this response entirely; whether
    # the API requires a successful registration before issuing a signed URL
    # is not visible here, so a failure is reported but not treated as fatal.
    report_email = requests.post(
        f"{api_url}/{lang}/{downloader_endpoint}",
        data={"email": email, "locale": lang, "dataset": entry["id"]},
        timeout=request_timeout,
    )
    if not report_email.ok:
        print("Warning: email registration returned HTTP", report_email.status_code)

    # Ask for the signed URL; the path is percent-encoded (including "/")
    # so it travels as a single URL path segment.
    signed_url = requests.get(
        f"{api_url}/{data_endpoint}/{urllib.parse.quote_plus(download_path)}",
        timeout=request_timeout,
    )
    signed_url.raise_for_status()
    print("Signed download url: \n\t", signed_url.json()["url"])
    break
else:
    # The loop finished without hitting `break`: no entry matched.
    raise SystemExit(f"Dataset {dataset!r} not found for language {lang!r}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment