Last active
February 5, 2023 20:14
-
-
Save Edresson/752ecaba9b2f10a2b80a2af89fd20d99 to your computer and use it in GitHub Desktop.
Generate Common Voice 7.0 download links, one for each language code listed in the script's LANGUAGES table
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Reference: https://github.com/huggingface/datasets/blob/master/datasets/common_voice/common_voice.py
#
# The earlier cv-corpus-6.1-2020-12-11 release used this archive pattern:
#   https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-6.1-2020-12-11/{}.tar.gz
#
# URL template for a single-language archive of the cv-corpus-7.0-2021-07-21
# release. The "{}" placeholder is filled with a language code via str.format.
DATA_URL = "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-7.0-2021-07-21/cv-corpus-7.0-2021-07-21-{}.tar.gz"
# Per-language metadata keyed by Common Voice language code.
#
# Each entry holds the display name ("Language"), a release date stamp
# ("Date"), the archive size as a human-readable string ("Size"), a version
# tag ("Version"), validated and overall hour totals, and a voice count.
#
# NOTE(review): every "Date"/"Version" value carries the 2020-12-11 stamp,
# which matches the cv-corpus-6.1-2020-12-11 release rather than the 7.0
# archives; the figures presumably describe that older release — confirm
# before relying on them for 7.0.
LANGUAGES = {
    "ab": {
        "Language": "Abkhaz",
        "Date": "2020-12-11",
        "Size": "39 MB",
        "Version": "ab_1h_2020-12-11",
        "Validated_Hr_Total": 0.05,
        "Overall_Hr_Total": 1,
        "Number_Of_Voice": 14,
    },
    "ar": {
        "Language": "Arabic",
        "Date": "2020-12-11",
        "Size": "2 GB",
        "Version": "ar_77h_2020-12-11",
        "Validated_Hr_Total": 49,
        "Overall_Hr_Total": 77,
        "Number_Of_Voice": 672,
    },
    "as": {
        "Language": "Assamese",
        "Date": "2020-12-11",
        "Size": "21 MB",
        "Version": "as_0.78h_2020-12-11",
        "Validated_Hr_Total": 0.74,
        "Overall_Hr_Total": 0.78,
        "Number_Of_Voice": 17,
    },
    "br": {
        "Language": "Breton",
        "Date": "2020-12-11",
        "Size": "444 MB",
        "Version": "br_16h_2020-12-11",
        "Validated_Hr_Total": 7,
        "Overall_Hr_Total": 16,
        "Number_Of_Voice": 157,
    },
    "ca": {
        "Language": "Catalan",
        "Date": "2020-12-11",
        "Size": "19 GB",
        "Version": "ca_748h_2020-12-11",
        "Validated_Hr_Total": 623,
        "Overall_Hr_Total": 748,
        "Number_Of_Voice": 5376,
    },
    # NOTE(review): apart from "Language", this entry is identical to "ab"
    # (including the "ab_" Version prefix) — looks like a copy-paste
    # placeholder; verify against the Common Voice release data.
    "cnh": {
        "Language": "Hakha Chin",
        "Date": "2020-12-11",
        "Size": "39 MB",
        "Version": "ab_1h_2020-12-11",
        "Validated_Hr_Total": 0.05,
        "Overall_Hr_Total": 1,
        "Number_Of_Voice": 14,
    },
    # NOTE(review): apart from "Language", this entry is identical to "ab"
    # (including the "ab_" Version prefix) — looks like a copy-paste
    # placeholder; verify against the Common Voice release data.
    "cs": {
        "Language": "Czech",
        "Date": "2020-12-11",
        "Size": "39 MB",
        "Version": "ab_1h_2020-12-11",
        "Validated_Hr_Total": 0.05,
        "Overall_Hr_Total": 1,
        "Number_Of_Voice": 14,
    },
    "cv": {
        "Language": "Chuvash",
        "Date": "2020-12-11",
        "Size": "419 MB",
        "Version": "cv_16h_2020-12-11",
        "Validated_Hr_Total": 4,
        "Overall_Hr_Total": 16,
        "Number_Of_Voice": 92,
    },
    "cy": {
        "Language": "Welsh",
        "Date": "2020-12-11",
        "Size": "3 GB",
        "Version": "cy_124h_2020-12-11",
        "Validated_Hr_Total": 95,
        "Overall_Hr_Total": 124,
        "Number_Of_Voice": 1382,
    },
    "de": {
        "Language": "German",
        "Date": "2020-12-11",
        "Size": "22 GB",
        "Version": "de_836h_2020-12-11",
        "Validated_Hr_Total": 777,
        "Overall_Hr_Total": 836,
        "Number_Of_Voice": 12659,
    },
    "dv": {
        "Language": "Dhivehi",
        "Date": "2020-12-11",
        "Size": "515 MB",
        "Version": "dv_19h_2020-12-11",
        "Validated_Hr_Total": 18,
        "Overall_Hr_Total": 19,
        "Number_Of_Voice": 167,
    },
    "el": {
        "Language": "Greek",
        "Date": "2020-12-11",
        "Size": "364 MB",
        "Version": "el_13h_2020-12-11",
        "Validated_Hr_Total": 6,
        "Overall_Hr_Total": 13,
        "Number_Of_Voice": 118,
    },
    "en": {
        "Language": "English",
        "Date": "2020-12-11",
        "Size": "56 GB",
        "Version": "en_2181h_2020-12-11",
        "Validated_Hr_Total": 1686,
        "Overall_Hr_Total": 2181,
        "Number_Of_Voice": 66173,
    },
    "eo": {
        "Language": "Esperanto",
        "Date": "2020-12-11",
        "Size": "3 GB",
        "Version": "eo_102h_2020-12-11",
        "Validated_Hr_Total": 90,
        "Overall_Hr_Total": 102,
        "Number_Of_Voice": 574,
    },
    "es": {
        "Language": "Spanish",
        "Date": "2020-12-11",
        "Size": "15 GB",
        "Version": "es_579h_2020-12-11",
        "Validated_Hr_Total": 324,
        "Overall_Hr_Total": 579,
        "Number_Of_Voice": 19484,
    },
    "et": {
        "Language": "Estonian",
        "Date": "2020-12-11",
        "Size": "732 MB",
        "Version": "et_27h_2020-12-11",
        "Validated_Hr_Total": 19,
        "Overall_Hr_Total": 27,
        "Number_Of_Voice": 543,
    },
    "eu": {
        "Language": "Basque",
        "Date": "2020-12-11",
        "Size": "3 GB",
        "Version": "eu_131h_2020-12-11",
        "Validated_Hr_Total": 89,
        "Overall_Hr_Total": 131,
        "Number_Of_Voice": 1028,
    },
    "fa": {
        "Language": "Persian",
        "Date": "2020-12-11",
        "Size": "8 GB",
        "Version": "fa_321h_2020-12-11",
        "Validated_Hr_Total": 282,
        "Overall_Hr_Total": 321,
        "Number_Of_Voice": 3655,
    },
    "fi": {
        "Language": "Finnish",
        "Date": "2020-12-11",
        "Size": "48 MB",
        "Version": "fi_1h_2020-12-11",
        "Validated_Hr_Total": 1,
        "Overall_Hr_Total": 1,
        "Number_Of_Voice": 27,
    },
    "fr": {
        "Language": "French",
        "Date": "2020-12-11",
        "Size": "18 GB",
        "Version": "fr_682h_2020-12-11",
        "Validated_Hr_Total": 623,
        "Overall_Hr_Total": 682,
        "Number_Of_Voice": 12953,
    },
    "fy-NL": {
        "Language": "Frisian",
        "Date": "2020-12-11",
        "Size": "1 GB",
        "Version": "fy-NL_46h_2020-12-11",
        "Validated_Hr_Total": 14,
        "Overall_Hr_Total": 46,
        "Number_Of_Voice": 467,
    },
    "ga-IE": {
        "Language": "Irish",
        "Date": "2020-12-11",
        "Size": "149 MB",
        "Version": "ga-IE_5h_2020-12-11",
        "Validated_Hr_Total": 3,
        "Overall_Hr_Total": 5,
        "Number_Of_Voice": 101,
    },
    "hi": {
        "Language": "Hindi",
        "Date": "2020-12-11",
        "Size": "20 MB",
        "Version": "hi_0.8h_2020-12-11",
        "Validated_Hr_Total": 0.54,
        "Overall_Hr_Total": 0.8,
        "Number_Of_Voice": 31,
    },
    "hsb": {
        "Language": "Sorbian, Upper",
        "Date": "2020-12-11",
        "Size": "76 MB",
        "Version": "hsb_2h_2020-12-11",
        "Validated_Hr_Total": 2,
        "Overall_Hr_Total": 2,
        "Number_Of_Voice": 19,
    },
    "hu": {
        "Language": "Hungarian",
        "Date": "2020-12-11",
        "Size": "232 MB",
        "Version": "hu_8h_2020-12-11",
        "Validated_Hr_Total": 8,
        "Overall_Hr_Total": 8,
        "Number_Of_Voice": 47,
    },
    "ia": {
        "Language": "InterLinguia",
        "Date": "2020-12-11",
        "Size": "216 MB",
        "Version": "ia_8h_2020-12-11",
        "Validated_Hr_Total": 6,
        "Overall_Hr_Total": 8,
        "Number_Of_Voice": 36,
    },
    "id": {
        "Language": "Indonesian",
        "Date": "2020-12-11",
        "Size": "454 MB",
        "Version": "id_17h_2020-12-11",
        "Validated_Hr_Total": 9,
        "Overall_Hr_Total": 17,
        "Number_Of_Voice": 219,
    },
    "it": {
        "Language": "Italian",
        "Date": "2020-12-11",
        "Size": "5 GB",
        "Version": "it_199h_2020-12-11",
        "Validated_Hr_Total": 158,
        "Overall_Hr_Total": 199,
        "Number_Of_Voice": 5729,
    },
    "ja": {
        "Language": "Japanese",
        "Date": "2020-12-11",
        "Size": "146 MB",
        "Version": "ja_5h_2020-12-11",
        "Validated_Hr_Total": 3,
        "Overall_Hr_Total": 5,
        "Number_Of_Voice": 235,
    },
    "ka": {
        "Language": "Georgian",
        "Date": "2020-12-11",
        "Size": "99 MB",
        "Version": "ka_3h_2020-12-11",
        "Validated_Hr_Total": 3,
        "Overall_Hr_Total": 3,
        "Number_Of_Voice": 44,
    },
    "kab": {
        "Language": "Kabyle",
        "Date": "2020-12-11",
        "Size": "16 GB",
        "Version": "kab_622h_2020-12-11",
        "Validated_Hr_Total": 525,
        "Overall_Hr_Total": 622,
        "Number_Of_Voice": 1309,
    },
    "ky": {
        "Language": "Kyrgyz",
        "Date": "2020-12-11",
        "Size": "553 MB",
        "Version": "ky_22h_2020-12-11",
        "Validated_Hr_Total": 11,
        "Overall_Hr_Total": 22,
        "Number_Of_Voice": 134,
    },
    "lg": {
        "Language": "Luganda",
        "Date": "2020-12-11",
        "Size": "199 MB",
        "Version": "lg_8h_2020-12-11",
        "Validated_Hr_Total": 3,
        "Overall_Hr_Total": 8,
        "Number_Of_Voice": 76,
    },
    "lt": {
        "Language": "Lithuanian",
        "Date": "2020-12-11",
        "Size": "129 MB",
        "Version": "lt_4h_2020-12-11",
        "Validated_Hr_Total": 2,
        "Overall_Hr_Total": 4,
        "Number_Of_Voice": 30,
    },
    "lv": {
        "Language": "Latvian",
        "Date": "2020-12-11",
        "Size": "199 MB",
        "Version": "lv_7h_2020-12-11",
        "Validated_Hr_Total": 6,
        "Overall_Hr_Total": 7,
        "Number_Of_Voice": 99,
    },
    "mn": {
        "Language": "Mongolian",
        "Date": "2020-12-11",
        "Size": "464 MB",
        "Version": "mn_17h_2020-12-11",
        "Validated_Hr_Total": 11,
        "Overall_Hr_Total": 17,
        "Number_Of_Voice": 376,
    },
    "mt": {
        "Language": "Maltese",
        "Date": "2020-12-11",
        "Size": "405 MB",
        "Version": "mt_15h_2020-12-11",
        "Validated_Hr_Total": 7,
        "Overall_Hr_Total": 15,
        "Number_Of_Voice": 171,
    },
    "nl": {
        "Language": "Dutch",
        "Date": "2020-12-11",
        "Size": "2 GB",
        "Version": "nl_63h_2020-12-11",
        "Validated_Hr_Total": 59,
        "Overall_Hr_Total": 63,
        "Number_Of_Voice": 1012,
    },
    "or": {
        "Language": "Odia",
        "Date": "2020-12-11",
        "Size": "190 MB",
        "Version": "or_7h_2020-12-11",
        "Validated_Hr_Total": 0.87,
        "Overall_Hr_Total": 7,
        "Number_Of_Voice": 34,
    },
    "pa-IN": {
        "Language": "Punjabi",
        "Date": "2020-12-11",
        "Size": "67 MB",
        "Version": "pa-IN_2h_2020-12-11",
        "Validated_Hr_Total": 0.5,
        "Overall_Hr_Total": 2,
        "Number_Of_Voice": 26,
    },
    "pl": {
        "Language": "Polish",
        "Date": "2020-12-11",
        "Size": "3 GB",
        "Version": "pl_129h_2020-12-11",
        "Validated_Hr_Total": 108,
        "Overall_Hr_Total": 129,
        "Number_Of_Voice": 2647,
    },
    "pt": {
        "Language": "Portuguese",
        "Date": "2020-12-11",
        "Size": "2 GB",
        "Version": "pt_63h_2020-12-11",
        "Validated_Hr_Total": 50,
        "Overall_Hr_Total": 63,
        "Number_Of_Voice": 1120,
    },
    "rm-sursilv": {
        "Language": "Romansh Sursilvan",
        "Date": "2020-12-11",
        "Size": "263 MB",
        "Version": "rm-sursilv_9h_2020-12-11",
        "Validated_Hr_Total": 5,
        "Overall_Hr_Total": 9,
        "Number_Of_Voice": 78,
    },
    "rm-vallader": {
        "Language": "Romansh Vallader",
        "Date": "2020-12-11",
        "Size": "103 MB",
        "Version": "rm-vallader_3h_2020-12-11",
        "Validated_Hr_Total": 2,
        "Overall_Hr_Total": 3,
        "Number_Of_Voice": 39,
    },
    "ro": {
        "Language": "Romanian",
        "Date": "2020-12-11",
        "Size": "250 MB",
        "Version": "ro_9h_2020-12-11",
        "Validated_Hr_Total": 6,
        "Overall_Hr_Total": 9,
        "Number_Of_Voice": 130,
    },
    "ru": {
        "Language": "Russian",
        "Date": "2020-12-11",
        "Size": "3 GB",
        "Version": "ru_130h_2020-12-11",
        "Validated_Hr_Total": 111,
        "Overall_Hr_Total": 130,
        "Number_Of_Voice": 1412,
    },
    "rw": {
        "Language": "Kinyarwanda",
        "Date": "2020-12-11",
        "Size": "40 GB",
        "Version": "rw_1510h_2020-12-11",
        "Validated_Hr_Total": 1183,
        "Overall_Hr_Total": 1510,
        "Number_Of_Voice": 410,
    },
    "sah": {
        "Language": "Sakha",
        "Date": "2020-12-11",
        "Size": "173 MB",
        "Version": "sah_6h_2020-12-11",
        "Validated_Hr_Total": 4,
        "Overall_Hr_Total": 6,
        "Number_Of_Voice": 42,
    },
    "sl": {
        "Language": "Slovenian",
        "Date": "2020-12-11",
        "Size": "212 MB",
        "Version": "sl_7h_2020-12-11",
        "Validated_Hr_Total": 5,
        "Overall_Hr_Total": 7,
        "Number_Of_Voice": 82,
    },
    "sv-SE": {
        "Language": "Swedish",
        "Date": "2020-12-11",
        "Size": "402 MB",
        "Version": "sv-SE_15h_2020-12-11",
        "Validated_Hr_Total": 12,
        "Overall_Hr_Total": 15,
        "Number_Of_Voice": 222,
    },
    "ta": {
        "Language": "Tamil",
        "Date": "2020-12-11",
        "Size": "648 MB",
        "Version": "ta_24h_2020-12-11",
        "Validated_Hr_Total": 14,
        "Overall_Hr_Total": 24,
        "Number_Of_Voice": 266,
    },
    "th": {
        "Language": "Thai",
        "Date": "2020-12-11",
        "Size": "325 MB",
        "Version": "th_12h_2020-12-11",
        "Validated_Hr_Total": 8,
        "Overall_Hr_Total": 12,
        "Number_Of_Voice": 182,
    },
    "tr": {
        "Language": "Turkish",
        "Date": "2020-12-11",
        "Size": "592 MB",
        "Version": "tr_22h_2020-12-11",
        "Validated_Hr_Total": 20,
        "Overall_Hr_Total": 22,
        "Number_Of_Voice": 678,
    },
    "tt": {
        "Language": "Tatar",
        "Date": "2020-12-11",
        "Size": "741 MB",
        "Version": "tt_28h_2020-12-11",
        "Validated_Hr_Total": 26,
        "Overall_Hr_Total": 28,
        "Number_Of_Voice": 185,
    },
    "uk": {
        "Language": "Ukrainian",
        "Date": "2020-12-11",
        "Size": "1 GB",
        "Version": "uk_43h_2020-12-11",
        "Validated_Hr_Total": 30,
        "Overall_Hr_Total": 43,
        "Number_Of_Voice": 459,
    },
    "vi": {
        "Language": "Vietnamese",
        "Date": "2020-12-11",
        "Size": "50 MB",
        "Version": "vi_1h_2020-12-11",
        "Validated_Hr_Total": 0.74,
        "Overall_Hr_Total": 1,
        "Number_Of_Voice": 62,
    },
    "vot": {
        "Language": "Votic",
        "Date": "2020-12-11",
        "Size": "7 MB",
        "Version": "vot_0.28h_2020-12-11",
        "Validated_Hr_Total": 0,
        "Overall_Hr_Total": 0.28,
        "Number_Of_Voice": 3,
    },
    "zh-CN": {
        "Language": "Chinese (China)",
        "Date": "2020-12-11",
        "Size": "2 GB",
        "Version": "zh-CN_78h_2020-12-11",
        "Validated_Hr_Total": 56,
        "Overall_Hr_Total": 78,
        "Number_Of_Voice": 3501,
    },
    "zh-HK": {
        "Language": "Chinese (Hong Kong)",
        "Date": "2020-12-11",
        "Size": "3 GB",
        "Version": "zh-HK_100h_2020-12-11",
        "Validated_Hr_Total": 50,
        "Overall_Hr_Total": 100,
        "Number_Of_Voice": 2536,
    },
    "zh-TW": {
        "Language": "Chinese (Taiwan)",
        "Date": "2020-12-11",
        "Size": "2 GB",
        "Version": "zh-TW_78h_2020-12-11",
        "Validated_Hr_Total": 55,
        "Overall_Hr_Total": 78,
        "Number_Of_Voice": 1444,
    },
}
# Emit one download URL per language code, in the table's insertion order.
# Iterating a dict yields its keys, so no explicit .keys() call is needed.
for lang_id in LANGUAGES:
    print(DATA_URL.format(lang_id))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for this file.