Created
July 31, 2018 12:19
-
-
Save flodolo/23844c17dc349542627e1367d061b7ee to your computer and use it in GitHub Desktop.
Extract language names from CLDR for bug 1476781
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3 | |
# Needs clones of these repositories in the same path as the script | |
# https://github.com/unicode-cldr/cldr-misc-full/ | |
# https://github.com/unicode-cldr/cldr-localenames-full | |
import json | |
import os | |
from collections import OrderedDict | |
from urllib.request import urlopen | |
# Mozilla locale code -> CLDR locale code, listed only for the codes where
# the two differ (e.g. "es-ES" is looked up in CLDR as "es"). Lookups fall
# back to the Mozilla code itself when a locale is not listed here.
locale_mapping = {
    # Region (or variant) suffix dropped for CLDR
    "bn-BD": "bn",
    "en-US": "en",
    "es-ES": "es",
    "fy-NL": "fy",
    "ga-IE": "ga",
    "gu-IN": "gu",
    "hi-IN": "hi",
    "hy-AM": "hy",
    "ja-JP-mac": "ja",
    "nb-NO": "nb",
    "ne-NP": "ne",
    "nn-NO": "nn",
    "pa-IN": "pa",
    "pt-BR": "pt",
    "sv-SE": "sv",
    # Chinese is identified by script in CLDR rather than by region
    "zh-CN": "zh-Hans",
    "zh-TW": "zh-Hant",
}
def getShippingLocales(shipping_locales, locales_urls=None):
    """Extend *shipping_locales* in place with the locales shipping in Firefox.

    Every URL is read as a plain-text file with one locale code per line.
    After all URLs have been processed the list is de-duplicated and sorted
    in place, so the caller's list object reflects the final result.

    Args:
        shipping_locales: List to fill. Codes already present are kept.
        locales_urls: Optional iterable of URLs to read. Defaults to the
            desktop and Android ``all-locales`` files in mozilla-central.

    Returns:
        The same list object that was passed in, for convenience.
    """
    if locales_urls is None:
        locales_urls = [
            'https://hg.mozilla.org/mozilla-central/raw-file/default/browser/locales/all-locales',
            'https://hg.mozilla.org/mozilla-central/raw-file/default/mobile/android/locales/all-locales',
        ]

    for locales_url in locales_urls:
        try:
            with urlopen(locales_url) as response:
                for raw_line in response.readlines():
                    locale = raw_line.decode('utf-8').strip()
                    # Blank lines would otherwise end up as an empty
                    # locale code in the result.
                    if locale:
                        shipping_locales.append(locale)
        except Exception as e:
            # Best effort: report the failure and carry on with the
            # remaining URLs instead of aborting the whole run.
            print(e)

    # Slice assignment mutates the caller's list. Rebinding the local name
    # (``shipping_locales = ...``) would silently discard the de-duplicated,
    # sorted result when the function returns.
    shipping_locales[:] = sorted(set(shipping_locales))
    return shipping_locales
def _load_json(path):
    """Return the parsed content of the UTF-8 encoded JSON file at *path*."""
    with open(path, encoding='utf-8') as data_file:
        return json.load(data_file)


def _titlecase_first_word(text):
    """Return *text* with only its first character upper-cased."""
    # str.capitalize() would also lowercase the rest of the string and
    # damage names that contain further capital letters.
    return text[:1].upper() + text[1:]


def main():
    """Build output.json, mapping each shipping locale to its own name.

    For every locale returned by getShippingLocales() the language name is
    read from the local ``cldr-localenames-full`` clone and, when the
    ``cldr-misc-full`` clone requests ``titlecase-firstword`` for languages
    in UI lists or menus, its first letter is upper-cased. Locales without
    CLDR data, or without a name, are stored as 'N/A'. A summary of the
    problems found is printed before the JSON file is written.
    """
    # Path to this script
    script_folder = os.path.abspath(os.path.dirname(__file__))

    shipping_locales = []
    getShippingLocales(shipping_locales)
    # The same locale can be listed by more than one product: process each
    # code once, in a stable order, and ignore empty codes.
    shipping_locales = sorted({code for code in shipping_locales if code})

    log = {
        'no-cldr': [],
        'missing-transform': [],
        'missing-name': [],
        'capitalized': [],
    }

    languages = OrderedDict()
    for locale in shipping_locales:
        cldr_locale = locale_mapping.get(locale, locale)
        cldr_path_names = os.path.join(
            script_folder, 'cldr-localenames-full', 'main', cldr_locale)
        cldr_path_transforms = os.path.join(
            script_folder, 'cldr-misc-full', 'main', cldr_locale)

        # Check if folder exists in CLDR
        if not os.path.isdir(cldr_path_names):
            log['no-cldr'].append(cldr_locale)
            languages[locale] = 'N/A'
            continue

        # Read transform. Possible values are
        # 'titlecase-firstword': title case
        # 'no-change': no change from the language name
        text_transformation = 'no-change'
        transform_file = os.path.join(
            cldr_path_transforms, 'contextTransforms.json')
        if os.path.isfile(transform_file):
            json_data = _load_json(transform_file)
            try:
                text_transformation = json_data['main'][cldr_locale]['contextTransforms']['languages']['uiListOrMenu']
            except (KeyError, TypeError):
                log['missing-transform'].append(locale)

        # Read language name. Reset the default for every locale so that a
        # missing file can never reuse the previous locale's name.
        language_name = 'N/A'
        language_file = os.path.join(cldr_path_names, 'languages.json')
        if os.path.isfile(language_file):
            json_data = _load_json(language_file)
            try:
                language_name = json_data['main'][cldr_locale]['localeDisplayNames']['languages'][cldr_locale]
            except (KeyError, TypeError):
                log['missing-name'].append(locale)
        else:
            log['missing-name'].append(locale)

        # Apply text transform
        if text_transformation == 'titlecase-firstword' and language_name != 'N/A':
            updated_name = _titlecase_first_word(language_name)
            print('Language name capitalized for {}.\nOriginal: {}\nUpdated {}'.format(
                cldr_locale, language_name, updated_name
            ))
            language_name = updated_name
            log['capitalized'].append(locale)

        languages[locale] = language_name

    for locales in log.values():
        locales.sort()

    print('Locales not available in CLDR: {}'.format(', '.join(log['no-cldr'])))
    print('Locales missing language name: {}'.format(', '.join(log['missing-name'])))
    print('Locales missing context transform: {}'.format(', '.join(log['missing-transform'])))
    print('Locales capitalized: {}'.format(', '.join(log['capitalized'])))

    # ensure_ascii=False writes native-script names verbatim, so the file
    # encoding must not depend on the platform default.
    with open('output.json', 'w', encoding='utf-8') as f:
        json.dump(languages, f, ensure_ascii=False, indent=2, sort_keys=True)
    print('JSON saved as output.json')
# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment