Skip to content

Instantly share code, notes, and snippets.

@fvicente
Created December 18, 2017 11:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fvicente/91d7331b3a33f4e9d156253cf993f9ab to your computer and use it in GitHub Desktop.
Save fvicente/91d7331b3a33f4e9d156253cf993f9ab to your computer and use it in GitHub Desktop.
Dictionary of the most-used language by country, based on https://stackoverflow.com/a/22202770/2962940
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as etree
import urllib
import pprint
# Location of the CLDR supplemental data file that carries <territoryInfo>.
# NOTE(review): the CLDR repository has been reorganised since this script was
# written -- confirm this URL still resolves, or pass a different `url`.
CLDR_SUPPLEMENTAL_DATA_URL = (
    'http://unicode.org/repos/cldr/trunk/common/supplemental/supplementalData.xml'
)


def get_most_used_language_by_territory(url=CLDR_SUPPLEMENTAL_DATA_URL):
    """Return a dict mapping each territory code to its most used language.

    Downloads the CLDR supplemental data XML from `url` and, for every
    <territory> element under <territoryInfo>, records the `type` attribute
    of its first <languagePopulation> child.

    Args:
        url: Address of the supplemental data XML. Any scheme understood by
            `urlopen` works (e.g. ``file://`` for a local copy). Defaults to
            the CLDR location this script has always used.

    Returns:
        dict: ``{territory_code: language_code}``. Territories that list no
        languages (or whose first entry has no `type`) map to ``'en'``.

    Raises:
        AttributeError: if the document has no <territoryInfo> element.
        Any error raised by `urlopen` or the XML parser is propagated.
    """
    # Imported locally so the function works on both Python 2 and Python 3
    # without touching the file-level imports.
    from contextlib import closing
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    # closing() guarantees the connection is released even if read/parse fails
    # (the Python 2 response object is not a context manager by itself).
    with closing(urlopen(url)) as response:
        langtree = etree.XML(response.read())

    langs = {}
    for territory in langtree.find('territoryInfo').findall('territory'):
        # The languages appear to be ordered by population percent, so the
        # first one listed is taken as the most used. The 'official' status is
        # deliberately ignored: we want what serves the user best.
        first_lang = territory.find('languagePopulation')
        # Compare against None explicitly: an Element without children is
        # falsy, so a plain truthiness test would misfire here.
        if first_lang is None:
            langs[territory.get('type')] = 'en'
        else:
            langs[territory.get('type')] = first_lang.get('type', 'en')
    return langs
def main():
    """Fetch the territory-to-language mapping and pretty-print it to stdout."""
    # pprint.pprint builds a PrettyPrinter with the given indent internally.
    pprint.pprint(get_most_used_language_by_territory(), indent=4)
# Only run the report when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment