Bouke/iana_extractor.py

## iana_extractor.py
"""
Parses the language tags provided by IANA_

For a definition of the tag types, see
http://www.w3.org/International/questions/qa-choosing-language-tags

.. _IANA: http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
"""
from collections import OrderedDict
from pprint import pprint

from urllib.request import urlopen

# Address of the IANA language subtag registry downloaded by this script.
URL = (
    "http://www.iana.org/assignments/language-subtag-registry/"
    "language-subtag-registry"
)


def string_lines(iterator):
    """
    Decode raw registry lines and unfold continuation lines.

    :param iterator: iterable of ``bytes`` objects, one per physical line
        (for example the response object returned by ``urlopen``).
    :return: generator of ``str`` logical lines without line endings. A
        physical line starting with two spaces is appended to the line
        before it; empty lines are not yielded.
    """
    pending = None
    for raw in iterator:
        # Strip CR as well as LF so CRLF input still produces a bare '%%'.
        line = raw.decode(encoding='UTF-8').rstrip('\r\n')
        # Only fold when there is a line to fold into: a continuation at the
        # very start of the input used to raise TypeError (None += str) and
        # is now kept as an ordinary line instead.
        if pending is not None and line.startswith('  '):
            # Drop one leading space, keep the other as the word separator.
            pending += line[1:]
            continue
        if pending:
            yield pending
        pending = line
    if pending:
        yield pending


def locale_chunks(iterator):
    """
    Group logical lines into records separated by ``%%`` lines.

    :param iterator: iterable of ``str`` lines.
    :return: generator of records; each record is a list of
        ``[key, value]`` lists obtained by splitting a line at its first
        ``': '``. Lines without that separator are skipped.
    """
    data = []
    for line in iterator:
        if line == '%%':
            yield data
            data = []
            continue
        # partition() never raises and splits at the first separator only,
        # so values that contain ': ' themselves are kept intact.
        key, separator, value = line.partition(': ')
        if separator:
            data.append([key, value])
    # '%%' is a separator: the lines after the last one form a record too.
    if data:
        yield data


def locale_dict(iterator):
    """
    Convert each record of ``(key, value)`` pairs into a dictionary.

    When a key occurs more than once in a record, the last value wins.
    """
    for pairs in iterator:
        record = {}
        for field, content in pairs:
            record[field] = content
        yield record


def language_filter(iterator):
    """
    Yield ``(tag, description)`` pairs for the language-like records.

    Records of type ``language`` and ``extlang`` are identified by their
    ``Subtag`` field, records of type ``redundant`` by their ``Tag`` field.
    Records of any other type, or without a ``Type``, are skipped.
    """
    for record in iterator:
        kind = record.get('Type')
        if kind in ('language', 'extlang'):
            tag_field = 'Subtag'
        elif kind == 'redundant':
            tag_field = 'Tag'
        else:
            continue
        yield record[tag_field], record['Description']


# Download the registry and run it through the parsing pipeline. The response
# is used as a context manager so the connection is closed once consumed.
with urlopen(URL) as response:
    language_iterator = language_filter(locale_dict(locale_chunks(
        string_lines(response))))
    # ``key`` belongs to sorted(), not OrderedDict(): passed to OrderedDict it
    # was stored as a bogus 'key' -> <lambda> entry and never used for sorting.
    LANGUAGES = OrderedDict(
        sorted(language_iterator, key=lambda item: item[0]))

pprint(LANGUAGES)
	"""
	Parses the language tags provided by IANA_

	For a definition of the tag types, see
	http://www.w3.org/International/questions/qa-choosing-language-tags

	.. _IANA: http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
	"""
	from collections import OrderedDict
	from pprint import pprint

	from urllib.request import urlopen

	# Address of the IANA language subtag registry downloaded by this script.
	URL = (
		"http://www.iana.org/assignments/language-subtag-registry/"
		"language-subtag-registry"
	)


	def string_lines(iterator):
		"""
		Decode raw registry lines and unfold continuation lines.

		:param iterator: iterable of ``bytes`` objects, one per physical line.
		:return: generator of ``str`` logical lines without line endings. A
			physical line starting with two spaces is appended to the line
			before it; empty lines are not yielded.
		"""
		pending = None
		for raw in iterator:
			# Strip CR as well as LF so CRLF input still produces a bare '%%'.
			line = raw.decode(encoding='UTF-8').rstrip('\r\n')
			# Only fold when there is a line to fold into: a continuation at
			# the very start of the input would otherwise raise TypeError
			# (None += str); it is kept as an ordinary line instead.
			if pending is not None and line.startswith('  '):
				# Drop one leading space, keep the other as the separator.
				pending += line[1:]
				continue
			if pending:
				yield pending
			pending = line
		if pending:
			yield pending


	def locale_chunks(iterator):
		"""
		Group logical lines into records separated by ``%%`` lines.

		:param iterator: iterable of ``str`` lines.
		:return: generator of records; each record is a list of
			``[key, value]`` lists obtained by splitting a line at its first
			``': '``. Lines without that separator are skipped.
		"""
		data = []
		for line in iterator:
			if line == '%%':
				yield data
				data = []
				continue
			# partition() never raises and splits at the first separator
			# only, so values that contain ': ' themselves are kept intact.
			key, separator, value = line.partition(': ')
			if separator:
				data.append([key, value])
		# '%%' is a separator: the lines after the last one form a record too.
		if data:
			yield data


	def locale_dict(iterator):
		"""
		Convert each record of ``(key, value)`` pairs into a dictionary.

		When a key occurs more than once in a record, the last value wins.
		"""
		for data in iterator:
			yield {key: value for key, value in data}


	def language_filter(iterator):
		"""
		Yield ``(tag, description)`` pairs for the language-like records.

		Records of type ``language`` and ``extlang`` are identified by their
		``Subtag`` field, records of type ``redundant`` by their ``Tag``
		field. Records of any other type, or without a ``Type``, are skipped.
		"""
		for locale in iterator:
			# Named ``kind`` so the builtin ``type`` is not shadowed.
			kind = locale.get('Type')
			if kind in ('language', 'extlang'):
				yield locale['Subtag'], locale['Description']
			elif kind == 'redundant':
				yield locale['Tag'], locale['Description']


	# Download the registry and run it through the parsing pipeline. The
	# response is used as a context manager so the connection is closed once
	# it has been consumed.
	with urlopen(URL) as response:
		language_iterator = language_filter(locale_dict(locale_chunks(
			string_lines(response))))
		# ``key`` belongs to sorted(), not OrderedDict(): passed to
		# OrderedDict it was stored as a bogus 'key' -> <lambda> entry and
		# never used for sorting.
		LANGUAGES = OrderedDict(
			sorted(language_iterator, key=lambda item: item[0]))

	pprint(LANGUAGES)