# pypt/land_id.py

## land_id.py
import logging
from typing import Optional
from urllib.parse import urlparse

import cld2

# Configure the root logger at import time so that records of level DEBUG and above are emitted.
logging.basicConfig(level=logging.DEBUG)

# Placeholder identifier in "language-REGION" form; it is not referenced by the functions visible in this file.
# NOTE(review): presumably used by callers when no language could be determined -- confirm against callers.
UNKNOWN_LANGUAGE_CODE = 'un-UN'


def language_code_for_text(text: str) -> Optional[str]:
    """
    Detect the most likely language of a piece of text with CLD2.

    https://pypi.org/project/cld2-cffi/

    An unreliable guess is logged but still returned; only a failed detection or an empty candidate list yields None.

    :param text: Text to analyse; must be neither None nor empty (enforced with assertions).
    :return: Language code of the top-ranked guess, e.g. "en", or None if language couldn't be determined.
    """
    assert text is not None, "Text is None."
    assert len(text), "Text is empty."

    try:
        detection = cld2.detect(text)
        reliable, _bytes_found, candidates = detection
    except Exception as ex:
        logging.warning(f"Unable to determine language for text '{text[0:40]}...': {ex}")
        return None

    if not reliable:
        # The guess is kept even when the detector flags it as unreliable; the warning is purely informational.
        logging.warning(f"Language guess is not reliable for text '{text[0:40]}...'")

    if len(candidates) > 0:
        # Candidates are taken in the order the detector returned them; the first one is used as the best guess.
        return candidates[0].language_code

    logging.warning(f"Language could not be guessed for text '{text[0:40]}...'")
    return None


def country_tld_from_url(url: str) -> Optional[str]:
    """
    Extract country TLD from URL.

    The "TLD" is simply the last dot-separated label of the URL's hostname; it is not checked against a list of
    country codes, so e.g. "com" is returned for "https://example.com/".

    :param url: URL, e.g. "https://www.bbc.co.uk/news/politics/eu-regions/vote2014_sitemap.xml".
    :return: TLD of URL without the prefix period, e.g. "uk", or None if the URL is empty, can't be parsed,
        has no hostname (e.g. because the scheme is missing), or the hostname has no TLD.
    """
    if not url:
        return None

    try:
        hostname = urlparse(url).hostname
    except Exception as ex:
        logging.warning(f"Unable to parse URL {url}: {ex}")
        return None

    # urlparse() reports no hostname for scheme-less input such as "www.bbc.co.uk/news"; without this guard the
    # split below would raise AttributeError on None.
    if not hostname:
        logging.warning(f"No hostname found in URL {url}")
        return None

    # A fully qualified hostname may end with the root dot ("example.com."), which would otherwise produce an
    # empty string as the last label.
    hostname_parts = hostname.rstrip('.').split('.')

    if len(hostname_parts) < 2:
        logging.warning(f"No TLD found in URL {url}")
        return None

    return hostname_parts[-1]


# Languages that always map to the same BCP-47 identifier, regardless of the URL hint. Keys are lowercase.
_SINGLE_LOCALE_BCP_47_IDENTIFIERS = {
    # Language code == country code
    'de': 'de-DE',
    'hr': 'hr-HR',
    'is': 'is-IS',
    'it': 'it-IT',
    'lv': 'lv-LV',
    'lt': 'lt-LT',
    'hu': 'hu-HU',
    'nl': 'nl-NL',
    'pl': 'pl-PL',
    'ro': 'ro-RO',
    'sk': 'sk-SK',
    'fi': 'fi-FI',
    'tr': 'tr-TR',
    'bg': 'bg-BG',
    'ru': 'ru-RU',
    'th': 'th-TH',

    # Slovenian is paired with Slovenia ("SI"); "SL" would be Sierra Leone.
    'sl': 'sl-SI',

    # Languages in India ("kn" is Kannada; "gn" would be Guarani, which is not an Indian language)
    'gu': 'gu-IN',
    'kn': 'kn-IN',
    'ml': 'ml-IN',
    'mr': 'mr-IN',
    'te': 'te-IN',
    'hi': 'hi-IN',

    # Everything else
    'af': 'af-ZA',
    'am': 'am-ET',
    'hy': 'hy-AM',
    'az': 'az-AZ',
    'id': 'id-ID',
    'ms': 'ms-MY',
    'ca': 'ca-ES',
    'cs': 'cs-CZ',
    'da': 'da-DK',
    'eu': 'eu-ES',
    'fil': 'fil-PH',
    'gl': 'gl-ES',
    'ka': 'ka-GE',
    'zu': 'zu-ZA',
    'jv': 'jv-ID',
    'km': 'km-KH',
    'lo': 'lo-LA',
    'ne': 'ne-NP',
    'nb': 'nb-NO',
    'si': 'si-LK',
    'su': 'su-ID',
    'sv': 'sv-SE',
    'vi': 'vi-VN',
    'el': 'el-GR',
    'sr': 'sr-RS',
    'uk': 'uk-UA',
    'he': 'he-IL',
    'ko': 'ko-KR',
    'ja': 'ja-JP',
}


def _tld_variants(language, tlds):
    """Map every TLD in `tlds` to the identifier "<language>-<TLD in upper case>"."""
    return {tld: f'{language}-{tld.upper()}' for tld in tlds}


# Chinese variants are shared between several input codes, so they are defined once.
_SIMPLIFIED_CHINESE_VARIANTS = ({'hk': 'zh-HK'}, 'zh')
_TRADITIONAL_CHINESE_VARIANTS = ({'tw': 'zh-TW'}, 'yue-Hant-HK')

# Languages whose identifier depends on the country TLD of the URL hint.
# Each value is a pair: ({country TLD: identifier}, fallback identifier used when the TLD is unknown or unlisted).
# Keys are lowercase because the input code is lowercased before the lookup.
_TLD_DEPENDENT_BCP_47_IDENTIFIERS = {
    'bn': ({'in': 'bn-IN'}, 'bn-BD'),
    'en': (
        {
            # The UK's TLD differs from its region code
            'uk': 'en-GB',
            **_tld_variants('en', ('au', 'ca', 'gh', 'in', 'ie', 'ke', 'nz', 'ng', 'ph', 'sg', 'za', 'tz')),
        },
        'en-US',
    ),
    'es': (
        _tld_variants('es', (
            'ar', 'bo', 'cl', 'co', 'cr', 'ec', 'sv', 'es', 'us', 'gt',
            'hn', 'mx', 'ni', 'pa', 'py', 'pe', 'pr', 'do', 'uy', 've',
        )),
        'es-ES',
    ),
    'fr': ({'ca': 'fr-CA'}, 'fr-FR'),
    'pt': ({'br': 'pt-BR'}, 'pt-PT'),
    'sw': ({'tz': 'sw-TZ'}, 'sw-KE'),
    'ta': (_tld_variants('ta', ('sg', 'lk', 'my')), 'ta-IN'),
    # Fallback -- more Urdu speakers in India than Pakistan
    'ur': ({'pk': 'ur-PK'}, 'ur-IN'),
    # Fallback -- Egyptian Arabic is the most popular dialect
    'ar': (
        _tld_variants('ar', (
            'il', 'jo', 'ae', 'bh', 'dz', 'sa', 'iq', 'kw', 'ma', 'tn', 'om', 'ps', 'qa', 'lb',
        )),
        'ar-EG',
    ),
    # Chinese (simplified)
    'zh': _SIMPLIFIED_CHINESE_VARIANTS,
    'zh-hans': _SIMPLIFIED_CHINESE_VARIANTS,
    # Chinese (traditional)
    'yue': _TRADITIONAL_CHINESE_VARIANTS,
    'zh-hant': _TRADITIONAL_CHINESE_VARIANTS,
}


def iso_639_1_code_to_bcp_47_identifier(iso_639_1_code: str, url_hint: Optional[str] = None) -> Optional[str]:
    """
    Convert ISO 639-1 language code to BCP-47 identifier.

    Google Cloud requires for us to pass the language as a BCP-47 identifier:

    https://cloud.google.com/speech-to-text/docs/languages

    so we have to do some guessing about the dialect the audio data is going to be in.

    The code is matched case-insensitively, so script-tagged codes such as "zh-Hans" and "zh-Hant" are recognised
    too. The URL hint is only consulted for languages that have more than one regional variant.

    :param iso_639_1_code: ISO 639-1 language code, e.g. "en".
    :param url_hint: Optional URL hint to use for guessing the dialect used.
    :return: BCP-47 identifier, e.g. "en-US", or None if the identifier can't be determined.
    """

    if not iso_639_1_code:
        logging.warning("ISO 639-1 code is unset.")
        return None

    language = iso_639_1_code.lower()

    identifier = _SINGLE_LOCALE_BCP_47_IDENTIFIERS.get(language)
    if identifier is not None:
        return identifier

    variants = _TLD_DEPENDENT_BCP_47_IDENTIFIERS.get(language)
    if variants is None:
        # Unsupported language
        return None

    identifiers_by_tld, fallback = variants

    # Parse the URL only now that the TLD can actually influence the result.
    tld = country_tld_from_url(url_hint) if url_hint else None

    return identifiers_by_tld.get(tld, fallback)
	import logging
	from typing import Optional
	from urllib.parse import urlparse

	import cld2

	# Configure the root logger at import time so that records of level DEBUG and above are emitted.
	logging.basicConfig(level=logging.DEBUG)

	# Placeholder identifier in "language-REGION" form; it is not referenced by the functions visible in this file.
	# NOTE(review): presumably used by callers when no language could be determined -- confirm against callers.
	UNKNOWN_LANGUAGE_CODE = 'un-UN'


	def language_code_for_text(text: str) -> Optional[str]:
	    """
	    Detect the most likely language of a piece of text with CLD2.

	    https://pypi.org/project/cld2-cffi/

	    An unreliable guess is logged but still returned; only a failed detection or an empty candidate list yields None.

	    :param text: Text to analyse; must be neither None nor empty (enforced with assertions).
	    :return: Language code of the top-ranked guess, e.g. "en", or None if language couldn't be determined.
	    """
	    assert text is not None, "Text is None."
	    assert len(text), "Text is empty."

	    try:
	        is_reliable, text_bytes_found, details = cld2.detect(text)
	    except Exception as ex:
	        logging.warning(f"Unable to determine language for text '{text[0:40]}...': {ex}")
	        return None

	    if not is_reliable:
	        # The guess is kept even when the detector flags it as unreliable; the warning is purely informational.
	        logging.warning(f"Language guess is not reliable for text '{text[0:40]}...'")

	    if len(details) == 0:
	        logging.warning(f"Language could not be guessed for text '{text[0:40]}...'")
	        return None

	    # Candidates are taken in the order the detector returned them; the first one is used as the best guess.
	    best_guess = details[0]
	    return best_guess.language_code


	def country_tld_from_url(url: str) -> Optional[str]:
	    """
	    Extract country TLD from URL.

	    The "TLD" is simply the last dot-separated label of the URL's hostname; it is not checked against a list of
	    country codes, so e.g. "com" is returned for "https://example.com/".

	    :param url: URL, e.g. "https://www.bbc.co.uk/news/politics/eu-regions/vote2014_sitemap.xml".
	    :return: TLD of URL without the prefix period, e.g. "uk", or None if the URL is empty, can't be parsed,
	        has no hostname (e.g. because the scheme is missing), or the hostname has no TLD.
	    """
	    if not url:
	        return None

	    try:
	        hostname = urlparse(url).hostname
	    except Exception as ex:
	        logging.warning(f"Unable to parse URL {url}: {ex}")
	        return None

	    # urlparse() reports no hostname for scheme-less input such as "www.bbc.co.uk/news"; without this guard the
	    # split below would raise AttributeError on None.
	    if not hostname:
	        logging.warning(f"No hostname found in URL {url}")
	        return None

	    # A fully qualified hostname may end with the root dot ("example.com."), which would otherwise produce an
	    # empty string as the last label.
	    hostname_parts = hostname.rstrip('.').split('.')

	    if len(hostname_parts) < 2:
	        logging.warning(f"No TLD found in URL {url}")
	        return None

	    return hostname_parts[-1]


	# Languages that always map to the same BCP-47 identifier, regardless of the URL hint. Keys are lowercase.
	_SINGLE_LOCALE_BCP_47_IDENTIFIERS = {
	    # Language code == country code
	    'de': 'de-DE',
	    'hr': 'hr-HR',
	    'is': 'is-IS',
	    'it': 'it-IT',
	    'lv': 'lv-LV',
	    'lt': 'lt-LT',
	    'hu': 'hu-HU',
	    'nl': 'nl-NL',
	    'pl': 'pl-PL',
	    'ro': 'ro-RO',
	    'sk': 'sk-SK',
	    'fi': 'fi-FI',
	    'tr': 'tr-TR',
	    'bg': 'bg-BG',
	    'ru': 'ru-RU',
	    'th': 'th-TH',

	    # Slovenian is paired with Slovenia ("SI"); "SL" would be Sierra Leone.
	    'sl': 'sl-SI',

	    # Languages in India ("kn" is Kannada; "gn" would be Guarani, which is not an Indian language)
	    'gu': 'gu-IN',
	    'kn': 'kn-IN',
	    'ml': 'ml-IN',
	    'mr': 'mr-IN',
	    'te': 'te-IN',
	    'hi': 'hi-IN',

	    # Everything else
	    'af': 'af-ZA',
	    'am': 'am-ET',
	    'hy': 'hy-AM',
	    'az': 'az-AZ',
	    'id': 'id-ID',
	    'ms': 'ms-MY',
	    'ca': 'ca-ES',
	    'cs': 'cs-CZ',
	    'da': 'da-DK',
	    'eu': 'eu-ES',
	    'fil': 'fil-PH',
	    'gl': 'gl-ES',
	    'ka': 'ka-GE',
	    'zu': 'zu-ZA',
	    'jv': 'jv-ID',
	    'km': 'km-KH',
	    'lo': 'lo-LA',
	    'ne': 'ne-NP',
	    'nb': 'nb-NO',
	    'si': 'si-LK',
	    'su': 'su-ID',
	    'sv': 'sv-SE',
	    'vi': 'vi-VN',
	    'el': 'el-GR',
	    'sr': 'sr-RS',
	    'uk': 'uk-UA',
	    'he': 'he-IL',
	    'ko': 'ko-KR',
	    'ja': 'ja-JP',
	}


	def _tld_variants(language, tlds):
	    """Map every TLD in `tlds` to the identifier "<language>-<TLD in upper case>"."""
	    return {tld: f'{language}-{tld.upper()}' for tld in tlds}


	# Chinese variants are shared between several input codes, so they are defined once.
	_SIMPLIFIED_CHINESE_VARIANTS = ({'hk': 'zh-HK'}, 'zh')
	_TRADITIONAL_CHINESE_VARIANTS = ({'tw': 'zh-TW'}, 'yue-Hant-HK')

	# Languages whose identifier depends on the country TLD of the URL hint.
	# Each value is a pair: ({country TLD: identifier}, fallback identifier used when the TLD is unknown or unlisted).
	# Keys are lowercase because the input code is lowercased before the lookup.
	_TLD_DEPENDENT_BCP_47_IDENTIFIERS = {
	    'bn': ({'in': 'bn-IN'}, 'bn-BD'),
	    'en': (
	        {
	            # The UK's TLD differs from its region code
	            'uk': 'en-GB',
	            **_tld_variants('en', ('au', 'ca', 'gh', 'in', 'ie', 'ke', 'nz', 'ng', 'ph', 'sg', 'za', 'tz')),
	        },
	        'en-US',
	    ),
	    'es': (
	        _tld_variants('es', (
	            'ar', 'bo', 'cl', 'co', 'cr', 'ec', 'sv', 'es', 'us', 'gt',
	            'hn', 'mx', 'ni', 'pa', 'py', 'pe', 'pr', 'do', 'uy', 've',
	        )),
	        'es-ES',
	    ),
	    'fr': ({'ca': 'fr-CA'}, 'fr-FR'),
	    'pt': ({'br': 'pt-BR'}, 'pt-PT'),
	    'sw': ({'tz': 'sw-TZ'}, 'sw-KE'),
	    'ta': (_tld_variants('ta', ('sg', 'lk', 'my')), 'ta-IN'),
	    # Fallback -- more Urdu speakers in India than Pakistan
	    'ur': ({'pk': 'ur-PK'}, 'ur-IN'),
	    # Fallback -- Egyptian Arabic is the most popular dialect
	    'ar': (
	        _tld_variants('ar', (
	            'il', 'jo', 'ae', 'bh', 'dz', 'sa', 'iq', 'kw', 'ma', 'tn', 'om', 'ps', 'qa', 'lb',
	        )),
	        'ar-EG',
	    ),
	    # Chinese (simplified)
	    'zh': _SIMPLIFIED_CHINESE_VARIANTS,
	    'zh-hans': _SIMPLIFIED_CHINESE_VARIANTS,
	    # Chinese (traditional)
	    'yue': _TRADITIONAL_CHINESE_VARIANTS,
	    'zh-hant': _TRADITIONAL_CHINESE_VARIANTS,
	}


	def iso_639_1_code_to_bcp_47_identifier(iso_639_1_code: str, url_hint: Optional[str] = None) -> Optional[str]:
	    """
	    Convert ISO 639-1 language code to BCP-47 identifier.

	    Google Cloud requires for us to pass the language as a BCP-47 identifier:

	    https://cloud.google.com/speech-to-text/docs/languages

	    so we have to do some guessing about the dialect the audio data is going to be in.

	    The code is matched case-insensitively, so script-tagged codes such as "zh-Hans" and "zh-Hant" are recognised
	    too. The URL hint is only consulted for languages that have more than one regional variant.

	    :param iso_639_1_code: ISO 639-1 language code, e.g. "en".
	    :param url_hint: Optional URL hint to use for guessing the dialect used.
	    :return: BCP-47 identifier, e.g. "en-US", or None if the identifier can't be determined.
	    """

	    if not iso_639_1_code:
	        logging.warning("ISO 639-1 code is unset.")
	        return None

	    language = iso_639_1_code.lower()

	    identifier = _SINGLE_LOCALE_BCP_47_IDENTIFIERS.get(language)
	    if identifier is not None:
	        return identifier

	    variants = _TLD_DEPENDENT_BCP_47_IDENTIFIERS.get(language)
	    if variants is None:
	        # Unsupported language
	        return None

	    identifiers_by_tld, fallback = variants

	    # Parse the URL only now that the TLD can actually influence the result.
	    tld = country_tld_from_url(url_hint) if url_hint else None

	    return identifiers_by_tld.get(tld, fallback)