JordanReiter/split_by_language.py

## split_by_language.py
'''
requires langdetect, available on pypi (pip install langdetect)
https://github.com/Mimino666/langdetect
'''
import langdetect


def split_by_language(content, delimiter='\n', joiner='\n',
                      languages=None, fail_silently=True):
    '''
    Split a text written in 2 or more languages into one section per
    language.

    The content is cut into chunks on the delimiter, each chunk is
    passed to langdetect.detect, and consecutive chunks are collected
    under the language they were detected as. Chunks seen before any
    language could be identified are attached to the first identified
    language.

    Parameters:
        content: the text to split.
        delimiter: a string, or a regex-like object exposing
            split(content). When it is a string it is also used to
            re-join the chunks, and any value given for joiner is
            ignored.
        joiner: string used to re-join the chunks of a section when
            the delimiter is not a string.
        languages: optional sequence of accepted language codes. A
            chunk detected as any other language is assumed to be in
            the previous chunk's language. If no chunk at all could be
            identified, languages[0] is reported as the language of
            the whole text.
        fail_silently: when true (the default), a chunk that cannot be
            identified, that is not in languages, or that goes back to
            a language already seen earlier is folded into the
            previous chunk's language. When false, those cases raise.

    Returns:
        A tuple (primary, first_lang, sections): the joined and
        stripped text of the first identified language, that
        language's code (None if nothing was identified and no
        languages were given), and a dict mapping every language code
        to its joined and stripped text.

    Raises:
        langdetect.lang_detect_exception.LangDetectException: if
            detection fails on a chunk and fail_silently is false.
        ValueError: if fail_silently is false and a chunk is in a
            language outside languages, or a language shows up again
            after a different language took over.
    '''
    try:
        chunks = content.split(delimiter)
    except TypeError:
        # must not be a string, so must be a regex! The caller's
        # joiner is kept in that case.
        chunks = delimiter.split(content)
    else:
        joiner = delimiter

    first_lang = None
    last_lang = None
    lang_sections = {}
    for chunk in chunks:
        try:
            current_lang = langdetect.detect(chunk)
        except langdetect.lang_detect_exception.LangDetectException:
            if not fail_silently:
                raise
            current_lang = last_lang
        if languages and current_lang not in languages:
            if not fail_silently:
                raise ValueError("Invalid language: {}".format(current_lang))
            current_lang = last_lang
        if not first_lang:
            first_lang = current_lang
        lang_sections.setdefault(current_lang, [])
        if current_lang != last_lang:
            if lang_sections[current_lang]:
                # language changed but this language already has content
                if not fail_silently:
                    raise ValueError(
                        "Language {} found in different locations".format(
                            current_lang
                        )
                    )
                # assume language didn't change & is just a detection
                # error; the running section simply continues, so none
                # of the section-switch bookkeeping below applies
                # (otherwise a stray '' would land in its middle)
                current_lang = last_lang
            else:
                if None in lang_sections:
                    # there was an unidentified language, so put it here
                    lang_sections[current_lang] += lang_sections.pop(None)
                if last_lang in lang_sections:
                    # add empty element so the finished section ends
                    # with joiner
                    lang_sections[last_lang].append('')
        lang_sections[current_lang].append(chunk)
        last_lang = current_lang

    # first_lang is always a key here: it is either the first identified
    # language or None, the bucket holding every unidentified chunk.
    primary = joiner.join(lang_sections[first_lang]).strip()
    if languages and not first_lang:
        # nothing was identified: file everything under the first
        # accepted language
        first_lang = languages[0]
        lang_sections[first_lang] = lang_sections.pop(None, None) or []
    return (
        primary,
        first_lang,
        {kk: joiner.join(vv).strip() for kk, vv in lang_sections.items()}
    )
	'''
	requires langdetect, available on pypi (pip install langdetect)
	https://github.com/Mimino666/langdetect
	'''
	import langdetect


	def split_by_language(content, delimiter='\n', joiner='\n',
	                      languages=None, fail_silently=True):
	    '''
	    Split a text written in 2 or more languages into one section per
	    language.

	    The content is cut into chunks on the delimiter, each chunk is
	    passed to langdetect.detect, and consecutive chunks are collected
	    under the language they were detected as. Chunks seen before any
	    language could be identified are attached to the first identified
	    language.

	    Parameters:
	        content: the text to split.
	        delimiter: a string, or a regex-like object exposing
	            split(content). When it is a string it is also used to
	            re-join the chunks, and any value given for joiner is
	            ignored.
	        joiner: string used to re-join the chunks of a section when
	            the delimiter is not a string.
	        languages: optional sequence of accepted language codes. A
	            chunk detected as any other language is assumed to be in
	            the previous chunk's language. If no chunk at all could be
	            identified, languages[0] is reported as the language of
	            the whole text.
	        fail_silently: when true (the default), a chunk that cannot be
	            identified, that is not in languages, or that goes back to
	            a language already seen earlier is folded into the
	            previous chunk's language. When false, those cases raise.

	    Returns:
	        A tuple (primary, first_lang, sections): the joined and
	        stripped text of the first identified language, that
	        language's code (None if nothing was identified and no
	        languages were given), and a dict mapping every language code
	        to its joined and stripped text.

	    Raises:
	        langdetect.lang_detect_exception.LangDetectException: if
	            detection fails on a chunk and fail_silently is false.
	        ValueError: if fail_silently is false and a chunk is in a
	            language outside languages, or a language shows up again
	            after a different language took over.
	    '''
	    try:
	        chunks = content.split(delimiter)
	    except TypeError:
	        # must not be a string, so must be a regex! The caller's
	        # joiner is kept in that case.
	        chunks = delimiter.split(content)
	    else:
	        joiner = delimiter

	    first_lang = None
	    last_lang = None
	    lang_sections = {}
	    for chunk in chunks:
	        try:
	            current_lang = langdetect.detect(chunk)
	        except langdetect.lang_detect_exception.LangDetectException:
	            if not fail_silently:
	                raise
	            current_lang = last_lang
	        if languages and current_lang not in languages:
	            if not fail_silently:
	                raise ValueError("Invalid language: {}".format(current_lang))
	            current_lang = last_lang
	        if not first_lang:
	            first_lang = current_lang
	        lang_sections.setdefault(current_lang, [])
	        if current_lang != last_lang:
	            if lang_sections[current_lang]:
	                # language changed but this language already has content
	                if not fail_silently:
	                    raise ValueError(
	                        "Language {} found in different locations".format(
	                            current_lang
	                        )
	                    )
	                # assume language didn't change & is just a detection
	                # error; the running section simply continues, so none
	                # of the section-switch bookkeeping below applies
	                # (otherwise a stray '' would land in its middle)
	                current_lang = last_lang
	            else:
	                if None in lang_sections:
	                    # there was an unidentified language, so put it here
	                    lang_sections[current_lang] += lang_sections.pop(None)
	                if last_lang in lang_sections:
	                    # add empty element so the finished section ends
	                    # with joiner
	                    lang_sections[last_lang].append('')
	        lang_sections[current_lang].append(chunk)
	        last_lang = current_lang

	    # first_lang is always a key here: it is either the first identified
	    # language or None, the bucket holding every unidentified chunk.
	    primary = joiner.join(lang_sections[first_lang]).strip()
	    if languages and not first_lang:
	        # nothing was identified: file everything under the first
	        # accepted language
	        first_lang = languages[0]
	        lang_sections[first_lang] = lang_sections.pop(None, None) or []
	    return (
	        primary,
	        first_lang,
	        {kk: joiner.join(vv).strip() for kk, vv in lang_sections.items()}
	    )