Skip to content

Instantly share code, notes, and snippets.

@JordanReiter
Created May 9, 2019 17:13
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save JordanReiter/5f63c43d542ad0e6a78f17a733e5626a to your computer and use it in GitHub Desktop.
Save JordanReiter/5f63c43d542ad0e6a78f17a733e5626a to your computer and use it in GitHub Desktop.
Given a block with mixed languages, split into individual sections by language
'''
requires langdetect, available on pypi (pip install langdetect)
https://github.com/Mimino666/langdetect
'''
import langdetect
def split_by_language(content, delimiter='\n', joiner='\n',
                      languages=None, fail_silently=True):
    '''
    Split a text written in two or more languages into one section
    per language.

    The text is cut into chunks, the language of every chunk is
    detected with langdetect.detect(), and consecutive chunks are
    collected under the language they were detected as.

    Parameters:
        content: the text to split.
        delimiter: either a plain string or a compiled regex. A
            string is passed to content.split() and is also used to
            re-join the chunks (any value given for joiner is then
            ignored). A compiled regex is used through its own
            split() method and joiner re-joins the chunks.
        joiner: string used to re-join chunks when delimiter is a
            regex.
        languages: optional collection of accepted language codes.
            A chunk detected as any other language is treated as a
            continuation of the previous language (or as unidentified
            if no language has been seen yet). If no language is
            identified at all, languages[0] is reported as the first
            language.
        fail_silently: when True (the default) detection problems are
            absorbed by falling back to the previous language; when
            False they raise (see below).

    Returns:
        A tuple (primary, first_lang, sections) where
        primary is the stripped text of the first identified
        language, first_lang is that language's code, and sections
        is a dict mapping each language code to its stripped text.

    Raises (only when fail_silently is False):
        langdetect.lang_detect_exception.LangDetectException: a
            chunk's language could not be detected.
        ValueError: a chunk is in a language outside languages, or a
            language reappears after another language interrupted it.
    '''
    try:
        chunks = content.split(delimiter)
        # a plain-string delimiter doubles as the joiner
        joiner = delimiter
    except TypeError:
        # must not be a string, so must be a regex!
        chunks = delimiter.split(content)

    first_lang = None
    last_lang = None
    lang_sections = {}
    for chunk in chunks:
        try:
            current_lang = langdetect.detect(chunk)
        except langdetect.lang_detect_exception.LangDetectException:
            if not fail_silently:
                raise
            current_lang = last_lang
        if languages and current_lang not in languages:
            if not fail_silently:
                raise ValueError("Invalid language: {}".format(current_lang))
            current_lang = last_lang
        if not first_lang:
            first_lang = current_lang
        lang_sections.setdefault(current_lang, [])
        if current_lang != last_lang:
            if lang_sections[current_lang]:
                # language changed but this language already has content
                if not fail_silently:
                    raise ValueError(
                        "Language {} found in different locations".format(
                            current_lang
                        )
                    )
                # assume language didn't change & is just a detection
                # error: the chunk simply continues the running section,
                # so that section must NOT be closed with an empty element
                # (doing so would put a doubled delimiter in its middle)
                current_lang = last_lang
            else:
                # genuine switch to a language not seen before
                if None in lang_sections:
                    # there was an unidentified language, so put it here
                    lang_sections[current_lang] += lang_sections.pop(None)
                if last_lang in lang_sections:
                    # add empty element so it ends with joiner
                    lang_sections[last_lang].append('')
        lang_sections[current_lang].append(chunk)
        last_lang = current_lang

    # computed before the languages[0] fallback below, so when nothing was
    # identified this is the text collected under the None key
    primary = joiner.join(lang_sections.get(first_lang, [])).strip()
    if languages and not first_lang:
        # nothing was identified: attribute everything to the first
        # accepted language
        first_lang = languages[0]
        lang_sections[first_lang] = lang_sections.pop(None, None) or []
    return (
        primary,
        first_lang,
        {kk: joiner.join(vv).strip() for kk, vv in lang_sections.items()}
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment