# mindey/md_language_splitter_autodetect.py

## md_language_splitter_autodetect.py
import os
import collections
import langdetect

# Names of the entries found in langdetect's profiles directory.
# NOTE(review): presumably one profile file per supported language code --
# confirm against the installed langdetect package.  No function visible in
# this file reads this constant.
LANGUAGE_CODES = os.listdir(langdetect.PROFILES_DIRECTORY)

def detect_language(text, max_length=2):
    """Detect the language of ``text`` and return it as a short key.

    The code reported by ``langdetect.detect`` is returned unchanged when it
    already fits in ``max_length`` characters.  A longer code is first mapped
    through a small alias table (``zh-cn`` -> ``cn``, ``zh-tw`` -> ``zh``)
    and then cut down to ``max_length`` characters.

    Args:
        text: The text whose language should be detected.
        max_length: Maximum number of characters in the returned key.

    Returns:
        A language key of at most ``max_length`` characters.

    Raises:
        Anything ``langdetect.detect`` raises for ``text`` is propagated.
    """
    aliases = {'zh-cn': 'cn', 'zh-tw': 'zh'}
    code = langdetect.detect(text)
    if len(code) > max_length:
        # Fall back to the detected code itself when it has no alias: a bare
        # ``aliases.get(code)`` would give None and the slice below would
        # raise TypeError.
        code = aliases.get(code, code)
    return code[:max_length]

def _split_key(token, ends, min_key_length, max_key_length):
    """Split a leading language key off ``token`` and return ``(key, rest)``.

    Returns ``(None, token)`` when no terminator from ``ends`` closes a key
    whose length lies between ``min_key_length`` and ``max_key_length``.
    """
    best = None
    for symbol in ends:
        # Only the head of the token can hold a key, so look just far enough
        # to see a terminator that starts at index ``max_key_length``.
        pos = token[:max_key_length + len(symbol)].find(symbol)
        if min_key_length <= pos <= max_key_length:
            # Keep the earliest terminator so that a key can never contain
            # another terminator.
            if best is None or pos < best[0]:
                best = (pos, len(symbol))

    if best is None:
        return None, token

    pos, width = best
    # Skip the whole terminator, which may be longer than one character.
    return token[:pos], token[pos + width:]


def split(text, sep='.:', ends=('\n', ':'), min_key_length=2, max_key_length=2,
          autodetect=True, pargraph_sep='\n\n', markdown=False, title=False):
    """Group the ``sep``-delimited sections of ``text`` by language key.

    ``text`` is cut at every ``sep``.  A section that opens with a key of
    ``min_key_length`` to ``max_key_length`` characters followed by one of
    ``ends`` is filed under that key; sections sharing a key are concatenated
    in order of appearance.

    A section without a valid key is handled according to ``autodetect``:
    when true it is cut at ``pargraph_sep`` and every non-empty paragraph is
    filed under the key returned by ``detect_language``; when false the whole
    section is filed under ``None``.

    Args:
        text: The text to split.
        sep: Marker that introduces a section.
        ends: Terminators that may close a key.  ``ends[0]`` is written after
            each key in markdown output, ``ends[1]`` when ``title`` is true.
        min_key_length: Shortest accepted key.
        max_key_length: Longest accepted key.
        autodetect: Detect the language of sections that carry no key.
        pargraph_sep: Paragraph delimiter used during autodetection.
        markdown: Return the regrouped text as one string instead of a dict.
        title: With ``markdown``, write ``ends[1]`` after each key.

    Returns:
        An ``OrderedDict`` mapping each key to its combined text, ordered by
        first appearance; or, when ``markdown`` is true, the stripped string
        built from ``sep + key + end + text`` for every key.
    """
    # key -> list of text fragments; insertion order records first appearance.
    pieces = collections.OrderedDict()

    for token in text.split(sep):
        if not token:
            continue

        name, chunk = _split_key(token, ends, min_key_length, max_key_length)

        if name or not autodetect:
            # Explicit key, or detection switched off.
            pieces.setdefault(name, []).append(chunk)
            continue

        paragraphs = chunk.split(pargraph_sep)
        last = len(paragraphs) - 1
        for i, paragraph in enumerate(paragraphs):
            if not paragraph:
                continue

            bucket = pieces.setdefault(detect_language(paragraph), [])
            bucket.append(paragraph)
            # Put the delimiter back after every paragraph but the final one.
            if i < last:
                bucket.append(pargraph_sep)

    result = collections.OrderedDict(
        (lang, ''.join(parts)) for lang, parts in pieces.items()
    )

    if markdown:
        return ''.join(
            '{sep}{lang}{end}{text}'.format(
                sep=sep,
                lang=lang,
                end=ends[1] if title else ends[0],
                text=body,
            )
            for lang, body in result.items()
        ).strip()

    return result

def test_title():
    """Check that inline ``.:xx:`` markers give one entry per key, in order."""
    source = '.:en:hello world.:lt:smart world.:ja:今日は、世界'

    wanted = collections.OrderedDict()
    wanted['en'] = 'hello world'
    wanted['lt'] = 'smart world'
    wanted['ja'] = '今日は、世界'

    assert split(source) == wanted

def test_body():
    """Check that sections keyed on their own line are merged per key."""
    source = '''.:en
some text

which is good

.:ru
несколько текста

.:en
so want to try

.:lt
nieko sau, viskas gerai

.:cn
中文也可以的
'''

    # The two ``en`` sections are expected to end up joined under one key.
    wanted = collections.OrderedDict()
    wanted['en'] = 'some text\n\nwhich is good\n\nso want to try\n\n'
    wanted['ru'] = 'несколько текста\n\n'
    wanted['lt'] = 'nieko sau, viskas gerai\n\n'
    wanted['cn'] = '中文也可以的\n'

    assert split(source) == wanted

def test_partial_autodetect():
    """Check a text whose first section has no key while the others do."""
    # NOTE(review): the 'en' entry depends on what language detection reports
    # for the unkeyed 'hello world' -- confirm that result is stable.
    source = 'hello world.:lt:smart world.:ja:今日は、世界'

    wanted = collections.OrderedDict()
    wanted['en'] = 'hello world'
    wanted['lt'] = 'smart world'
    wanted['ja'] = '今日は、世界'

    actual = split(source)
    assert actual == wanted


def test_autodetect():
    """Check grouping of a text that carries no keys at all."""
    # NOTE(review): every expected key below depends on what language
    # detection reports for these short paragraphs -- confirm it is stable.
    source = '''some text
which is good

несколько текста

so want to try

šienpjovys džemas

中文也可以的
'''

    wanted = collections.OrderedDict()
    wanted['en'] = 'some text\nwhich is good\n\nso want to try\n\n'
    wanted['ru'] = 'несколько текста\n\n'
    wanted['lt'] = 'šienpjovys džemas\n\n'
    wanted['cn'] = '中文也可以的\n'

    actual = split(source)
    assert actual == wanted

def test_markdown():
    """Check that ``markdown=True`` returns the regrouped text as a string."""
    source = '''中文也可以的

some text
which is good

несколько текста

so want to try

šienpjovys džemas'''

    # Each key is expected on its own marker line, keys in order of first
    # appearance, with both English paragraphs under a single marker.
    wanted = '''.:cn
中文也可以的

.:en
some text
which is good

so want to try

.:ru
несколько текста

.:lt
šienpjovys džemas'''

    actual = split(source, markdown=True)
    assert actual == wanted

def test_markdown_title():
    """Check ``markdown=True, title=True`` output for a one-line text."""
    source = '''世界，你好.:lt:Sveikas, Pasauli'''
    # The unkeyed leading section is expected to come back with a ``cn`` key.
    wanted = '.:cn:世界，你好.:lt:Sveikas, Pasauli'

    actual = split(source, markdown=True, title=True)
    assert actual == wanted

if __name__ == '__main__':
    # Run every self-check in a fixed order; the first failing assertion
    # aborts the run.
    for check in (
        test_title,
        test_body,
        test_partial_autodetect,
        test_autodetect,
        test_markdown,
        test_markdown_title,
    ):
        check()
	import os
	import collections
	import langdetect

	# Names of the entries found in langdetect's profiles directory.
	# NOTE(review): presumably one profile file per supported language code --
	# confirm against the installed langdetect package.  No function visible in
	# this file reads this constant.
	LANGUAGE_CODES = os.listdir(langdetect.PROFILES_DIRECTORY)

	def detect_language(text, max_length=2):
		"""Detect the language of ``text`` and return it as a short key.

		The code reported by ``langdetect.detect`` is returned unchanged when
		it already fits in ``max_length`` characters.  A longer code is first
		mapped through a small alias table (``zh-cn`` -> ``cn``,
		``zh-tw`` -> ``zh``) and then cut down to ``max_length`` characters.

		Args:
			text: The text whose language should be detected.
			max_length: Maximum number of characters in the returned key.

		Returns:
			A language key of at most ``max_length`` characters.

		Raises:
			Anything ``langdetect.detect`` raises for ``text`` is propagated.
		"""
		aliases = {'zh-cn': 'cn', 'zh-tw': 'zh'}
		code = langdetect.detect(text)
		if len(code) > max_length:
			# Fall back to the detected code itself when it has no alias: a
			# bare ``aliases.get(code)`` would give None and the slice below
			# would raise TypeError.
			code = aliases.get(code, code)
		return code[:max_length]

	def _split_key(token, ends, min_key_length, max_key_length):
		"""Split a leading language key off ``token`` and return ``(key, rest)``.

		Returns ``(None, token)`` when no terminator from ``ends`` closes a key
		whose length lies between ``min_key_length`` and ``max_key_length``.
		"""
		best = None
		for symbol in ends:
			# Only the head of the token can hold a key, so look just far
			# enough to see a terminator starting at index ``max_key_length``.
			pos = token[:max_key_length + len(symbol)].find(symbol)
			if min_key_length <= pos <= max_key_length:
				# Keep the earliest terminator so that a key can never
				# contain another terminator.
				if best is None or pos < best[0]:
					best = (pos, len(symbol))

		if best is None:
			return None, token

		pos, width = best
		# Skip the whole terminator, which may be longer than one character.
		return token[:pos], token[pos + width:]


	def split(text, sep='.:', ends=('\n', ':'), min_key_length=2, max_key_length=2,
			autodetect=True, pargraph_sep='\n\n', markdown=False, title=False):
		"""Group the ``sep``-delimited sections of ``text`` by language key.

		``text`` is cut at every ``sep``.  A section that opens with a key of
		``min_key_length`` to ``max_key_length`` characters followed by one of
		``ends`` is filed under that key; sections sharing a key are
		concatenated in order of appearance.

		A section without a valid key is handled according to ``autodetect``:
		when true it is cut at ``pargraph_sep`` and every non-empty paragraph
		is filed under the key returned by ``detect_language``; when false the
		whole section is filed under ``None``.

		Args:
			text: The text to split.
			sep: Marker that introduces a section.
			ends: Terminators that may close a key.  ``ends[0]`` is written
				after each key in markdown output, ``ends[1]`` when ``title``
				is true.
			min_key_length: Shortest accepted key.
			max_key_length: Longest accepted key.
			autodetect: Detect the language of sections that carry no key.
			pargraph_sep: Paragraph delimiter used during autodetection.
			markdown: Return the regrouped text as one string, not a dict.
			title: With ``markdown``, write ``ends[1]`` after each key.

		Returns:
			An ``OrderedDict`` mapping each key to its combined text, ordered
			by first appearance; or, when ``markdown`` is true, the stripped
			string built from ``sep + key + end + text`` for every key.
		"""
		# key -> list of text fragments; insertion order records first
		# appearance.
		pieces = collections.OrderedDict()

		for token in text.split(sep):
			if not token:
				continue

			name, chunk = _split_key(token, ends, min_key_length, max_key_length)

			if name or not autodetect:
				# Explicit key, or detection switched off.
				pieces.setdefault(name, []).append(chunk)
				continue

			paragraphs = chunk.split(pargraph_sep)
			last = len(paragraphs) - 1
			for i, paragraph in enumerate(paragraphs):
				if not paragraph:
					continue

				bucket = pieces.setdefault(detect_language(paragraph), [])
				bucket.append(paragraph)
				# Put the delimiter back after every paragraph but the last.
				if i < last:
					bucket.append(pargraph_sep)

		result = collections.OrderedDict(
			(lang, ''.join(parts)) for lang, parts in pieces.items()
		)

		if markdown:
			return ''.join(
				'{sep}{lang}{end}{text}'.format(
					sep=sep,
					lang=lang,
					end=ends[1] if title else ends[0],
					text=body,
				)
				for lang, body in result.items()
			).strip()

		return result

	def test_title():
		"""Check that inline ``.:xx:`` markers give one entry per key, in order."""
		source = '.:en:hello world.:lt:smart world.:ja:今日は、世界'

		wanted = collections.OrderedDict()
		wanted['en'] = 'hello world'
		wanted['lt'] = 'smart world'
		wanted['ja'] = '今日は、世界'

		assert split(source) == wanted

	def test_body():
		"""Check that sections keyed on their own line are merged per key."""
		# Written with explicit newline escapes so the indentation of this
		# function can never leak into the text under test; a leading tab on
		# each line would make the input disagree with ``expect``.
		text = (
			'.:en\n'
			'some text\n'
			'\n'
			'which is good\n'
			'\n'
			'.:ru\n'
			'несколько текста\n'
			'\n'
			'.:en\n'
			'so want to try\n'
			'\n'
			'.:lt\n'
			'nieko sau, viskas gerai\n'
			'\n'
			'.:cn\n'
			'中文也可以的\n'
		)
		# The two ``en`` sections are expected to end up joined under one key.
		expect = collections.OrderedDict(
			[('en', 'some text\n\nwhich is good\n\nso want to try\n\n'),
			 ('ru', 'несколько текста\n\n'),
			 ('lt', 'nieko sau, viskas gerai\n\n'),
			 ('cn', '中文也可以的\n')]
		)

		assert split(text) == expect

	def test_partial_autodetect():
		"""Check a text whose first section has no key while the others do."""
		# NOTE(review): the 'en' entry depends on what language detection
		# reports for the unkeyed 'hello world' -- confirm it is stable.
		source = 'hello world.:lt:smart world.:ja:今日は、世界'

		wanted = collections.OrderedDict()
		wanted['en'] = 'hello world'
		wanted['lt'] = 'smart world'
		wanted['ja'] = '今日は、世界'

		actual = split(source)
		assert actual == wanted


	def test_autodetect():
		"""Check grouping of a text that carries no keys at all."""
		# NOTE(review): every expected key below depends on what language
		# detection reports for these short paragraphs -- confirm it is stable.
		# The input uses explicit newline escapes so this function's
		# indentation cannot end up inside the paragraphs being detected.
		text = (
			'some text\n'
			'which is good\n'
			'\n'
			'несколько текста\n'
			'\n'
			'so want to try\n'
			'\n'
			'šienpjovys džemas\n'
			'\n'
			'中文也可以的\n'
		)
		expect = collections.OrderedDict(
			[('en', 'some text\nwhich is good\n\nso want to try\n\n'),
			 ('ru', 'несколько текста\n\n'),
			 ('lt', 'šienpjovys džemas\n\n'),
			 ('cn', '中文也可以的\n')]
		)

		result = split(text)

		assert result == expect

	def test_markdown():
		"""Check that ``markdown=True`` returns the regrouped text as a string."""
		# Both literals use explicit newline escapes so that this function's
		# indentation cannot leak into the input or the expected output.
		text = (
			'中文也可以的\n'
			'\n'
			'some text\n'
			'which is good\n'
			'\n'
			'несколько текста\n'
			'\n'
			'so want to try\n'
			'\n'
			'šienpjovys džemas'
		)

		# Each key is expected on its own marker line, keys in order of first
		# appearance, with both English paragraphs under a single marker.
		expect = (
			'.:cn\n'
			'中文也可以的\n'
			'\n'
			'.:en\n'
			'some text\n'
			'which is good\n'
			'\n'
			'so want to try\n'
			'\n'
			'.:ru\n'
			'несколько текста\n'
			'\n'
			'.:lt\n'
			'šienpjovys džemas'
		)

		result = split(text, markdown=True)

		assert result == expect

	def test_markdown_title():
		"""Check ``markdown=True, title=True`` output for a one-line text."""
		source = '''世界，你好.:lt:Sveikas, Pasauli'''
		# The unkeyed leading section is expected to come back with a ``cn`` key.
		wanted = '.:cn:世界，你好.:lt:Sveikas, Pasauli'

		actual = split(source, markdown=True, title=True)
		assert actual == wanted

	if __name__ == '__main__':
		# Run every self-check in a fixed order; the first failing assertion
		# aborts the run.
		for check in (
			test_title,
			test_body,
			test_partial_autodetect,
			test_autodetect,
			test_markdown,
			test_markdown_title,
		):
			check()