# olbat/strpreprocess.py

## strpreprocess.py
#!/usr/bin/python3

import sys
import unicodedata
import regex  # https://pypi.python.org/pypi/regex/

# Non-letter characters that are kept by the non-letter filter, written
# pre-escaped for use inside a regex character class (hyphen, apostrophe).
SPECIAL_CHARS = r"\-'"  # FIXME: French specific

# One or more Unicode space separators (general category Zs).
RE_WHITESPACES = regex.compile(r"\p{Zs}+")
# A run of sentence-terminal characters plus any spaces around it.
RE_SENTENCE_TERMS = regex.compile(r" *\p{STerm}+ *")
# A run of anything that is neither a space, a newline, one of
# SPECIAL_CHARS, nor an alphabetic character.
RE_NON_LETTERS = regex.compile(
    r"[^ \n" + SPECIAL_CHARS + r"\p{Alphabetic}]+"
)
# A run of plain ASCII spaces.
RE_SPACES = regex.compile(r" +")


def preprocess(s, unorm='NFC'):
    '''
    Return a sanitized version of _s_ (with a final newline)

    The text is stripped, lowercased and split into one sentence per line
    (on RE_SENTENCE_TERMS); every run of characters matched by
    RE_NON_LETTERS is replaced by a single space and runs of spaces are
    compacted.

    _s_ is the text to sanitize. If it is empty or whitespace-only, the
    empty string is returned as is (without a newline).

    _unorm_ is the Unicode normalization form of the result, as accepted
    by unicodedata.normalize ('NFC', 'NFD', 'NFKC', 'NFKD'). A false value
    disables normalization entirely; an unknown form name makes
    unicodedata.normalize raise ValueError.
    '''
    s = s.strip()
    if not s:
        return s

    if unorm:
        # Compose canonically *before* filtering: in decomposed input
        # (e.g. "e" + U+0301) the combining accents are not alphabetic, so
        # RE_NON_LETTERS would turn each of them into a space and split
        # the word. The requested form is still applied at the end.
        s = unicodedata.normalize('NFC', s)

    s = s.lower()
    s = RE_WHITESPACES.sub(' ', s)  # unify whitespaces
    s = RE_SENTENCE_TERMS.sub('\n', s)  # one sentence per line
    s = RE_NON_LETTERS.sub(' ', s)  # strip non-letters/special
    s = RE_SPACES.sub(' ', s)  # compact spaces after strip

    if unorm:  # unicode normalization
        s = unicodedata.normalize(unorm, s)

    # Every substitution above inserts at least one character, so s is
    # never empty here; endswith() just avoids relying on that.
    return s if s.endswith('\n') else s + '\n'


if __name__ == '__main__':
    # Filter mode: sanitize standard input line by line and write the
    # result to standard output. Lines for which preprocess() returns an
    # empty string (blank input lines) produce no output.
    for line in sys.stdin:
        cleaned = preprocess(line)
        if cleaned:
            sys.stdout.write(cleaned)