Skip to content

Instantly share code, notes, and snippets.

@olbat olbat/
Last active Mar 20, 2018

What would you like to do?
# Python string pre-processing for ML
import sys
import unicodedata
import regex #
# Extra characters kept alongside letters. The string is spliced into a regex
# character class below, which is why the hyphen is escaped.
SPECIAL_CHARS = r"\-'" # FIXME: French specific

# One or more Unicode space separators (general category Zs).
RE_WHITESPACES = regex.compile(r"\p{Zs}+")

# One or more sentence-terminal characters, with any plain spaces around them.
RE_SENTENCE_TERMS = regex.compile(r" *\p{STerm}+ *")

# One or more characters that are neither a space, a newline, one of
# SPECIAL_CHARS, nor alphabetic. The compiled pattern must be bound to a name:
# preprocess() refers to it as RE_NON_LETTERS.
RE_NON_LETTERS = regex.compile(
    r"[^ \n{}\p{{Alphabetic}}]+".format(SPECIAL_CHARS))

# One or more plain ASCII spaces.
RE_SPACES = regex.compile(r" +")
def preprocess(s, unorm='NFC'):
    """Return a sanitized version of *s* (with a final newline).

    The text is stripped, lowercased, and then rewritten so that:

    * every run of Unicode space separators becomes a single space;
    * every run of sentence terminators (and the spaces around it) becomes
      a newline, giving one sentence per line;
    * every run of characters that are not letters, spaces, newlines or
      ``SPECIAL_CHARS`` becomes a single space;
    * consecutive spaces are collapsed.

    Args:
        s: The string to sanitize.
        unorm: Unicode normalization form passed to
            ``unicodedata.normalize`` (e.g. ``'NFC'``). A falsy value
            disables normalization entirely.

    Returns:
        The sanitized string, always terminated by a newline, or the empty
        string when *s* is empty or contains only whitespace.
    """
    s = s.strip()
    if not s:
        return s
    s = s.lower()
    if unorm:
        # Compose before filtering: in decomposed input the combining accents
        # are not alphabetic, so the non-letter filter would replace them with
        # spaces and split accented words before they could be normalized.
        s = unicodedata.normalize('NFC', s)
    s = RE_WHITESPACES.sub(' ', s)  # unify whitespaces
    s = RE_SENTENCE_TERMS.sub('\n', s)  # one sentence per line
    s = RE_NON_LETTERS.sub(' ', s)  # strip non-letters/special
    s = RE_SPACES.sub(' ', s)  # compact spaces after strip
    if unorm:  # unicode normalization, in the form the caller asked for
        s = unicodedata.normalize(unorm, s)
    if s.endswith('\n'):
        return s
    return s + '\n'
if __name__ == '__main__':
    # Filter mode: sanitize each line read from standard input.
    for line in sys.stdin:
        line = preprocess(line)
        if line:
            # preprocess() returns '' for blank input (skipped here) and
            # newline-terminated text otherwise, so write it out verbatim.
            sys.stdout.write(line)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.