Skip to content

Instantly share code, notes, and snippets.

@olbat
Last active March 20, 2018 09:30
Show Gist options
  • Save olbat/4e2cfd073fdcf076926b55e6483f1b13 to your computer and use it in GitHub Desktop.
Save olbat/4e2cfd073fdcf076926b55e6483f1b13 to your computer and use it in GitHub Desktop.
Python string pre-processing for ML
#!/usr/bin/python3
import sys
import unicodedata
import regex # https://pypi.python.org/pypi/regex/
# Characters other than letters that are kept in the text (hyphen and
# apostrophe), written pre-escaped so they can be spliced into a regex
# character class.
SPECIAL_CHARS = r"\-'"  # FIXME: French specific

# One or more Unicode space separators (general category Zs).
RE_WHITESPACES = regex.compile(r"\p{Zs}+")

# A run of sentence-terminal characters together with any plain spaces
# directly before and after it.
RE_SENTENCE_TERMS = regex.compile(r" *\p{STerm}+ *")

# A run of characters that are neither a space, a newline, one of
# SPECIAL_CHARS, nor alphabetic. The doubled braces are str.format escapes
# producing the literal "\p{Alphabetic}".
RE_NON_LETTERS = regex.compile(
    r"[^ \n{}\p{{Alphabetic}}]+".format(SPECIAL_CHARS)
)

# One or more plain ASCII spaces.
RE_SPACES = regex.compile(r" +")
def preprocess(s, unorm='NFC'):
    '''
    Return a sanitized version of _s_ (with a final newline)

    The text is stripped, lower-cased, split into one sentence per line on
    sentence-terminal characters, and reduced to letters, SPECIAL_CHARS and
    single spaces. Lines that end up empty are dropped and no line carries
    leading or trailing spaces.

    _s_ is the text to clean. _unorm_ is the Unicode normalization form
    passed to unicodedata.normalize() for the result ('NFC', 'NFD', 'NFKC'
    or 'NFKD'); a false value disables normalization entirely.

    Returns the cleaned text terminated by a newline, or an empty string
    when nothing remains once non-letters are removed.
    '''
    s = s.strip()
    if not s:
        return s
    s = s.lower()
    if unorm:
        # Compose canonically *before* filtering: in decomposed input an
        # accent is a separate combining mark (e.g. U+0301) that
        # RE_NON_LETTERS does not treat as alphabetic, so it would be
        # replaced by a space and the accent lost for good.
        s = unicodedata.normalize('NFC', s)
    s = RE_WHITESPACES.sub(' ', s)  # unify whitespaces
    s = RE_SENTENCE_TERMS.sub('\n', s)  # one sentence per line
    s = RE_NON_LETTERS.sub(' ', s)  # strip non-letters/special
    s = RE_SPACES.sub(' ', s)  # compact spaces after strip
    # Stripping non-letters can leave whitespace-only lines (e.g. a sentence
    # made only of digits) and stray spaces at line edges; remove both.
    kept = [line for line in (raw.strip(' ') for raw in s.split('\n')) if line]
    if not kept:
        return ''
    s = '\n'.join(kept)
    if unorm:  # unicode normalization to the form requested by the caller
        s = unicodedata.normalize(unorm, s)
    return s + '\n'
if __name__ == '__main__':
    # Filter mode: clean every line read from stdin and echo the result to
    # stdout, skipping lines for which preprocess() returns an empty string.
    for raw_line in sys.stdin:
        cleaned = preprocess(raw_line)
        if not cleaned:
            continue
        sys.stdout.write(cleaned)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment