Last active
December 11, 2015 16:33
-
-
Save alvations/55d78f627ac8bac0bf34 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
This is a script used to clean control characters from the following corpora:
- NTU-Multilingual Corpus (http://web.mysites.ntu.edu.sg/fcbond/open/pubs/2012-ijalp-ntumc.pdf)
- SeedLing Corpus (http://www.aclweb.org/anthology/W/W14/W14-2211.pdf)
- DSL Corpus Collection (https://comparable.limsi.fr/bucc2014/4.pdf)
"""
import re | |
import unicodedata | |
# Python 2/3 compatibility: Python 3 renamed ``unichr``/``xrange`` to
# ``chr``/``range``, so fall back to those when the old names are missing.
try:
    _unichr, _xrange = unichr, xrange
except NameError:
    _unichr, _xrange = chr, range


def _to_class_ranges(chars):
    """Render *chars* as the body of a regex character class using ranges.

    *chars* must be in ascending code point order without duplicates.
    Runs of consecutive code points are collapsed into ``lo-hi`` ranges so
    the resulting pattern stays small, instead of listing every character
    as its own literal.
    """
    ranges = []
    first = last = None
    for c in chars:
        if last is not None and ord(c) == ord(last) + 1:
            # Still inside the current run of consecutive code points.
            last = c
            continue
        if last is not None:
            ranges.append((first, last))
        first = last = c
    if last is not None:
        ranges.append((first, last))
    return ''.join(
        re.escape(lo) if lo == hi
        else '%s-%s' % (re.escape(lo), re.escape(hi))
        for lo, hi in ranges
    )


# A lazy stream of every Unicode character (U+0000 through U+10FFFF).
all_chars = (_unichr(i) for i in _xrange(0x110000))
# All characters whose Unicode general category starts with 'C',
# concatenated in ascending code point order.
control_chars = ''.join(
    c for c in all_chars if unicodedata.category(c)[0] == 'C'
)
# Compile one character class matching exactly the characters in
# ``control_chars``; ranges keep the pattern compact and quick to compile.
cc_re = re.compile('[%s]' % _to_class_ranges(control_chars))
def rm_control_chars(s):
    """Return *s* with every match of the module-level ``cc_re`` removed.

    ``cc_re`` is built from the characters whose Unicode general category
    starts with 'C'; see
    http://www.unicode.org/reports/tr44/#General_Category_Values
    """
    return re.sub(cc_re, '', s)
# A faster version of the cleaner from NTU-MC and DSLCC.
def rm_control_chars_fast(s):
    """Strip control characters from *s* and normalise its whitespace.

    A character is dropped when its Unicode general category starts with
    'C' -- the same rule used to build ``control_chars``.  The category is
    looked up per character, so the cost per character is constant instead
    of a scan through the whole ``control_chars`` string.

    *s* must be a text (unicode) string.  Returns the cleaned text with
    leading/trailing whitespace removed and every remaining run of
    whitespace replaced by a single space.
    """
    category = unicodedata.category  # bound once; called for every character
    cleaned_s = ''.join(c for c in s if category(c)[0] != 'C')
    # Tabs and newlines are category Cc, so they were removed above rather
    # than turned into spaces; split/join only collapses what is left.
    return ' '.join(cleaned_s.split())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment