Last active
September 6, 2017 17:53
-
-
Save floer32/c10f50a302613ed0639d94895ac9623a to your computer and use it in GitHub Desktop.
(OLD; newer version is here: https://github.com/hangtwenty/presswork/blob/master/presswork/text/clean.py ) I wanted to "ensure" strings had been sanitized (avoid running redundantly). This is ONLY ONE type of sanitization, removing control chars (BESIDES NEWLINES), because that is what my current project needed. But I thought the design could be…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" throw your strings to SanitizedString and "ensure" they have been sanitized, such as removing control characters. | |
SanitizedString will avoid running redundantly, by checking type of the input (good for Very Big Strings) | |
>>> hello = SanitizedString(chr(0) + "hello") | |
>>> assert hello == "hello" | |
>>> assert chr(0) not in hello | |
>>> assert SanitizedString(hello) == hello | |
At the time of writing there is only one sanitization filter in use:
remove all control characters besides newlines and carriage returns (null bytes etc. are removed).
Other filter functions could be added, as needed, to SANITIZERS.
(Exploratory testing yielded undesirable behavior when feeding in null bytes and so on.)

More info & doctests below.
""" | |
import re | |
from UserString import UserString | |
# Code points of the C0 controls (0-31), DEL (127) and the C1 controls (128-159).
# list(...) keeps the concatenation valid on interpreters where range() is lazy.
_all_control_char_numbers = list(range(0, 32)) + list(range(127, 160))

# The same code points minus line feed AND carriage return, so line structure
# survives sanitization. (The exclusion tuple previously listed ord("\n") twice,
# which meant carriage returns were stripped despite the documented intent.)
_char_numbers_besides_newlines = [
    c for c in _all_control_char_numbers if c not in (ord("\n"), ord("\r"))
]

try:
    _unichr = unichr  # Python 2: unichr() builds unicode characters
except NameError:
    _unichr = chr  # Python 3: chr() already returns text characters

# Kept as real lists (not lazy map objects) so they can be reused after the
# regexes below have been built from them.
all_control_chars = [_unichr(c) for c in _all_control_char_numbers]
control_chars_besides_newlines = [_unichr(c) for c in _char_numbers_besides_newlines]

# (note, flags=re.UNICODE is *not* required: the flag only changes what shorthand
# classes such as \w, \d, \s and \b match. These patterns are explicit sets of
# literal characters, so they behave the same with or without it.)
re_control_chars = re.compile('[%s]' % re.escape(''.join(all_control_chars)))
re_control_chars_besides_newlines = re.compile('[%s]' % re.escape(''.join(control_chars_besides_newlines)))
def remove_control_characters(string_or_unicode, keep_newlines=False):
    """ strip control characters out of a byte string or a unicode string

    approach adapted from https://stackoverflow.com/a/93029/884640
    (no stdlib helper exists for this, so the module's precompiled regexes do the work.)

    the doctest below is deliberately redundant - for example it re-states the control
    character ranges instead of reusing the module constants - to be thorough and obvious.

    >>> import random, sys
    >>> null_byte = chr(0)
    >>> basic_input = "hello" + null_byte + "world"
    >>> assert null_byte in basic_input
    >>> assert null_byte not in remove_control_characters(basic_input)
    >>> all_chars = (unichr(i) for i in xrange(sys.maxunicode))
    >>> all_chars_as_list = list(all_chars)
    >>> random.shuffle(all_chars_as_list)
    >>> all_chars_shuffled = "".join(all_chars_as_list)
    >>> del all_chars, all_chars_as_list
    >>> all_chars_except_control_chars = remove_control_characters(all_chars_shuffled)
    >>> assert null_byte not in all_chars_except_control_chars
    >>> control_char_nums = range(0, 32) + range(127, 160)
    >>> for character in map(unichr, control_char_nums):
    ...     assert character not in all_chars_except_control_chars
    >>> newline = chr(10)  # (using chr(10) because putting literal newline in doctest/docstring messes it up)
    >>> x = remove_control_characters(newline + "hello" + null_byte + newline, keep_newlines=True)
    >>> assert x == newline + "hello" + newline
    >>> x = remove_control_characters(newline + "hello" + null_byte + newline, keep_newlines=False)
    >>> assert x == "hello"

    :param string_or_unicode: the text to clean
    :param keep_newlines: if true, match with re_control_chars_besides_newlines
        instead of re_control_chars
    :return: the input with every character matched by the chosen pattern removed
    """
    # Pick the precompiled pattern once, then do a single substitution pass.
    pattern = re_control_chars_besides_newlines if keep_newlines else re_control_chars
    return pattern.sub(u'', string_or_unicode)
class SanitizedString(UserString):
    """ string wrapper that passes its input through every filter in SANITIZERS on construction.

    input that is already a SanitizedString is not filtered again; its underlying
    data object is reused as-is (cheap even for Very Big Strings).

    >>> assert SanitizedString(u"hello") == u"hello"
    >>> assert isinstance(u"hello", unicode)
    >>> assert not SanitizedString("")  # confirm truthiness is same as normal strings
    >>> assert not SanitizedString(u"")  # confirm truthiness is same as normal strings
    >>> assert SanitizedString("hello")
    >>> null_byte = chr(0)
    >>> assert null_byte
    >>> assert null_byte != ''
    >>> assert SanitizedString(null_byte) == ''
    >>> assert SanitizedString(null_byte + "hello") == "hello"
    >>> assert SanitizedString(SanitizedString(SanitizedString(SanitizedString(u'idempotent')))) == u'idempotent'
    >>> hi_san = SanitizedString('hi')
    >>> # when avoiding redundant sanitization, we would expect the internal string to be exact same object
    >>> assert SanitizedString(SanitizedString(hi_san)).data is hi_san.data
    """

    # Filters applied, in order, to any input that is not already a SanitizedString.
    SANITIZERS = (
        lambda text: remove_control_characters(text, keep_newlines=True),
    )

    def __init__(self, string):
        """ store `string`, filtered unless it is already a SanitizedString. """
        if not isinstance(string, SanitizedString):
            cleaned = string
            for scrub in self.SANITIZERS:
                cleaned = scrub(cleaned)
        else:
            # Already sanitized: share the very same underlying object.
            cleaned = string.data
        self.data = cleaned

    def __unicode__(self):
        """ return the wrapped data converted with unicode(). """
        text = self.data
        return unicode(text)
SECURITY NOTE - remember what context you are sanitizing for!
NEWLINES ARE PART OF "CONTROL CHARACTERS." Sometimes you want them, sometimes you don't. Don't forget about CRLF injection.
- Remember, some cases WILL NEED TO remove newlines/CRs, such as sanitizing for security purposes. Don't forget about CRLF Injection
- On the other hand if you are just 'cleaning up text' for some other purpose, you might have a need to keep the newlines/CRs.
Q: Hmm, do the regexes need `re.UNICODE`? They seem OK without it, but I wonder if there are spooky edge cases I haven't tried.
A: Not in this case, because the pattern is just an explicit set of literal control characters. The `re.UNICODE` flag is about the substance of the regex (what shorthand classes such as `\w` and `\b` match), not about the strings you are using it against.
OLD; newer version is here: https://github.com/hangtwenty/presswork/blob/master/presswork/text/clean.py
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I've done similar things a few times but this particular iteration came up while hacking around on this project https://github.com/hangtwenty/presswork
Not trying to turn this into a lib right now, until I have another 1 or 2 cases come up... Can vary so much per use-case/project, so a generic lib wouldn't be very useful unless done carefully. Leaving this as a 'starter'/reminder for now.