Create a gist now

Instantly share code, notes, and snippets.

@durden /sanitize.py
Last active Dec 10, 2015

What would you like to do?
Different ways to 'sanitize' a list of strings to ensure they don't contain any of the contents of another 'ignored' list
"""
Demonstration of ways to implement this API:
sanitize(unsanitized_input, ignored_words)
Related discussions:
- Modifying a list while looping over it:
- http://stackoverflow.com/questions/1207406/remove-items-from-a-list-while-iterating-in-python
- Remove all occurences of a value in a list:
- http://stackoverflow.com/questions/1157106/remove-all-occurences-of-a-value-from-a-python-list
"""
# 1. Using set difference
def sanitize_1(unsanitized_input, ignored_words):
"""
Take in two lists and santize first list to ensure it doesn't contain any
items from the second list, return sanitized list
"""
# Downsides:
# - Sets are unordered so if user_input was a sentence we lose the
# ordering because set difference will create a new set and not
# maintain ordering.
return list(set(unsanitized_input) - set(ignored_words))
# 2. Using sets and instersection()
def sanitize_2(unsanitized_input, ignored_words):
"""
Take in two lists and santize first list to ensure it doesn't contain any
items from the second list, return sanitized list
"""
# Downsides:
# - We must loop over words individually, which could be slower.
# - Doesn't handle if an ignore word is present more than once.
ignored_words = set(ignored_words)
for ignore in ignored_words.intersection(unsanitized_input):
unsanitized_input.remove(ignore)
return unsanitized_input
# 3. Using sets and instersection() with ability to remove ALL ignored words.
def sanitize_3(unsanitized_input, ignored_words):
"""
Take in two lists and santize first list to ensure it doesn't contain any
items from the second list, return sanitized list
"""
# Downsides:
# - Looping over list while removing from it?
# http://stackoverflow.com/questions/1207406/remove-items-from-a-list-while-iterating-in-python
ignored_words = set(ignored_words)
for ignore in ignored_words.intersection(unsanitized_input):
while ignore in unsanitized_input:
unsanitized_input.remove(ignore)
return unsanitized_input
# 4. Using #3 with knowledge of list remove/mutable semantics considered
def sanitize_4(unsanitized_input, ignored_words):
"""
Take in two lists and santize first list to ensure it doesn't contain any
items from the second list, return sanitized list
"""
# Downsides:
# - Looping over list while removing from it?
# http://stackoverflow.com/questions/1207406/remove-items-from-a-list-while-iterating-in-python
ignored_words = set(ignored_words)
for ignore in ignored_words.intersection(unsanitized_input):
while ignore in unsanitized_input:
unsanitized_input.remove(ignore)
# Note: The return is not technically necessary because remove() does an
# in-place removal so the original list passed in is modified.
# 5. Using #4 without implicit side-effect of modifying list user passed in.
# This demonstrates the functional paradigm of functions never modifiying
# input parameters and always having the same output for the same input.
def sanitize_5(unsanitized_input, ignored_words):
"""
Take in two lists and santize first list to ensure it doesn't contain any
items from the second list, return sanitized list
"""
# Downsides:
# - Requires extra memory usage because original unsanitized_input is
# copied. Also, the final memory usage is increased in this version by
# the number of words/characters that are NOT in the ignored_words
# because those are removed.
# Copy input list (various ways to accomplish, but list() syntax is more
# clear IMO)
unsanitized_input = list(unsanitized_input)
# unsanitized_input[:] = unsanitized_input
# import copy
# unsanitized_input = copy(unsanitized_input)
ignored_words = set(ignored_words)
for ignore in ignored_words.intersection(unsanitized_input):
while ignore in unsanitized_input:
unsanitized_input.remove(ignore)
# Return is required now b/c we didn't touch the original list, we have our
# own local copy that is unsanitized
return unsanitized_input
def check_results():
user_input = 'the cat walked down the road.'.split()
ignore_words = ['the', 'um', 'eh']
print 'sanitize_1', sanitize_1(user_input, ignore_words)
print 'sanitize_2', sanitize_2(user_input, ignore_words)
print 'sanitize_3', sanitize_3(user_input, ignore_words)
# Create new list so sanitize_4 gets fresh copy of words to ignore instead
# of an already sanitized list (previous calls modified original
# user_input)
user_input = 'the cat walked down the road.'.split()
sanitize_4(user_input, ignore_words)
print 'sanitize_4', user_input
# Start fresh again, previous call modified in-place again.
user_input = 'the cat walked down the road.'.split()
user_input_returned = sanitize_5(user_input, ignore_words)
print 'sanitize_5', user_input_returned
print ' original list', user_input, id(user_input)
print ' sanitized list', user_input_returned, id(user_input_returned)
def check_performance():
from timeit import timeit
from functools import partial
import sys
def _sanitize_funcs():
module = sys.modules[__name__]
# Loop over them in order so we can print them as we 'improved' the
# algorithm
for obj in sorted(vars(module).values()):
if callable(obj) and obj.__name__.startswith('sanitize_'):
yield obj
# This is needed to setup the local variables and have a context for
# the function.
ignore_words = ['the', 'eh' 'um', 'ugh']
lots_of_input = 'the cat walked down the road.'
user_input = lots_of_input.split()
for func in _sanitize_funcs():
print '%s %f secs' % (func.__name__,
timeit(partial(func, user_input, ignore_words)))
def main():
print '----- Function Results -----'
check_results()
print '----- Function Performance -----'
check_performance()
if __name__ == "__main__":
main()

glenbot commented Jan 30, 2013

i got significantly different results on my Macbook Air:

----- Function Results -----
sanitize_1 ['down', 'road.', 'walked', 'cat']
sanitize_2 ['cat', 'walked', 'down', 'the', 'road.']
sanitize_3 ['cat', 'walked', 'down', 'road.']
sanitize_4 ['cat', 'walked', 'down', 'road.']
sanitize_5 ['cat', 'walked', 'down', 'road.']
original list ['the', 'cat', 'walked', 'down', 'the', 'road.'] 4353351976
sanitized list ['cat', 'walked', 'down', 'road.'] 4353542768
----- Function Performance -----
sanitize_2 0.950371 secs
sanitize_3 0.955089 secs
sanitize_4 0.980491 secs
sanitize_1 1.505497 secs
sanitize_5 1.411789 secs

Note that sanitize_2 didn't even work.

glenbot commented Jan 30, 2013

There is a scoping issue so I changed the following:

    # This is needed to setup the local variables and have a context for
    # the function.
    ignore_words = ['the', 'um', 'eh']
    lots_of_input = 'the cat walked down the road.'

    for func in _sanitize_funcs():
        user_input = lots_of_input.split()
        print '%s %f secs' % (func.__name__,
                              timeit(partial(func, user_input, ignore_words)))

Notice where I moved user_input to.

I also added three more function to test other patterns:

def sanitize_6(unsanitized_input, ignored_words):
    new_list = []
    for w in unsanitized_input:
        if w not in ignored_words:
            new_list.append(w)
    return new_list


def sanitize_7(unsanitized_input, ignored_words):
    return [w for w in unsanitized_input if w not in ignored_words]


def sanitize_8(unsanitized_input, ignored_words):
    for w in ignored_words:
        try:
            unsanitized_input.pop(unsanitized_input.index(w))
        except ValueError:
            pass
    return unsanitized_input

Here are the results:

$ python sanitize.py 
----- Function Results -----
sanitize_1 ['down', 'road.', 'walked', 'cat']
sanitize_2 ['cat', 'walked', 'down', 'the', 'road.']
sanitize_3 ['cat', 'walked', 'down', 'road.']
sanitize_4 ['cat', 'walked', 'down', 'road.']
sanitize_5 ['cat', 'walked', 'down', 'road.']
sanitize_6 ['cat', 'walked', 'down', 'road.']
sanitize_7 ['cat', 'walked', 'down', 'road.']
sanitize_8 ['cat', 'walked', 'down', 'road.']
  original list ['the', 'cat', 'walked', 'down', 'the', 'road.'] 4337426728
  sanitized list ['cat', 'walked', 'down', 'road.'] 4337618024
----- Function Performance -----
sanitize_2 0.965178 secs
sanitize_3 0.972548 secs
sanitize_4 0.971117 secs
sanitize_1 1.561817 secs
sanitize_6 1.808824 secs
sanitize_5 2.613811 secs
sanitize_7 1.333645 secs
sanitize_8 7.508463 secs

glenbot commented Jan 30, 2013

sanitize_8 is broken :/

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment