Skip to content

Instantly share code, notes, and snippets.

@durden
Last active April 26, 2023 11:13
Show Gist options
  • Save durden/4504120 to your computer and use it in GitHub Desktop.
Save durden/4504120 to your computer and use it in GitHub Desktop.
Different ways to 'sanitize' a list of strings to ensure they don't contain any of the contents of another 'ignored' list
"""
Demonstration of ways to implement this API:
sanitize(unsanitized_input, ignored_words)
Related discussions:
- Modifying a list while looping over it:
- http://stackoverflow.com/questions/1207406/remove-items-from-a-list-while-iterating-in-python
- Remove all occurences of a value in a list:
- http://stackoverflow.com/questions/1157106/remove-all-occurences-of-a-value-from-a-python-list
"""
# 1. Using set difference
def sanitize_1(unsanitized_input, ignored_words):
"""
Take in two lists and santize first list to ensure it doesn't contain any
items from the second list, return sanitized list
"""
# Downsides:
# - Sets are unordered so if user_input was a sentence we lose the
# ordering because set difference will create a new set and not
# maintain ordering.
return list(set(unsanitized_input) - set(ignored_words))
# 2. Using sets and instersection()
def sanitize_2(unsanitized_input, ignored_words):
"""
Take in two lists and santize first list to ensure it doesn't contain any
items from the second list, return sanitized list
"""
# Downsides:
# - We must loop over words individually, which could be slower.
# - Doesn't handle if an ignore word is present more than once.
ignored_words = set(ignored_words)
for ignore in ignored_words.intersection(unsanitized_input):
unsanitized_input.remove(ignore)
return unsanitized_input
# 3. Using sets and instersection() with ability to remove ALL ignored words.
def sanitize_3(unsanitized_input, ignored_words):
"""
Take in two lists and santize first list to ensure it doesn't contain any
items from the second list, return sanitized list
"""
# Downsides:
# - Looping over list while removing from it?
# http://stackoverflow.com/questions/1207406/remove-items-from-a-list-while-iterating-in-python
ignored_words = set(ignored_words)
for ignore in ignored_words.intersection(unsanitized_input):
while ignore in unsanitized_input:
unsanitized_input.remove(ignore)
return unsanitized_input
# 4. Using #3 with knowledge of list remove/mutable semantics considered
def sanitize_4(unsanitized_input, ignored_words):
"""
Take in two lists and santize first list to ensure it doesn't contain any
items from the second list, return sanitized list
"""
# Downsides:
# - Looping over list while removing from it?
# http://stackoverflow.com/questions/1207406/remove-items-from-a-list-while-iterating-in-python
ignored_words = set(ignored_words)
for ignore in ignored_words.intersection(unsanitized_input):
while ignore in unsanitized_input:
unsanitized_input.remove(ignore)
# Note: The return is not technically necessary because remove() does an
# in-place removal so the original list passed in is modified.
# 5. Using #4 without implicit side-effect of modifying list user passed in.
# This demonstrates the functional paradigm of functions never modifiying
# input parameters and always having the same output for the same input.
def sanitize_5(unsanitized_input, ignored_words):
"""
Take in two lists and santize first list to ensure it doesn't contain any
items from the second list, return sanitized list
"""
# Downsides:
# - Requires extra memory usage because original unsanitized_input is
# copied. Also, the final memory usage is increased in this version by
# the number of words/characters that are NOT in the ignored_words
# because those are removed.
# Copy input list (various ways to accomplish, but list() syntax is more
# clear IMO)
unsanitized_input = list(unsanitized_input)
# unsanitized_input[:] = unsanitized_input
# import copy
# unsanitized_input = copy(unsanitized_input)
ignored_words = set(ignored_words)
for ignore in ignored_words.intersection(unsanitized_input):
while ignore in unsanitized_input:
unsanitized_input.remove(ignore)
# Return is required now b/c we didn't touch the original list, we have our
# own local copy that is unsanitized
return unsanitized_input
def check_results():
user_input = 'the cat walked down the road.'.split()
ignore_words = ['the', 'um', 'eh']
print 'sanitize_1', sanitize_1(user_input, ignore_words)
print 'sanitize_2', sanitize_2(user_input, ignore_words)
print 'sanitize_3', sanitize_3(user_input, ignore_words)
# Create new list so sanitize_4 gets fresh copy of words to ignore instead
# of an already sanitized list (previous calls modified original
# user_input)
user_input = 'the cat walked down the road.'.split()
sanitize_4(user_input, ignore_words)
print 'sanitize_4', user_input
# Start fresh again, previous call modified in-place again.
user_input = 'the cat walked down the road.'.split()
user_input_returned = sanitize_5(user_input, ignore_words)
print 'sanitize_5', user_input_returned
print ' original list', user_input, id(user_input)
print ' sanitized list', user_input_returned, id(user_input_returned)
def check_performance():
from timeit import timeit
from functools import partial
import sys
def _sanitize_funcs():
module = sys.modules[__name__]
# Loop over them in order so we can print them as we 'improved' the
# algorithm
for obj in sorted(vars(module).values()):
if callable(obj) and obj.__name__.startswith('sanitize_'):
yield obj
# This is needed to setup the local variables and have a context for
# the function.
ignore_words = ['the', 'eh' 'um', 'ugh']
lots_of_input = 'the cat walked down the road.'
user_input = lots_of_input.split()
for func in _sanitize_funcs():
print '%s %f secs' % (func.__name__,
timeit(partial(func, user_input, ignore_words)))
def main():
print '----- Function Results -----'
check_results()
print '----- Function Performance -----'
check_performance()
if __name__ == "__main__":
main()
@glenbot
Copy link

glenbot commented Jan 30, 2013

sanitize_8 is broken :/

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment