Skip to content

Instantly share code, notes, and snippets.

@jhyland87
Created July 22, 2016 17:31
Show Gist options
  • Save jhyland87/2bac8523407dbf9c922bcc7cb83fa954 to your computer and use it in GitHub Desktop.
Save jhyland87/2bac8523407dbf9c922bcc7cb83fa954 to your computer and use it in GitHub Desktop.
import fileinput
import sys
import string
# Python 2 only: force the interpreter-wide default encoding to UTF-8 so that
# implicit str <-> unicode conversions use UTF-8 instead of the ASCII default.
# NOTE(review): reload(sys) is presumably needed because
# sys.setdefaultencoding is not reachable on the module as first imported
# (the site module removes it at startup) - confirm against the Python 2 docs.
reload(sys)
sys.setdefaultencoding("utf-8")
def sanitizeData(data):
    """Replace typographic Unicode characters in *data* with ASCII ones.

    * A list or dict is walked recursively; its items/values are replaced
      **in place** and the same container object is returned.
    * Anything else is treated as a string (non-string values are first
      converted with ``str()``) and cleaned.

    Cleaning a string means: strip every ASCII punctuation character that is
    already present, then map "smart" quotes, dashes, the no-break space,
    etc. to their plain ASCII counterparts.

    If a character from the unsupported set (euro sign, ellipsis, some
    accented and Cyrillic letters, ...) is found, ``BAD STRING FOUND`` is
    printed and the process is terminated by raising ``SystemExit``.

    The result is always the native ``str`` type: UTF-8 encoded bytes on
    Python 2, text on Python 3. Byte-string input is decoded as UTF-8
    explicitly, so the function does not depend on the interpreter's default
    encoding.
    """
    text_type = type(u'')  # ``unicode`` on Python 2, ``str`` on Python 3

    # Characters that have a reasonable ASCII stand-in.
    replacement_chars = {
        u'\u201C': u'"',  # LEFT DOUBLE QUOTATION MARK
        u'\u201D': u'"',  # RIGHT DOUBLE QUOTATION MARK
        u'\u2018': u"'",  # LEFT SINGLE QUOTATION MARK
        u'\u2019': u"'",  # RIGHT SINGLE QUOTATION MARK
        u'\u2014': u'-',  # EM DASH
        u'\u2013': u'-',  # EN DASH
        u'\u02DC': u'~',  # SMALL TILDE
        u'\u201A': u"'",  # SINGLE LOW-9 QUOTATION MARK
        u'\u201E': u'"',  # DOUBLE LOW-9 QUOTATION MARK
        u'\u00A0': u' ',  # NO-BREAK SPACE
        u'\u2011': u'-',  # NON-BREAKING HYPHEN
        u'\u2022': u'-',  # BULLET
    }

    # Characters with no acceptable stand-in; finding one aborts the run.
    unsupported_chars = frozenset((
        u'\u20AC',  # EURO SIGN
        u'\u0192',  # LATIN SMALL LETTER F WITH HOOK
        u'\u2026',  # HORIZONTAL ELLIPSIS
        u'\u2020',  # DAGGER
        u'\u2021',  # DOUBLE DAGGER
        u'\u02C6',  # MODIFIER LETTER CIRCUMFLEX ACCENT
        u'\u2030',  # PER MILLE SIGN
        u'\u0160',  # LATIN CAPITAL LETTER S WITH CARON
        u'\u2039',  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        u'\u0152',  # LATIN CAPITAL LIGATURE OE
        u'\u017D',  # LATIN CAPITAL LETTER Z WITH CARON
        u'\u2122',  # TRADE MARK SIGN
        u'\u0161',  # LATIN SMALL LETTER S WITH CARON
        u'\u203A',  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        u'\u0153',  # LATIN SMALL LIGATURE OE
        u'\u017E',  # LATIN SMALL LETTER Z WITH CARON
        u'\u0178',  # LATIN CAPITAL LETTER Y WITH DIAERESIS
        u'\u0420',  # CYRILLIC CAPITAL LETTER ER
        u'\u043E',  # CYRILLIC SMALL LETTER O
        u'\u0441',  # CYRILLIC SMALL LETTER ES
        u'\u0438',  # CYRILLIC SMALL LETTER I
        u'\u044F',  # CYRILLIC SMALL LETTER YA
        u'\u0103',  # LATIN SMALL LETTER A WITH BREVE
    ))

    # Built once per call instead of once per string.
    translation = dict((ord(char), repl) for char, repl in replacement_chars.items())
    ascii_punctuation = frozenset(string.punctuation)

    def _sanatizeStr(dirty_string):
        """Clean a single value and return it as a native ``str``."""
        if not isinstance(dirty_string, (bytes, text_type)):
            dirty_string = str(dirty_string)
        if isinstance(dirty_string, bytes):
            # Explicit decode: no reliance on sys.setdefaultencoding().
            dirty_string = dirty_string.decode('utf-8')

        # NOTE(review): ASCII punctuation that is already in the input is
        # removed *before* the mapping, so "don't" becomes "dont" while
        # "don\u2019t" becomes "don't". Kept as-is to preserve behavior.
        text = u''.join(ch for ch in dirty_string if ch not in ascii_punctuation)

        if not unsupported_chars.isdisjoint(text):
            print("BAD STRING FOUND")
            # sys.exit() rather than the site-provided exit() helper, which
            # is meant for interactive use and may be missing; the non-zero
            # status tells the calling shell that the run failed.
            sys.exit(1)

        text = text.translate(translation)
        if str is text_type:
            return text
        return text.encode('utf-8')  # Python 2: hand back a byte string

    def _sanatizeList(listData):
        """Clean every element of *listData* in place."""
        for index, item in enumerate(listData):
            listData[index] = _sanatize(item)
        return listData

    def _sanatizeDict(dictData):
        """Clean every value of *dictData* in place (keys are untouched)."""
        for key, value in list(dictData.items()):
            dictData[key] = _sanatize(value)
        return dictData

    def _sanatize(value):
        """Dispatch on container type; everything else is a string."""
        if isinstance(value, list):
            return _sanatizeList(value)
        if isinstance(value, dict):
            return _sanatizeDict(value)
        return _sanatizeStr(value)

    return _sanatize(data)
# Interactive demo: read one line, sanitize it, and show the value and type
# of the text before and after cleaning.
dirty_string = raw_input("Enter Dirty String: ")  # Python 2.x
clean_string = sanitizeData(dirty_string)

print('\nRESULTS')
for label, value in (
    ('Original (Dirty)', dirty_string),
    ('Modified (Sanitized)', clean_string),
):
    print('{0:<25}: {1:<25}'.format(label, value))
    print('Type: %s' % type(value))
# EXAMPLE RESULTS:
# Wayne’s Candies -> Wayne's Candies
# Quotes: ‘ ’ “ ” -> ' ' " "
# Euro: €123 -> BAD STRING FOUND
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment