Skip to content

Instantly share code, notes, and snippets.

@LinuxBozo
Created May 3, 2017 18:35
Show Gist options
  • Save LinuxBozo/451db2dce7f2728195ea0901ac6208bb to your computer and use it in GitHub Desktop.
Save LinuxBozo/451db2dce7f2728195ea0901ac6208bb to your computer and use it in GitHub Desktop.
try:
    _TEXT_TYPE = unicode  # Python 2 text type  # noqa: F821
except NameError:
    _TEXT_TYPE = str  # Python 3: ``str`` is already the text type

# Sequences produced when the UTF-8 bytes of a typographic character were
# decoded as ISO-8859-1 (each byte became one code point).  They must be
# handled BEFORE the single-character table below: several of them end in
# a character (U+0082, U+0091-U+0096, U+00B4) that the table would
# otherwise rewrite first, which would leave the sequence unmatched.
# No sequence can overlap another (lead characters never occur in a
# trailing position), so the order inside this tuple is irrelevant.
_MOJIBAKE_REPLACEMENTS = (
    # double-quote look-alikes
    (u"\xCA\xBA", u'"'),      # U+02BA modifier letter double prime
    (u"\xCB\x9D", u'"'),      # U+02DD double acute accent
    (u"\xCC\x8B", u'"'),      # U+030B combining double acute accent
    (u"\xE2\x80\xB3", u'"'),  # U+2033 double prime
    (u"\xE3\x80\x83", u'"'),  # U+3003 ditto mark
    (u"\xC2\x84", u'"'),      # U+0084 (cp1252 low double quote slot)
    (u"\xE2\x80\x9E", u'"'),  # U+201E double low-9 quotation mark
    (u"\xE2\x80\x9F", u'"'),  # U+201F double high-reversed-9 quote
    (u"\xC2\x93", u'"'),      # U+0093 (cp1252 left double quote slot)
    (u"\xE2\x80\x9C", u'"'),  # U+201C left double quotation mark
    (u"\xC2\x94", u'"'),      # U+0094 (cp1252 right double quote slot)
    (u"\xE2\x80\x9D", u'"'),  # U+201D right double quotation mark
    (u"\xCB\xAE", u'"'),      # U+02EE modifier letter double apostrophe
    # apostrophe look-alikes
    (u"\xCA\xBC", u"'"),      # U+02BC modifier letter apostrophe
    (u"\xCC\x93", u"'"),      # U+0313 combining comma above
    (u"\xCC\x95", u"'"),      # U+0315 combining comma above right
    (u"\xC2\x82", u"'"),      # U+0082 (cp1252 low single quote slot)
    (u"\xE2\x80\x9A", u"'"),  # U+201A single low-9 quotation mark
    (u"\xC2\xB4", u"'"),      # U+00B4 acute accent
    (u"\xCB\x8A", u"'"),      # U+02CA modifier letter acute accent
    (u"\xCC\x81", u"'"),      # U+0301 combining acute accent
    (u"\xE2\x80\x9B", u"'"),  # U+201B single high-reversed-9 quote
    (u"\xC2\x91", u"'"),      # U+0091 (cp1252 left single quote slot)
    (u"\xE2\x80\x98", u"'"),  # U+2018 left single quotation mark
    (u"\xC2\x92", u"'"),      # U+0092 (cp1252 right single quote slot)
    (u"\xE2\x80\x99", u"'"),  # U+2019 right single quotation mark
    (u"\xCA\xB9", u"'"),      # U+02B9 modifier letter prime
    (u"\xE2\x80\xB2", u"'"),  # U+2032 prime
    # grave-accent look-alikes
    (u"\xCB\x8B", u"`"),      # U+02CB modifier letter grave accent
    (u"\xCC\x80", u"`"),      # U+0300 combining grave accent
    (u"\xE2\x80\xB5", u"`"),  # U+2035 reversed prime
    # bullets and dashes
    (u"\xC2\x95", u"-"),      # U+0095 (cp1252 bullet slot)
    (u"\xC2\x96", u"-"),      # U+0096 (cp1252 en dash slot)
    (u"\xE2\x80\x93", u"-"),  # U+2013 en dash
    (u"\xE2\x80\x94", u"-"),  # U+2014 em dash
)


def _build_single_char_table():
    """Return the ordinal -> replacement map used with ``translate``."""
    table = {}
    # Control characters that are dropped outright: NUL, backspace,
    # vertical tab, shift out/in, DLE..ESC, and information separators
    # two and one.
    dropped = [0x00, 0x08, 0x0B, 0x0E, 0x0F, 0x1E, 0x1F]
    dropped.extend(range(0x10, 0x1C))
    table.update(dict.fromkeys(dropped, None))
    # Form feed (page break) becomes a newline.
    table[0x0C] = u"\n"
    # 0x1C / 0x1D are treated as left / right double quotation marks.
    double_quotes = (
        0x001C, 0x001D, 0x0084, 0x0093, 0x0094, 0x02BA, 0x02DD, 0x02EE,
        0x030B, 0x201C, 0x201D, 0x201E, 0x201F, 0x2033, 0x2036, 0x3003,
    )
    table.update(dict.fromkeys(double_quotes, u'"'))
    apostrophes = (
        0x0082, 0x0091, 0x0092, 0x00B4, 0x02B9, 0x02BC, 0x02CA, 0x0301,
        0x0313, 0x0315, 0x2018, 0x2019, 0x201A, 0x201B, 0x2032,
    )
    table.update(dict.fromkeys(apostrophes, u"'"))
    table.update(dict.fromkeys((0x02CB, 0x0300, 0x2035), u"`"))
    table.update(dict.fromkeys((0x0095, 0x0096, 0x2013, 0x2014), u"-"))
    return table


_SINGLE_CHAR_TABLE = _build_single_char_table()


def scrubMicrosoftChars(scrubbedString="", charset='utf-8'):
    """Repair Microsoft special characters by mapping to standard characters.

    Smart quotes, primes, accents used as quotes, bullets and dashes are
    mapped to their plain ASCII equivalents; most C0 control characters are
    removed and a form feed becomes a newline.

    Args:
        scrubbedString: The text to clean, either already-decoded text or
            a byte string encoded in ``charset``.
        charset: Declared charset of the input (case-insensitive).  Only
            'iso-8859-1' and 'windows-1252' trigger scrubbing; note that
            the default 'utf-8' therefore returns the input untouched.

    Returns:
        "" for any falsy input; the unchanged input for a non-scrubbable
        charset; otherwise the cleaned text as a text (unicode) string,
        not encoded bytes.
    """
    if not scrubbedString:
        return ""
    if charset.lower() not in ('iso-8859-1', 'windows-1252'):
        # since it's not a "friendly" charset, don't scrub, just return it
        return scrubbedString
    # Already-decoded text skips the decoding step; byte strings are
    # decoded with undecodable bytes replaced rather than raising.
    if isinstance(scrubbedString, _TEXT_TYPE):
        text = scrubbedString
    else:
        text = _TEXT_TYPE(scrubbedString, charset.lower(), errors='replace')
    # Multi-character sequences first (see _MOJIBAKE_REPLACEMENTS), then
    # every single-character mapping in one pass.
    for sequence, plain in _MOJIBAKE_REPLACEMENTS:
        text = text.replace(sequence, plain)
    return text.translate(_SINGLE_CHAR_TABLE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment