Skip to content

Instantly share code, notes, and snippets.

@jhyland87
Created June 21, 2016 22:56
Show Gist options
  • Save jhyland87/5b4a3b1bbca473ad09a8da9a5bdadd54 to your computer and use it in GitHub Desktop.
Save jhyland87/5b4a3b1bbca473ad09a8da9a5bdadd54 to your computer and use it in GitHub Desktop.
import fileinput
import sys
# Python 2 only: force the interpreter-wide default string encoding to UTF-8.
# `reload` is a Python 2 builtin; reloading `sys` is needed because the
# interpreter removes `sys.setdefaultencoding` from the module after startup.
# NOTE(review): this changes implicit str<->unicode conversion for the whole
# process, and the conversions further down this script appear to rely on it.
reload(sys)
sys.setdefaultencoding("utf-8")
# Prompt for the text to clean up; `raw_input` is the Python 2 spelling of
# `input` and returns the line as typed.
name = raw_input("Enter your name: ") # Python 2.x
#ascii = name.decode('unicode_escape').encode('ascii','ignore')
#utf8 = name.decode('unicode_escape').encode('utf8','ignore')
def replace_chars(string):
    """Replace known high-code punctuation sequences with plain ASCII.

    Every key in the table is a two-character escape sequence (on Python 2,
    the two UTF-8 bytes of one non-ASCII code point) and every value is the
    ASCII text substituted for it.  Sequences that are not listed are left
    untouched.

    Args:
        string: The text to clean.  It must support ``.replace`` with the
            same string type as the table's literals.

    Returns:
        A copy of ``string`` with every listed sequence replaced.
    """
    chars = {
        '\xc2\x82': ',',      # High code comma
        '\xc2\x84': ',,',     # High code double comma
        '\xc2\x85': '...',    # Triple dot
        '\xc2\x88': '^',      # High caret
        '\xc2\x91': '\'',     # Forward single quote
        '\xc2\x92': '\'',     # Reverse single quote
        '\xc2\x93': '"',      # Forward double quote
        '\xc2\x94': '"',      # Reverse double quote
        '\xc2\x95': ' ',      # Bullet, flattened to a space
        '\xc2\x96': '-',      # High hyphen
        '\xc2\x97': '--',     # Double hyphen
        '\xc2\x99': ' ',      # Trade mark sign, flattened to a space
        '\xc2\xa0': ' ',      # Non-breaking space
        '\xc2\xa6': '|',      # Split vertical bar
        '\xc2\xab': '<<',     # Double less than
        '\xc2\xbb': '>>',     # Double greater than
        '\xc2\xbc': '1/4',    # One quarter
        '\xc2\xbd': '1/2',    # One half
        '\xc2\xbe': '3/4',    # Three quarters
        '\xca\xbf': '\x27',   # c-single quote
        '\xcc\xa8': '',       # Modifier - under curve (dropped)
        '\xcc\xb1': '',       # Modifier - under line (dropped)
    }
    # .items() exists on both Python 2 and Python 3, whereas .iteritems() is
    # Python 2 only.  Replacement order does not matter: no key is a
    # substring of another and every replacement value is pure ASCII, so a
    # substitution can never create a new match.
    for sequence, replacement in chars.items():
        string = string.replace(sequence, replacement)
    return string
def replace_chars1(text):
    """Repair every POSSIBLE_UTF8_SEQUENCE match found in ``text``.

    The first match's capture group 1 is encoded as Latin-1 and decoded
    again as UTF-8, the result is spliced in place of the whole match, and
    the search restarts on the updated text until nothing matches.

    NOTE(review): POSSIBLE_UTF8_SEQUENCE is not defined in the visible
    source; from its use here it must offer ``.search`` and yield matches
    with at least one capture group -- confirm where it comes from.

    Returns:
        ``text`` once no further match is found.
    """
    found = POSSIBLE_UTF8_SEQUENCE.search(text)
    while found:
        repaired = found.group(1).encode('latin-1').decode('utf-8')
        head = text[:found.start()]
        tail = text[found.end():]
        text = head + repaired + tail
        found = POSSIBLE_UTF8_SEQUENCE.search(text)
    return text
import re

# Maps each character in the range U+0080-U+009F (what a Windows-1252 byte
# becomes when it is read as ISO-8859-1) to the character Windows-1252
# actually assigns to that byte.  Code points absent from the table are not
# remapped by kill_gremlins().
cp1252 = {
    # from http://www.microsoft.com/typography/unicode/1252.htm
    u"\x80": u"\u20AC",  # EURO SIGN
    u"\x82": u"\u201A",  # SINGLE LOW-9 QUOTATION MARK
    u"\x83": u"\u0192",  # LATIN SMALL LETTER F WITH HOOK
    u"\x84": u"\u201E",  # DOUBLE LOW-9 QUOTATION MARK
    u"\x85": u"\u2026",  # HORIZONTAL ELLIPSIS
    u"\x86": u"\u2020",  # DAGGER
    u"\x87": u"\u2021",  # DOUBLE DAGGER
    u"\x88": u"\u02C6",  # MODIFIER LETTER CIRCUMFLEX ACCENT
    u"\x89": u"\u2030",  # PER MILLE SIGN
    u"\x8A": u"\u0160",  # LATIN CAPITAL LETTER S WITH CARON
    u"\x8B": u"\u2039",  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    u"\x8C": u"\u0152",  # LATIN CAPITAL LIGATURE OE
    u"\x8E": u"\u017D",  # LATIN CAPITAL LETTER Z WITH CARON
    u"\x91": u"\u2018",  # LEFT SINGLE QUOTATION MARK
    u"\x92": u"\u2019",  # RIGHT SINGLE QUOTATION MARK
    u"\x93": u"\u201C",  # LEFT DOUBLE QUOTATION MARK
    u"\x94": u"\u201D",  # RIGHT DOUBLE QUOTATION MARK
    u"\x95": u"\u2022",  # BULLET
    u"\x96": u"\u2013",  # EN DASH
    u"\x97": u"\u2014",  # EM DASH
    u"\x98": u"\u02DC",  # SMALL TILDE
    u"\x99": u"\u2122",  # TRADE MARK SIGN
    u"\x9A": u"\u0161",  # LATIN SMALL LETTER S WITH CARON
    u"\x9B": u"\u203A",  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    u"\x9C": u"\u0153",  # LATIN SMALL LIGATURE OE
    u"\x9E": u"\u017E",  # LATIN SMALL LETTER Z WITH CARON
    u"\x9F": u"\u0178",  # LATIN CAPITAL LETTER Y WITH DIAERESIS
}

# Matches one character in the range U+0080-U+009F; compiled once because
# kill_gremlins() uses it for both the search and the substitution.
_GREMLIN_RE = re.compile(u"[\x80-\x9f]")


def kill_gremlins(text):
    """Map cp1252 gremlins in ``text`` to real unicode characters.

    Args:
        text: A unicode string or a byte string.  Byte strings are decoded
            as ISO-8859-1, so each byte becomes the code point of the same
            value.

    Returns:
        ``text`` itself, unchanged and of its original type, when it holds
        no character in U+0080-U+009F.  Otherwise a unicode string in which
        every such character listed in ``cp1252`` is replaced by its
        mapping; characters in that range without a mapping are kept.
    """
    # Decode before searching: Python 3 refuses to run a text pattern over
    # bytes.  ISO-8859-1 maps bytes one-to-one onto code points, so the
    # search finds exactly what a byte-level search would.
    if isinstance(text, bytes):
        candidate = text.decode("iso-8859-1")
    else:
        candidate = text

    if not _GREMLIN_RE.search(candidate):
        # Nothing to fix: hand back the caller's object, not the decoded copy.
        return text

    def fixup(m):
        s = m.group(0)
        return cp1252.get(s, s)

    return _GREMLIN_RE.sub(fixup, candidate)
# Reduce `name` to ASCII in four different ways and print the results side
# by side for comparison.
# NOTE(review): `.decode()` on the value returned by raw_input() exists only
# on Python 2 byte strings, so this section is Python 2 specific.
# A: interpret the input with the 'unicode_escape' codec, then encode to
#    ASCII with the 'replace' error handler.
a = name.decode('unicode_escape').encode('ascii', 'replace')
# B: decode the input as UTF-8, then encode to ASCII the same way.
b = name.decode('utf-8').encode('ascii', 'replace')
# C: encode directly with no explicit decode step.
#    NOTE(review): presumably relies on Python 2's implicit decode using the
#    default encoding configured earlier in this script -- confirm.
c = name.encode("ascii", "replace")
# D: table-driven substitution via replace_chars().
d = replace_chars(name)
print('Original: %s\nA: %s\nB: %s\nC: %s\nD: %s' % (name, a, b, c, d))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment