# jhyland87/thingy.py

## thingy.py
import fileinput
import sys
# Python 2 only: sys.setdefaultencoding is removed from the sys namespace at
# interpreter start-up, so the module is reloaded to get it back.  The
# interpreter-wide default codec used for implicit str <-> unicode
# conversions is then switched to UTF-8.
reload(sys)
sys.setdefaultencoding("utf-8")

# Read one line from stdin; under Python 2 raw_input returns it as a byte
# string, without the trailing newline.
name = raw_input("Enter your name: ")   # Python 2.x

#ascii = name.decode('unicode_escape').encode('ascii','ignore')
#utf8 = name.decode('unicode_escape').encode('utf8','ignore')


def replace_chars(string):
	"""Swap a fixed set of two-character "high code" sequences for ASCII.

	Every key in the table is the two-byte UTF-8 encoding of a single code
	point, written as a plain (non-``u``) string literal.  Under Python 2
	those literals are byte strings, so the function is meant for UTF-8
	encoded input; under Python 3 they are two-character text strings.
	Anything that is not listed in the table is left untouched.

	Args:
		string: the text to clean up.

	Returns:
		``string`` with every listed sequence replaced by its ASCII
		stand-in.  An empty or already clean string comes back unchanged.
	"""
	chars = {
		'\xc2\x82' : ',',        # High code comma
		'\xc2\x84' : ',,',       # High code double comma
		'\xc2\x85' : '...',      # Triple dot
		'\xc2\x88' : '^',        # High caret
		'\xc2\x91' : '\'',     # Forward single quote
		'\xc2\x92' : '\'',     # Reverse single quote
		'\xc2\x93' : '"',     # Forward double quote
		'\xc2\x94' : '"',     # Reverse double quote
		'\xc2\x95' : ' ',        # U+0095 (the bullet position in cp1252)
		'\xc2\x96' : '-',        # High hyphen
		'\xc2\x97' : '--',       # Double hyphen
		'\xc2\x99' : ' ',        # U+0099 (the trade mark position in cp1252)
		'\xc2\xa0' : ' ',        # U+00A0, no-break space
		'\xc2\xa6' : '|',        # Split vertical bar
		'\xc2\xab' : '<<',       # Double less than
		'\xc2\xbb' : '>>',       # Double greater than
		'\xc2\xbc' : '1/4',      # one quarter
		'\xc2\xbd' : '1/2',      # one half
		'\xc2\xbe' : '3/4',      # three quarters
		'\xca\xbf' : '\x27',     # c-single quote
		'\xcc\xa8' : '',         # modifier - under curve
		'\xcc\xb1' : ''          # modifier - under line
	}
	# items() instead of the Python 2-only iteritems(), so the helper also
	# runs on Python 3.  The order of the replacements does not matter: every
	# replacement is pure ASCII and therefore can never form another key.
	for k, v in chars.items():
		string = string.replace(k, v)
	return string

import re

# A run of Latin-1 characters whose code points have the shape of one UTF-8
# multi-byte sequence: a lead byte followed by one, two or three continuation
# bytes.  The whole run is captured as group 1, which replace_chars1 reads.
POSSIBLE_UTF8_SEQUENCE = re.compile(
	u'([\xc2-\xdf][\x80-\xbf]'
	u'|[\xe0-\xef][\x80-\xbf]{2}'
	u'|[\xf0-\xf4][\x80-\xbf]{3})'
)


def replace_chars1(text):
	"""Repair UTF-8 sequences that were decoded as Latin-1 (mojibake).

	Each run that matches POSSIBLE_UTF8_SEQUENCE is turned back into bytes
	with Latin-1 and decoded as UTF-8, so ``u'caf\\xc3\\xa9'`` becomes
	``u'caf\\xe9'``.  The scan restarts after every repair, so text that was
	mis-decoded more than once is unwound completely.

	Args:
		text: a text (unicode) string.

	Returns:
		``text`` with every repairable run replaced by the character it
		encodes.  Runs that have the right shape but are not valid UTF-8
		are kept as they are.
	"""
	start = 0
	while True:
		match = POSSIBLE_UTF8_SEQUENCE.search(text, start)
		if not match:
			return text
		try:
			fixed = match.group(1).encode('latin-1').decode('utf-8')
		except UnicodeDecodeError:
			# Right shape, but not decodable (e.g. an overlong form): keep the
			# characters and carry on just past the start of this candidate.
			start = match.start() + 1
			continue
		text = text[:match.start()] + fixed + text[match.end():]
		# Rescan from the beginning: the repaired character may itself be
		# part of a sequence that was encoded twice.
		start = 0

# Windows-1252 readings of the characters 0x80-0x9F, keyed by the character
# as it appears when the byte is taken at face value.  Character names follow
# http://www.microsoft.com/typography/unicode/1252.htm; the table has no
# entry for 0x81, 0x8D, 0x8F, 0x90 and 0x9D.
cp1252 = dict([
    (u"\x80", u"\u20AC"),  # EURO SIGN
    (u"\x82", u"\u201A"),  # SINGLE LOW-9 QUOTATION MARK
    (u"\x83", u"\u0192"),  # LATIN SMALL LETTER F WITH HOOK
    (u"\x84", u"\u201E"),  # DOUBLE LOW-9 QUOTATION MARK
    (u"\x85", u"\u2026"),  # HORIZONTAL ELLIPSIS
    (u"\x86", u"\u2020"),  # DAGGER
    (u"\x87", u"\u2021"),  # DOUBLE DAGGER
    (u"\x88", u"\u02C6"),  # MODIFIER LETTER CIRCUMFLEX ACCENT
    (u"\x89", u"\u2030"),  # PER MILLE SIGN
    (u"\x8A", u"\u0160"),  # LATIN CAPITAL LETTER S WITH CARON
    (u"\x8B", u"\u2039"),  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    (u"\x8C", u"\u0152"),  # LATIN CAPITAL LIGATURE OE
    (u"\x8E", u"\u017D"),  # LATIN CAPITAL LETTER Z WITH CARON
    (u"\x91", u"\u2018"),  # LEFT SINGLE QUOTATION MARK
    (u"\x92", u"\u2019"),  # RIGHT SINGLE QUOTATION MARK
    (u"\x93", u"\u201C"),  # LEFT DOUBLE QUOTATION MARK
    (u"\x94", u"\u201D"),  # RIGHT DOUBLE QUOTATION MARK
    (u"\x95", u"\u2022"),  # BULLET
    (u"\x96", u"\u2013"),  # EN DASH
    (u"\x97", u"\u2014"),  # EM DASH
    (u"\x98", u"\u02DC"),  # SMALL TILDE
    (u"\x99", u"\u2122"),  # TRADE MARK SIGN
    (u"\x9A", u"\u0161"),  # LATIN SMALL LETTER S WITH CARON
    (u"\x9B", u"\u203A"),  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    (u"\x9C", u"\u0153"),  # LATIN SMALL LIGATURE OE
    (u"\x9E", u"\u017E"),  # LATIN SMALL LETTER Z WITH CARON
    (u"\x9F", u"\u0178"),  # LATIN CAPITAL LETTER Y WITH DIAERESIS
])

import re

# Characters in the 0x80-0x9F range, where text that was really cp1252 ends
# up when it has been read as ISO-8859-1.
_GREMLINS = re.compile(u"[\x80-\x9f]")


def kill_gremlins(text):
    """Replace 0x80-0x9F characters with the cp1252 characters they stand for.

    Each character in that range is looked up in the module-level ``cp1252``
    table and replaced by the real Unicode character (curly quotes, dashes,
    the euro sign, ...).

    Args:
        text: a text string, or a byte string, which is read as ISO-8859-1.

    Returns:
        ``text`` unchanged when it holds no character in that range (a byte
        string is then returned as is, not decoded); otherwise a text string
        with every such character replaced.  Characters that have no entry
        in ``cp1252`` are left as they are.
    """
    if isinstance(text, bytes):
        # bytes is an alias of str on Python 2, so this is the same check the
        # Python 2-only unicode(text, "iso-8859-1") version made, and it also
        # works on Python 3, where the name unicode does not exist.
        decoded = text.decode("iso-8859-1")
        if not _GREMLINS.search(decoded):
            return text
        text = decoded
    return _GREMLINS.sub(lambda m: cp1252.get(m.group(0), m.group(0)), text)

# Reduce the entered name to ASCII in four different ways and print the
# results side by side.  With the 'replace' error handler, encode() puts '?'
# in place of every character that ASCII cannot represent.

# A: interpret backslash escapes in the input, then encode to ASCII.
a = name.decode('unicode_escape').encode('ascii', 'replace')
# B: decode the input as UTF-8, then encode to ASCII.
b = name.decode('utf-8').encode('ascii', 'replace')
# C: encode directly; a Python 2 byte string is first decoded implicitly with
# the default encoding, which this script sets to UTF-8.
c = name.encode("ascii", "replace")
# D: substitute ASCII stand-ins with replace_chars instead of using a codec.
d = replace_chars(name)

print('Original: %s\nA: %s\nB: %s\nC: %s\nD: %s' % (name, a, b, c, d))
	import fileinput
	import sys
	# NOTE(review): from the two imports just above to the end of the file,
	# this tab-indented section repeats the script at the top of the file -
	# confirm which copy is meant to be kept.

	# Python 2 only: sys.setdefaultencoding is removed from the sys namespace
	# at interpreter start-up, so the module is reloaded to get it back.  The
	# interpreter-wide default codec used for implicit str <-> unicode
	# conversions is then switched to UTF-8.
	reload(sys)
	sys.setdefaultencoding("utf-8")

	# Read one line from stdin; under Python 2 raw_input returns it as a byte
	# string, without the trailing newline.
	name = raw_input("Enter your name: ") # Python 2.x

	#ascii = name.decode('unicode_escape').encode('ascii','ignore')
	#utf8 = name.decode('unicode_escape').encode('utf8','ignore')


	def replace_chars(string):
		"""Swap a fixed set of two-character "high code" sequences for ASCII.

		Every key in the table is the two-byte UTF-8 encoding of a single
		code point, written as a plain (non-``u``) string literal.  Under
		Python 2 those literals are byte strings, so the function is meant
		for UTF-8 encoded input; under Python 3 they are two-character text
		strings.  Anything that is not listed in the table is left untouched.

		Args:
			string: the text to clean up.

		Returns:
			``string`` with every listed sequence replaced by its ASCII
			stand-in.  An empty or already clean string comes back unchanged.
		"""
		chars = {
			'\xc2\x82' : ',', # High code comma
			'\xc2\x84' : ',,', # High code double comma
			'\xc2\x85' : '...', # Triple dot
			'\xc2\x88' : '^', # High caret
			'\xc2\x91' : '\'', # Forward single quote
			'\xc2\x92' : '\'', # Reverse single quote
			'\xc2\x93' : '"', # Forward double quote
			'\xc2\x94' : '"', # Reverse double quote
			'\xc2\x95' : ' ', # U+0095 (the bullet position in cp1252)
			'\xc2\x96' : '-', # High hyphen
			'\xc2\x97' : '--', # Double hyphen
			'\xc2\x99' : ' ', # U+0099 (the trade mark position in cp1252)
			'\xc2\xa0' : ' ', # U+00A0, no-break space
			# A bare pipe: '\|' is backslash + pipe, not an escaped pipe.
			'\xc2\xa6' : '|', # Split vertical bar
			'\xc2\xab' : '<<', # Double less than
			'\xc2\xbb' : '>>', # Double greater than
			'\xc2\xbc' : '1/4', # one quarter
			'\xc2\xbd' : '1/2', # one half
			'\xc2\xbe' : '3/4', # three quarters
			'\xca\xbf' : '\x27', # c-single quote
			'\xcc\xa8' : '', # modifier - under curve
			'\xcc\xb1' : '' # modifier - under line
		}
		# items() instead of the Python 2-only iteritems(), so the helper
		# also runs on Python 3.  The order of the replacements does not
		# matter: every replacement is pure ASCII and therefore can never
		# form another key.
		for k, v in chars.items():
			string = string.replace(k, v)
		return string

	import re

	# A run of Latin-1 characters whose code points have the shape of one
	# UTF-8 multi-byte sequence: a lead byte followed by one, two or three
	# continuation bytes.  The whole run is captured as group 1, which
	# replace_chars1 reads.
	POSSIBLE_UTF8_SEQUENCE = re.compile(
		u'([\xc2-\xdf][\x80-\xbf]'
		u'|[\xe0-\xef][\x80-\xbf]{2}'
		u'|[\xf0-\xf4][\x80-\xbf]{3})'
	)


	def replace_chars1(text):
		"""Repair UTF-8 sequences that were decoded as Latin-1 (mojibake).

		Each run that matches POSSIBLE_UTF8_SEQUENCE is turned back into
		bytes with Latin-1 and decoded as UTF-8, so ``u'caf\\xc3\\xa9'``
		becomes ``u'caf\\xe9'``.  The scan restarts after every repair, so
		text that was mis-decoded more than once is unwound completely.

		Args:
			text: a text (unicode) string.

		Returns:
			``text`` with every repairable run replaced by the character it
			encodes.  Runs that have the right shape but are not valid UTF-8
			are kept as they are.
		"""
		start = 0
		while True:
			match = POSSIBLE_UTF8_SEQUENCE.search(text, start)
			if not match:
				return text
			try:
				fixed = match.group(1).encode('latin-1').decode('utf-8')
			except UnicodeDecodeError:
				# Right shape, but not decodable (e.g. an overlong form):
				# keep the characters and carry on just past the start of
				# this candidate.
				start = match.start() + 1
				continue
			text = text[:match.start()] + fixed + text[match.end():]
			# Rescan from the beginning: the repaired character may itself
			# be part of a sequence that was encoded twice.
			start = 0

	# Windows-1252 readings of the characters 0x80-0x9F, keyed by the
	# character as it appears when the byte is taken at face value.  Character
	# names follow http://www.microsoft.com/typography/unicode/1252.htm; the
	# table has no entry for 0x81, 0x8D, 0x8F, 0x90 and 0x9D.
	cp1252 = dict([
		(u"\x80", u"\u20AC"),  # EURO SIGN
		(u"\x82", u"\u201A"),  # SINGLE LOW-9 QUOTATION MARK
		(u"\x83", u"\u0192"),  # LATIN SMALL LETTER F WITH HOOK
		(u"\x84", u"\u201E"),  # DOUBLE LOW-9 QUOTATION MARK
		(u"\x85", u"\u2026"),  # HORIZONTAL ELLIPSIS
		(u"\x86", u"\u2020"),  # DAGGER
		(u"\x87", u"\u2021"),  # DOUBLE DAGGER
		(u"\x88", u"\u02C6"),  # MODIFIER LETTER CIRCUMFLEX ACCENT
		(u"\x89", u"\u2030"),  # PER MILLE SIGN
		(u"\x8A", u"\u0160"),  # LATIN CAPITAL LETTER S WITH CARON
		(u"\x8B", u"\u2039"),  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
		(u"\x8C", u"\u0152"),  # LATIN CAPITAL LIGATURE OE
		(u"\x8E", u"\u017D"),  # LATIN CAPITAL LETTER Z WITH CARON
		(u"\x91", u"\u2018"),  # LEFT SINGLE QUOTATION MARK
		(u"\x92", u"\u2019"),  # RIGHT SINGLE QUOTATION MARK
		(u"\x93", u"\u201C"),  # LEFT DOUBLE QUOTATION MARK
		(u"\x94", u"\u201D"),  # RIGHT DOUBLE QUOTATION MARK
		(u"\x95", u"\u2022"),  # BULLET
		(u"\x96", u"\u2013"),  # EN DASH
		(u"\x97", u"\u2014"),  # EM DASH
		(u"\x98", u"\u02DC"),  # SMALL TILDE
		(u"\x99", u"\u2122"),  # TRADE MARK SIGN
		(u"\x9A", u"\u0161"),  # LATIN SMALL LETTER S WITH CARON
		(u"\x9B", u"\u203A"),  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
		(u"\x9C", u"\u0153"),  # LATIN SMALL LIGATURE OE
		(u"\x9E", u"\u017E"),  # LATIN SMALL LETTER Z WITH CARON
		(u"\x9F", u"\u0178"),  # LATIN CAPITAL LETTER Y WITH DIAERESIS
	])

	import re

	# Characters in the 0x80-0x9F range, where text that was really cp1252
	# ends up when it has been read as ISO-8859-1.
	_GREMLINS = re.compile(u"[\x80-\x9f]")


	def kill_gremlins(text):
		"""Replace 0x80-0x9F characters with the cp1252 characters they stand for.

		Each character in that range is looked up in the module-level
		``cp1252`` table and replaced by the real Unicode character (curly
		quotes, dashes, the euro sign, ...).

		Args:
			text: a text string, or a byte string, which is read as
				ISO-8859-1.

		Returns:
			``text`` unchanged when it holds no character in that range (a
			byte string is then returned as is, not decoded); otherwise a
			text string with every such character replaced.  Characters that
			have no entry in ``cp1252`` are left as they are.
		"""
		if isinstance(text, bytes):
			# bytes is an alias of str on Python 2, so this is the same check
			# the Python 2-only unicode(text, "iso-8859-1") version made, and
			# it also works on Python 3, where the name unicode does not
			# exist.
			decoded = text.decode("iso-8859-1")
			if not _GREMLINS.search(decoded):
				return text
			text = decoded
		return _GREMLINS.sub(lambda m: cp1252.get(m.group(0), m.group(0)), text)

	# Reduce the entered name to ASCII in four different ways and print the
	# results side by side.  With the 'replace' error handler, encode() puts
	# '?' in place of every character that ASCII cannot represent.

	# A: interpret backslash escapes in the input, then encode to ASCII.
	a = name.decode('unicode_escape').encode('ascii', 'replace')
	# B: decode the input as UTF-8, then encode to ASCII.
	b = name.decode('utf-8').encode('ascii', 'replace')
	# C: encode directly; a Python 2 byte string is first decoded implicitly
	# with the default encoding, which this script sets to UTF-8.
	c = name.encode("ascii", "replace")
	# D: substitute ASCII stand-ins with replace_chars instead of using a codec.
	d = replace_chars(name)

	print('Original: %s\nA: %s\nB: %s\nC: %s\nD: %s' % (name, a, b, c, d))