Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Function to replace some annoying characters
def unicodetoascii(text):
    """Replace escaped UTF-8 punctuation sequences in *text* with ASCII.

    The patterns are the *literal* characters backslash, ``x``, hex digits
    (for example the 12-character sequence ``\\xe2\\x80\\x99``), as they
    appear when the repr of a byte string has been turned into text. They
    are not the decoded Unicode characters themselves.

    Args:
        text: String that may contain escaped UTF-8 byte sequences.

    Returns:
        A copy of *text* with every known sequence replaced by its ASCII
        stand-in; unknown sequences are left untouched.
    """
    replacements = (
        # Accented letter.
        ('\\xc3\\xa9', 'e'),        # U+00E9 e with acute
        # Hyphens and dashes (U+2010 .. U+2014).
        ('\\xe2\\x80\\x90', '-'),
        ('\\xe2\\x80\\x91', '-'),
        ('\\xe2\\x80\\x92', '-'),
        ('\\xe2\\x80\\x93', '-'),
        ('\\xe2\\x80\\x94', '-'),
        # Single quotation marks (U+2018, U+2019, U+201B).
        ('\\xe2\\x80\\x98', "'"),
        ('\\xe2\\x80\\x99', "'"),
        ('\\xe2\\x80\\x9b', "'"),
        # Double quotation marks (U+201C .. U+201F).
        ('\\xe2\\x80\\x9c', '"'),
        ('\\xe2\\x80\\x9d', '"'),
        ('\\xe2\\x80\\x9e', '"'),
        ('\\xe2\\x80\\x9f', '"'),
        # Horizontal ellipsis (U+2026).
        ('\\xe2\\x80\\xa6', '...'),
        # Primes and reversed primes (U+2032 .. U+2037).
        ('\\xe2\\x80\\xb2', "'"),
        ('\\xe2\\x80\\xb3', "'"),
        ('\\xe2\\x80\\xb4', "'"),
        ('\\xe2\\x80\\xb5', "'"),
        ('\\xe2\\x80\\xb6', "'"),
        ('\\xe2\\x80\\xb7', "'"),
        # Superscript operators and parentheses (U+207A .. U+207E).
        ('\\xe2\\x81\\xba', '+'),
        ('\\xe2\\x81\\xbb', '-'),
        ('\\xe2\\x81\\xbc', '='),
        ('\\xe2\\x81\\xbd', '('),
        ('\\xe2\\x81\\xbe', ')'),
    )
    # No replacement value contains a backslash, so one substitution can
    # never create a new match and the order of the table does not matter.
    for escaped, ascii_equivalent in replacements:
        text = text.replace(escaped, ascii_equivalent)
    return text
@ryananguiano

This comment has been minimized.

Copy link

ryananguiano commented Apr 5, 2017

This worked for me:

# Pairs of (mojibake, ASCII stand-in). Each first element is what the UTF-8
# encoding of a punctuation character looks like after the raw bytes have
# been decoded as ISO-8859-1 (one character per byte).
LATIN_1_CHARS = (
    ('\xc3\xa9', 'e'),        # U+00E9 e with acute
    # Hyphens and dashes (U+2010 .. U+2014).
    ('\xe2\x80\x90', '-'),
    ('\xe2\x80\x91', '-'),
    ('\xe2\x80\x92', '-'),
    ('\xe2\x80\x93', '-'),
    ('\xe2\x80\x94', '-'),
    # Single quotation marks (U+2018, U+2019, U+201B).
    ('\xe2\x80\x98', "'"),
    ('\xe2\x80\x99', "'"),
    ('\xe2\x80\x9b', "'"),
    # Double quotation marks (U+201C .. U+201F).
    ('\xe2\x80\x9c', '"'),
    ('\xe2\x80\x9d', '"'),
    ('\xe2\x80\x9e', '"'),
    ('\xe2\x80\x9f', '"'),
    # Horizontal ellipsis (U+2026).
    ('\xe2\x80\xa6', '...'),
    # Primes and reversed primes (U+2032 .. U+2037).
    ('\xe2\x80\xb2', "'"),
    ('\xe2\x80\xb3', "'"),
    ('\xe2\x80\xb4', "'"),
    ('\xe2\x80\xb5', "'"),
    ('\xe2\x80\xb6', "'"),
    ('\xe2\x80\xb7', "'"),
    # Superscript operators and parentheses (U+207A .. U+207E).
    ('\xe2\x81\xba', '+'),
    ('\xe2\x81\xbb', '-'),
    ('\xe2\x81\xbc', '='),
    ('\xe2\x81\xbd', '('),
    ('\xe2\x81\xbe', ')'),
)


def clean_latin1(data):
    """Return *data* as UTF-8 bytes, swapping known punctuation for ASCII.

    Byte input is decoded as ISO-8859-1 (which cannot fail), every sequence
    listed in ``LATIN_1_CHARS`` is replaced by its ASCII stand-in, and the
    result is encoded as UTF-8. Bytes that are not covered by the table are
    carried through the ISO-8859-1 -> UTF-8 round trip unchanged in meaning
    but not in representation.

    Text input is encoded to UTF-8 without any replacement, matching what
    the earlier ``try: data.encode('utf-8')`` path returned.

    Args:
        data: ``bytes``/``bytearray`` to clean, or ``str`` to encode.

    Returns:
        UTF-8 encoded ``bytes``.
    """
    # The previous implementation waited for ``data.encode`` to raise
    # UnicodeDecodeError, which only happens via Python 2's implicit ASCII
    # decode. On Python 3 bytes have no ``encode`` at all, so dispatch on
    # the type explicitly instead.
    if not isinstance(data, (bytes, bytearray)):
        return data.encode('utf-8')

    text = data.decode('iso-8859-1')
    for mojibake, ascii_equivalent in LATIN_1_CHARS:
        text = text.replace(mojibake, ascii_equivalent)
    return text.encode('utf8')

@soundmasteraj

This comment has been minimized.

Copy link

soundmasteraj commented Apr 15, 2018

Thanks for sharing this

@vaibhavhrt

This comment has been minimized.

Copy link

vaibhavhrt commented Dec 5, 2018

For me, \xe2\x80\x8b (the UTF-8 encoding of U+200B, the zero-width space) is causing problems; you might want to add it to your list as well.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.