Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
This removes non-ASCII characters from a string
import re
# Sample text containing a run of three non-ASCII characters
# (U+0089, U+00DB, U+00AA) between "Apple" and "s".
ini_string = "'technews One lone dude awaits iPad 2 at Apple\x89Ûªs SXSW store"

# Approach 1: split on every run of characters outside A-Z, a-z and 0-9 and
# re-join the pieces with single spaces. This also drops ASCII punctuation,
# and a leading separator leaves a leading space in the result.
res1 = " ".join(re.split("[^A-Za-z0-9]+", ini_string))
print(res1)

# Detect any character that is not tab, CR, LF or printable ASCII.
# re.search scans the whole string; re.match would only test position 0 and
# therefore miss a non-ASCII run that appears later in the text.
non_ascii_match = re.search("[^\t\r\n\x20-\x7E]+", ini_string)
if non_ascii_match:
    print("found")

# Approach 2: encode to UTF-8, decode as ASCII replacing each undecodable
# byte with U+FFFD, then turn every U+FFFD into a backtick. A multi-byte
# character therefore yields one backtick per UTF-8 byte.
result = ini_string.encode().decode('ascii', 'replace').replace(u'\ufffd', '`')

# Approach 3: replace the exact known garbage sequence with a backtick.
# The encode/decode round trip leaves the text unchanged; the final encode
# makes result2 a bytes object, so it prints with a b'...' prefix.
result2 = ini_string.encode().decode("utf-8").replace(u"\x89Ûª", "`").encode("utf-8")
print(result2)
def replace(m):
    r"""Return the character encoded by the hex digits captured in match *m*.

    Intended as the replacement callback for ``re.sub`` with a pattern whose
    groups capture the four hex digits of ``\uXXXX`` escapes (either one
    escape, or a high/low surrogate pair in two groups).

    Args:
        m: A regex match object whose groups hold hex-digit strings; groups
            that did not participate are treated as empty strings.

    Returns:
        The decoded text as a ``str``.

    Raises:
        ValueError: If the captured text is not valid hexadecimal.
        UnicodeDecodeError: If the resulting bytes are not valid UTF-16,
            e.g. an unpaired surrogate.
    """
    # The captured digits are UTF-16 code units in big-endian order, so they
    # must be decoded as 'utf-16-be' ('utf-8-be' is not a codec and raises
    # LookupError). Decoding both halves of a surrogate pair together yields
    # the single astral character they represent.
    return bytes.fromhex(''.join(m.groups(''))).decode('utf-16-be')
# Matches a literal "\uXXXX" escape that is not itself preceded by a
# backslash: either a high/low surrogate pair (groups 1 and 2) or a single
# four-hex-digit escape (group 3).
unicode_escape = re.compile(
    r'(?<!\\)'
    r'(?:\\u([dD][89abAB][a-fA-F0-9]{2})\\u([dD][c-fC-F][a-fA-F0-9]{2})'
    r'|\\u([a-fA-F0-9]{4}))')

# Approach 4: expand any literal \uXXXX escapes in the text via replace().
result1 = unicode_escape.sub(replace, ini_string)
print(result1)

# Approach 5: encode to ASCII and silently drop every character that cannot
# be encoded. The result is a bytes object, so it prints with a b'...' prefix.
new_string = ini_string.encode('ascii', errors='ignore')
print(new_string)

# Print each character whose code point is above 126, i.e. DEL (127) and
# everything outside the ASCII range.
for x in ini_string:
    if ord(x) > 126:
        print(x)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment