Skip to content

Instantly share code, notes, and snippets.

@AvinashDalvi89
Created January 16, 2020 09:52
Show Gist options
  • Save AvinashDalvi89/ca1e50f1cb1a32f7544f2f0af1fb928d to your computer and use it in GitHub Desktop.
Save AvinashDalvi89/ca1e50f1cb1a32f7544f2f0af1fb928d to your computer and use it in GitHub Desktop.
This is to remove non-ASCII characters from a string
import re
# Sample input: ASCII text with three non-ASCII characters
# (U+0089, U+00DB, U+00AA) between "Apple" and "s".
ini_string = "'technews One lone dude awaits iPad 2 at Apple\x89Ûªs SXSW store"

# Approach 1: split on every run of characters outside [A-Za-z0-9] and rejoin
# the pieces with single spaces. Punctuation and whitespace are dropped too,
# and a leading separator (the apostrophe here) yields a leading space.
res1 = " ".join(re.split("[^A-Za-z0-9]+", ini_string))
print(res1)

# Detection: look for any character that is neither tab/CR/LF nor printable
# ASCII (0x20-0x7E). re.search scans the whole string; re.match would only
# test the very first character and miss offenders further in.
has_non_ascii = re.search("[^\t\r\n\x20-\x7E]+", ini_string) is not None
if has_non_ascii:
    print("found")

# Approach 2: encode to UTF-8, then decode as ASCII with errors='replace'.
# Every non-ASCII *byte* becomes U+FFFD, which is then swapped for a backtick,
# so each non-ASCII character yields one backtick per UTF-8 byte.
result = ini_string.encode().decode('ascii', 'replace').replace(u'\ufffd', '`')

# Approach 3: replace the exact offending sequence with a single backtick and
# encode the result to UTF-8 bytes (str -> bytes -> str round trip removed:
# it was a no-op).
result2 = ini_string.replace(u"\x89Ûª", "`").encode("utf-8")
print(result2)
def replace(m):
    r"""Decode the ``\uXXXX`` escape(s) captured by a regex match.

    Intended as the callback for ``re.sub``. All capture groups of *m* are
    expected to hold 4-digit hexadecimal UTF-16 code units; groups that did
    not participate in the match are treated as empty strings.

    Args:
        m: A match object whose groups contain hex digits only.

    Returns:
        The decoded text: one character for a single code unit, or one
        astral character for a high/low surrogate pair.

    Raises:
        ValueError: If the concatenated groups are not valid hex.
        UnicodeDecodeError: If the code units are not valid UTF-16
            (for example a lone surrogate).
    """
    hex_digits = ''.join(m.groups(''))
    # The escapes spell UTF-16 code units, big-endian when written as hex.
    # 'utf-8-be' is not a codec and raised LookupError on every call.
    return bytes.fromhex(hex_digits).decode('utf-16-be')
# Pattern for a literal backslash-u escape that is not itself preceded by a
# backslash. The first alternative captures a high/low surrogate pair as two
# groups; the second captures a single 4-hex-digit code unit.
unicode_escape = re.compile(
    r'(?<!\\)'
    r'(?:\\u([dD][89abAB][a-fA-F0-9]{2})\\u([dD][c-fC-F][a-fA-F0-9]{2})'
    r'|\\u([a-fA-F0-9]{4}))'
)

# Substitute every matched escape using the `replace` callback.
result1 = unicode_escape.sub(replace, ini_string)
print(result1)

# Encode to ASCII, silently dropping anything that cannot be represented.
# The result is a bytes object, so it prints with a b'...' prefix.
new_string = ini_string.encode('ascii', errors='ignore')
print(new_string)

# Print each character whose code point is above 126, one per line.
for x in ini_string:
    if ord(x) <= 126:
        continue
    print(x)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment