This is to remove non-ASCII characters from a string
import re
# Sample text containing three non-ASCII characters (U+0089, U+00DB, U+00AA)
# in place of what looks like a mis-decoded apostrophe.
ini_string = "'technews One lone dude awaits iPad 2 at Apple\x89Ûªs SXSW store"

# Approach 1: keep only ASCII letters and digits. Every run of any other
# character (spaces and punctuation included) collapses to a single space.
res1 = " ".join(re.split("[^A-Za-z0-9]+", ini_string))
print(res1)

# Detect any character outside tab/CR/LF and printable ASCII (0x20-0x7E).
# re.search scans the whole string; re.match would only test position 0,
# which here is an ASCII apostrophe, so the check could never succeed.
non_ascii_match = re.search("[^\t\r\n\x20-\x7E]+", ini_string)
if non_ascii_match:
    print("found")

# Approach 2: round-trip through UTF-8 bytes and decode as ASCII with
# 'replace'. Each undecodable *byte* becomes U+FFFD, which is then swapped
# for a backtick, so a two-byte character yields two backticks.
result = ini_string.encode().decode('ascii', 'replace').replace('\ufffd', '`')

# Approach 3: replace the known offending sequence with a single backtick.
# The value is UTF-8 encoded bytes, so print() shows a b'...' repr.
result2 = ini_string.replace("\x89Ûª", "`").encode("utf-8")
print(result2)
def replace(m: "re.Match[str]") -> str:
    """Convert a matched ``\\uXXXX`` escape (or surrogate pair) to its character.

    Intended as the replacement callback for a pattern whose capture groups
    each hold four hexadecimal digits of one UTF-16 code unit.

    Args:
        m: Regex match whose groups contain the hex digits. Groups that did
            not participate in the match are treated as empty strings.

    Returns:
        The decoded text for the matched escape sequence.

    Raises:
        ValueError: If the joined groups are not valid hexadecimal.
        UnicodeDecodeError: If the code units are not valid UTF-16, for
            example a lone surrogate.
    """
    hex_digits = ''.join(m.groups(''))
    # The digits are UTF-16 code units in big-endian order; decoding as
    # UTF-16-BE also merges a high/low surrogate pair into one code point.
    # ('utf-8-be' is not a codec and raised LookupError.)
    return bytes.fromhex(hex_digits).decode('utf-16-be')
# Pattern for a literal ``\uXXXX`` escape that is not itself preceded by a
# backslash: either two consecutive escapes forming a UTF-16 surrogate pair
# (groups 1 and 2) or a single four-hex-digit escape (group 3).
unicode_escape = re.compile(
    r'(?<!\\)'
    r'(?:\\u([dD][89abAB][a-fA-F0-9]{2})\\u([dD][c-fC-F][a-fA-F0-9]{2})'
    r'|\\u([a-fA-F0-9]{4}))'
)

# Rewrite every matched escape sequence through the ``replace`` callback.
result1 = re.sub(unicode_escape, replace, ini_string)
print(result1)

# Encode to ASCII and silently drop whatever cannot be represented; the
# value is a bytes object, so it prints as b'...'.
new_string = ini_string.encode('ascii', 'ignore')
print(new_string)

# Show, one per line, each character whose code point is above 126.
for high_char in (ch for ch in ini_string if ord(ch) > 126):
    print(high_char)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment