Created
January 16, 2020 09:52
-
-
Save AvinashDalvi89/ca1e50f1cb1a32f7544f2f0af1fb928d to your computer and use it in GitHub Desktop.
This is to remove non-ASCII characters from a string
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

# Sample text with three non-ASCII characters (U+0089, "Û", "ª") sitting
# between "Apple" and "s".
ini_string = "'technews One lone dude awaits iPad 2 at Apple\x89Ûªs SXSW store"

# Approach 1: split on every run of characters outside [A-Za-z0-9] and re-join
# the pieces with single spaces. Punctuation is dropped as well, and because
# the text starts with an apostrophe the first piece is empty, so res1 begins
# with a space.
res1 = " ".join(re.split("[^A-Za-z0-9]+", ini_string))
print(res1)

# Detect whether the text contains anything other than tab, CR, LF or
# printable ASCII. re.search scans the whole string; re.match only tests
# position 0 and therefore could never see characters in the middle of the text.
has_non_ascii = re.search("[^\t\r\n\x20-\x7E]+", ini_string) is not None
if has_non_ascii:
    print("found")

# Approach 2: encode to UTF-8, decode as ASCII with the 'replace' handler so
# each undecodable byte becomes U+FFFD, then turn every U+FFFD into a backtick.
# Each of the sample's non-ASCII characters is two UTF-8 bytes, so one
# character yields two backticks.
result = ini_string.encode().decode('ascii', 'replace').replace(u'\ufffd', '`')

# Approach 3: replace the known three-character sequence with one backtick and
# return UTF-8 bytes. (Encoding to UTF-8 and immediately decoding again does
# not change a str, so that round trip is omitted.)
result2 = ini_string.replace(u"\x89Ûª", "`").encode("utf-8")
print(result2)
def replace(m):
    """Return the text encoded by the hex digits captured in match *m*.

    Intended as an ``re.sub`` callback for a pattern whose groups capture the
    four hex digits of literal ``\\uXXXX`` escapes. Groups that did not
    participate in the match contribute an empty string.

    Args:
        m: A match object whose groups are strings of hexadecimal digits.

    Returns:
        The decoded string. A high/low surrogate pair captured as two groups
        decodes to a single character.

    Raises:
        ValueError: If the joined groups are not valid hexadecimal.
        UnicodeDecodeError: If the bytes are not valid UTF-16, for example a
            lone surrogate.
    """
    # The captured digits are big-endian UTF-16 code units, so the codec must
    # be 'utf-16-be'; 'utf-8-be' is not a registered codec name.
    return bytes.fromhex(''.join(m.groups(''))).decode('utf-16-be')
# Pattern for literal "\uXXXX" escape text that is not itself preceded by a
# backslash. Groups 1 and 2 capture a high/low surrogate pair written as two
# consecutive escapes; group 3 captures any other single four-hex-digit escape.
unicode_escape = re.compile(
    r'(?<!\\)'
    r'(?:\\u([dD][89abAB][a-fA-F0-9]{2})\\u([dD][c-fC-F][a-fA-F0-9]{2})'
    r'|\\u([a-fA-F0-9]{4}))'
)

# Substitute every matched escape in the sample text using the `replace`
# callback defined earlier in this file, then show the outcome.
result1 = re.sub(unicode_escape, replace, ini_string)
print(result1)

# Encode to ASCII, silently discarding anything that cannot be represented.
# The value is a bytes object, so print() shows its b'...' representation.
new_string = ini_string.encode("ascii", "ignore")
print(new_string)

# Print each character whose code point is above 126, one per line.
for x in ini_string:
    if ord(x) <= 126:
        continue
    print(x)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment