Created
February 24, 2017 14:34
-
-
Save vi3k6i5/52b4ed862deaed9005bf89b7b00f4c74 to your computer and use it in GitHub Desktop.
python code to clean text.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import string | |
# Translation table that maps every ASCII punctuation character
# (string.punctuation, which includes "_") to a single space.
text_translator = str.maketrans({ord(c): " " for c in string.punctuation})


def clean_text(text, remove_punctuation_all=False):
    """Reduce *text* to ASCII words separated by single spaces.

    Processing steps, in order:

    1. Every non-ASCII character (code point >= 128, which covers the
       non-breaking space U+00A0) is replaced by a space.
    2. Every character in ``string.punctuation`` is replaced by a space.
    3. Runs of consecutive spaces are collapsed into one space.
    4. Leading and trailing whitespace is stripped.

    Args:
        text: The string to clean. Falsy values and values that are not
            ``str`` (for example ``bytes``) yield an empty string.
        remove_punctuation_all: Accepted for interface compatibility; the
            function does not read it, and punctuation is always removed.

    Returns:
        The cleaned string, or ``''`` when *text* is empty or not a ``str``.

    Example:
        >>> clean_text("Hashimoto's_thyroiditis' Notice <<'s>>")
        'Hashimoto s thyroiditis Notice s'
    """
    if not text:
        return ''
    # Non-string input cannot be cleaned; report it as empty text instead of
    # relying on a blanket ``except Exception`` to hide the failure.
    if not isinstance(text, str):
        return ''

    # Replacing (rather than dropping) non-ASCII characters keeps the words on
    # either side of them separated.
    ascii_only = ''.join(ch if ord(ch) < 128 else ' ' for ch in text)
    without_punctuation = ascii_only.translate(text_translator)

    # Splitting on a literal space and discarding the empty pieces collapses
    # runs of spaces in a single pass and always terminates. Other whitespace
    # (tabs, newlines) inside the text is left untouched.
    pieces = [piece for piece in without_punctuation.split(' ') if piece]
    return ' '.join(pieces).strip()
# Example:
# >>> clean_text("Hashimoto's_thyroiditis' Notice <<'s>>")
# 'Hashimoto s thyroiditis Notice s'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment