Skip to content

Instantly share code, notes, and snippets.

@vi3k6i5
Created February 24, 2017 14:34
Show Gist options
  • Save vi3k6i5/52b4ed862deaed9005bf89b7b00f4c74 to your computer and use it in GitHub Desktop.
Save vi3k6i5/52b4ed862deaed9005bf89b7b00f4c74 to your computer and use it in GitHub Desktop.
Python code to clean text.
import re
import string
# Translation table mapping every ASCII punctuation character to a space.
text_translator = str.maketrans(dict.fromkeys(string.punctuation, " "))


def clean_text(text, remove_punctuation_all=False):
    """Return *text* reduced to ASCII words separated by single spaces.

    The cleaning steps are, in order:

    1. Every non-ASCII character (including the no-break space, U+00A0)
       is replaced by a space.
    2. Every character in ``string.punctuation`` is replaced by a space.
    3. Runs of two or more spaces are collapsed into one space.
    4. Leading and trailing whitespace is stripped.

    Args:
        text: The string to clean.
        remove_punctuation_all: Accepted for backward compatibility; this
            function does not consult it.

    Returns:
        The cleaned string. An empty string is returned when *text* is
        falsy (``None``, ``""``) or is not a ``str``.

    Example:
        >>> clean_text("Hashimoto's_thyroiditis' Notice <<'s>>")
        'Hashimoto s thyroiditis Notice s'
    """
    # Non-string input (bytes, numbers, ...) cannot be cleaned; the
    # function has always answered with an empty string in that case.
    if not text or not isinstance(text, str):
        return ""

    # Replace rather than drop non-ASCII characters so that the words on
    # either side of them stay separated.
    ascii_only = "".join(ch if ord(ch) < 128 else " " for ch in text)
    without_punctuation = ascii_only.translate(text_translator)

    # Collapse each run of spaces in one pass. Only the space character is
    # collapsed; interior tabs and newlines are left untouched.
    return re.sub(r" {2,}", " ", without_punctuation).strip()
>>> clean_text("Hashimoto's_thyroiditis' Notice <<'s>>")
# 'Hashimoto s thyroiditis Notice s'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment