Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
import re
# clean text from noise
def clean_text(text: str) -> str:
    """Normalize text to lowercase ASCII letters, apostrophes and spaces.

    Every character that is not an ASCII letter or an apostrophe (digits,
    punctuation, whitespace, non-ASCII/Unicode characters) is replaced,
    one for one, by a single space. The result is then lowercased.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    # Keep only ASCII letters and apostrophes. This pass already replaces
    # non-ASCII characters too, so a separate "remove Unicode characters"
    # substitution after it could never match anything and is omitted.
    text = re.sub(r"[^a-zA-Z']", ' ', text)
    # Lowercase to keep the output consistent.
    return text.lower()
# Run clean_text over each value of train.tweet and store the results under
# the 'clean_text' key of train.
# NOTE(review): `train` is defined outside this snippet; presumably a pandas
# DataFrame with a `tweet` column of strings -- confirm against the caller.
train['clean_text'] = train.tweet.apply(clean_text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment