import re
# Clean text from noise.
def clean_text(text):
    """Normalize raw text to lowercase ASCII letters, apostrophes and spaces.

    Every character other than an ASCII letter or an apostrophe (digits,
    punctuation, whitespace such as newlines, and all non-ASCII characters)
    is replaced by one space per character; runs of spaces are not
    collapsed. The result is then lowercased.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string, with the same length as the input.
    """
    # One substitution covers both noise removal and non-ASCII stripping:
    # any character outside the ASCII range is also outside [a-zA-Z'], so a
    # separate pass for [^\x00-\x7F] could never match anything afterwards.
    text = re.sub(r"[^a-zA-Z']", ' ', text)
    # Lowercase so that tokens compare consistently regardless of casing.
    return text.lower()
# Add a 'clean_text' column holding the cleaned version of each tweet.
# NOTE(review): `train` is defined outside this view; presumably a pandas
# DataFrame with a 'tweet' column. Confirm against the loading code.
train['clean_text'] = train.tweet.apply(clean_text)
# (GitHub page footer captured with the snippet: "Sign up for free to join
# this conversation on GitHub. Already have an account? Sign in to comment")