Last active
April 11, 2020 22:04
-
-
Save makispl/7c60c38c5f1e4cc04b74d030354103fd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Normalise the raw text in training_set['SMS'] in place, one regex
# substitution per step. `regex=True` is passed explicitly on every call:
# since pandas 2.0, Series.str.replace defaults to literal matching, which
# would silently leave every pattern below unmatched.

# Replace addresses (http, email), numbers (plain, phone), money symbols
# Email addresses go first: the URL pattern below also matches "domain.tld",
# so running it earlier would strip only the domain and leave "user@" behind.
training_set['SMS'] = training_set['SMS'].str.replace(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', ' ', regex=True)
# URLs: anything starting with http/https, or a bare "word.tld..." token.
training_set['SMS'] = training_set['SMS'].str.replace(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', ' ', regex=True)
# Money symbols (pound and dollar signs).
training_set['SMS'] = training_set['SMS'].str.replace(r'£|\$', ' ', regex=True)
# Phone numbers must be handled before the generic number pattern, which
# would otherwise consume their digit groups piecemeal.
training_set['SMS'] = training_set['SMS'].str.replace(r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b', ' ', regex=True)
# Any remaining integer or decimal number.
training_set['SMS'] = training_set['SMS'].str.replace(r'\d+(\.\d+)?', ' ', regex=True)

# Remove punctuation, collapse all whitespace (spaces, line breaks, tabs) into a single space & eliminate any leading/trailing whitespace.
# NOTE: `\w` matches the underscore, so "_" is not removed by this step.
training_set['SMS'] = training_set['SMS'].str.replace(r'[^\w\d\s]', ' ', regex=True)
training_set['SMS'] = training_set['SMS'].str.replace(r'\s+', ' ', regex=True)
training_set['SMS'] = training_set['SMS'].str.replace(r'^\s+|\s+?$', '', regex=True)

# Lowercase the entire corpus
training_set['SMS'] = training_set['SMS'].str.lower()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment