Last active
April 25, 2020 03:52
-
-
Save tcramm0nd/e328b8981c73220bd9f39dee2dc1c856 to your computer and use it in GitHub Desktop.
Basic NLP Text Preprocessing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def text_preprocessing(text):
    '''Basic NLP text cleaner.

    Lowercases the input, strips bracketed spans, URLs, HTML tags,
    punctuation, newlines, and digit-containing words, then tokenizes
    and removes English stopwords.

    Parameters
    ----------
    text : str
        The raw corpus/text to clean.

    Returns
    -------
    str
        The cleaned, space-joined tokens.

    Requires the following to be imported:
        re
        string
        nltk
        from nltk.corpus import stopwords
    '''
    # text cleaning — raw strings (r'...') avoid invalid-escape warnings
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)                # bracketed spans, e.g. [chorus]
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # URLs
    text = re.sub(r'<.*?>+', '', text)                 # HTML/XML tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # punctuation
    text = re.sub(r'\n', '', text)                     # newlines
    text = re.sub(r'\w*\d\w*', '', text)               # words containing digits
    # splits the text into tokens using a regular expression
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokenized_text = tokenizer.tokenize(text)
    # a set of English stopwords to remove (set gives O(1) membership tests)
    stop_words = set(stopwords.words('english'))
    # creates a new list of tokenized words without the stopwords
    no_stop_words = [w for w in tokenized_text if w not in stop_words]
    text = ' '.join(no_stop_words)
    return text
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.