Created
January 22, 2021 16:09
-
-
Save AyishaR/6b352bcebf6bfb8d08fa8dc883906a06 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Remove html tags
_HTML_TAG_RE = re.compile(r'<[^>]*>')


def removeHTML(sentence):
    """Replace every HTML tag in ``sentence`` with a single space.

    A tag is everything from a ``<`` up to the next ``>``. The pattern uses
    ``[^>]*`` rather than ``.*?`` because ``.`` stops at line breaks, which
    would leave tags whose attributes wrap onto another line in the text.

    Args:
        sentence: Text that may contain HTML markup.

    Returns:
        The text with each tag replaced by one space. Text outside tags is
        returned unchanged.
    """
    return _HTML_TAG_RE.sub(' ', sentence)
# Remove URLs
# Raw string: in a normal string literal "\S" is an invalid escape sequence.
_URL_RE = re.compile(r'https?://\S+')


def removeURL(sentence):
    """Replace every http/https URL in ``sentence`` with a single space.

    A URL is ``http://`` or ``https://`` followed by one or more
    non-whitespace characters; the match ends at the first whitespace.

    Args:
        sentence: Text that may contain URLs.

    Returns:
        The text with each URL replaced by one space.
    """
    return _URL_RE.sub(' ', sentence)
# remove numbers, punctuation and any special characters (keep only alphabets)
_NON_ALPHA_RE = re.compile(r'[^a-zA-Z]')


def onlyAlphabets(sentence):
    """Replace every character that is not an ASCII letter with a space.

    Each non-letter character (digits, punctuation, whitespace, accented or
    other non-ASCII letters) is replaced one-for-one, so the result has the
    same length as the input.

    Args:
        sentence: Text to clean.

    Returns:
        The text containing only ``a-z``, ``A-Z`` and spaces.
    """
    return _NON_ALPHA_RE.sub(' ', sentence)
_REPEATED_CHAR_RE = re.compile(r'(.)\1{2,}')


def removeRecurring(sentence):
    """Collapse any run of three or more identical characters to one.

    Runs of exactly two characters are left alone (``"good"`` stays
    ``"good"``), while ``"coool"`` becomes ``"col"``. Because ``.`` does not
    match a newline, runs of line breaks are not collapsed.

    Args:
        sentence: Text to clean.

    Returns:
        The text with each run of 3+ repeated characters reduced to a single
        occurrence of that character.
    """
    return _REPEATED_CHAR_RE.sub(r'\1', sentence)
# Defining stopwords
# NOTE: this reads the NLTK "stopwords" corpus, which must already be
# available locally (typically fetched once with nltk.download('stopwords')).
stop = nltk.corpus.stopwords.words('english')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment