Skip to content

Instantly share code, notes, and snippets.

@AyishaR
Created January 22, 2021 16:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AyishaR/6b352bcebf6bfb8d08fa8dc883906a06 to your computer and use it in GitHub Desktop.
Save AyishaR/6b352bcebf6bfb8d08fa8dc883906a06 to your computer and use it in GitHub Desktop.
# Remove html tags
def removeHTML(sentence: str) -> str:
    """Replace every HTML/XML-style tag in *sentence* with a single space.

    A tag is anything from a ``<`` up to the next ``>``. The character
    class ``[^>]`` (rather than ``.``) lets a tag span line breaks, so
    markup such as ``<a\\nhref="...">`` is removed as well.

    Args:
        sentence: Text that may contain markup.

    Returns:
        The text with each tag replaced by one space. Whitespace is not
        collapsed, so adjacent tags leave adjacent spaces.
    """
    tag_pattern = re.compile(r'<[^>]*>')
    return tag_pattern.sub(' ', sentence)
# Remove URLs
def removeURL(sentence: str) -> str:
    """Replace every ``http://`` or ``https://`` URL in *sentence* with a space.

    A URL is the scheme followed by the longest run of non-whitespace
    characters, so trailing punctuation attached to the URL is removed
    with it. Other schemes (``ftp://``, bare ``www.``) are left untouched.

    Args:
        sentence: Text that may contain URLs.

    Returns:
        The text with each matched URL replaced by one space.
    """
    # Raw string: '\S' in a normal literal is an invalid escape sequence.
    url_pattern = re.compile(r'https?://\S+')
    return url_pattern.sub(' ', sentence)
# remove numbers, punctuation and any special characters (keep only alphabets)
def onlyAlphabets(sentence: str) -> str:
    """Replace every character that is not an ASCII letter with a space.

    The substitution is one-for-one, so the result has the same length as
    the input. Digits, punctuation, whitespace and non-ASCII letters
    (e.g. accented characters) all become spaces.

    Args:
        sentence: Text to filter.

    Returns:
        The text containing only ``a-z``, ``A-Z`` and spaces.
    """
    non_letter_pattern = re.compile(r'[^a-zA-Z]')
    return non_letter_pattern.sub(' ', sentence)
# Collapse characters repeated three or more times in a row
def removeRecurring(sentence: str) -> str:
    """Collapse runs of three or more identical characters to one character.

    For example ``"sooooo"`` becomes ``"so"``. Runs of exactly two are kept
    as they are (``"good"`` is unchanged). Because ``.`` does not match a
    newline, runs of newlines are not collapsed.

    Args:
        sentence: Text that may contain elongated character runs.

    Returns:
        The text with each qualifying run reduced to a single character.
    """
    # (.) captures one character; \1{2,} requires at least two more copies.
    return re.sub(r'(.)\1{2,}', r'\1', sentence)
# Defining stopwords: the English word list returned by NLTK's stopwords corpus.
# NOTE(review): assumes `nltk` is imported earlier in the file and that the
# stopwords corpus is already available locally — confirm.
stop = nltk.corpus.stopwords.words('english')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment