Skip to content

Instantly share code, notes, and snippets.

@amankharwal
Created November 3, 2021 07:06
Show Gist options
  • Save amankharwal/8a31997c99ff6cb7789ae4e8a4cc3d84 to your computer and use it in GitHub Desktop.
Save amankharwal/8a31997c99ff6cb7789ae4e8a4cc3d84 to your computer and use it in GitHub Desktop.
# Standard library
import re
import string

# Third-party
import nltk
from nltk.corpus import stopwords

# Ask NLTK to download the "stopwords" corpus used to build the set below.
nltk.download('stopwords')

# Module-level helpers shared by the text-cleaning code:
#   stemmer  - English Snowball stemmer
#   stopword - English stop words, held in a set for fast membership tests
stemmer = nltk.SnowballStemmer("english")
stopword = set(stopwords.words('english'))
def clean(text):
    """Normalize raw text into a space-separated string of stemmed tokens.

    The value is coerced with ``str`` and lower-cased, then the following
    are removed, in this order: square-bracketed spans, URLs
    (``http://``/``https://`` or ``www.`` prefixed), angle-bracket tags,
    ASCII punctuation, newline characters, and any token containing a digit.
    The remainder is split on single spaces; tokens found in the
    module-level ``stopword`` set are dropped and the others are passed
    through the module-level ``stemmer``.

    Args:
        text: Value to clean; anything accepted by ``str()``.

    Returns:
        str: The surviving stems joined by single spaces. Because the split
        is on a literal ``' '``, empty tokens created by consecutive spaces
        are kept, so the result can contain runs of spaces.
    """
    text = str(text).lower()

    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being treated
    # as invalid Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)               # [bracketed] spans
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # URLs
    text = re.sub(r'<.*?>+', '', text)                # <tags>
    # URLs and tags must be stripped before this step, since it removes the
    # punctuation those patterns rely on.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # NOTE(review): newlines are deleted, not replaced by a space, so the last
    # word of one line fuses with the first word of the next. Kept as-is to
    # preserve the existing output.
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)              # tokens containing digits

    # Filter stop words and stem in one pass; splitting on ' ' (not
    # whitespace) keeps the same tokenization as before.
    stems = [
        stemmer.stem(word)
        for word in text.split(' ')
        if word not in stopword
    ]
    return " ".join(stems)
# Replace the "text" column with its cleaned form by applying clean() to every
# entry. `data` is not defined in this snippet -- presumably a pandas DataFrame
# loaded earlier; confirm against the surrounding code.
data["text"] = data["text"].apply(clean)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment