Skip to content

Instantly share code, notes, and snippets.

@amankharwal
Last active September 13, 2021 10:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amankharwal/7bd066407d6fe32592b257b393472446 to your computer and use it in GitHub Desktop.
Save amankharwal/7bd066407d6fe32592b257b393472446 to your computer and use it in GitHub Desktop.
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
import string
stopword=set(stopwords.words('english'))
def clean(text):
text = str(text).lower()
text = re.sub('\[.*?\]', '', text)
text = re.sub('https?://\S+|www\.\S+', '', text)
text = re.sub('<.*?>+', '', text)
text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
text = re.sub('\n', '', text)
text = re.sub('\w*\d\w*', '', text)
text = [word for word in text.split(' ') if word not in stopword]
text=" ".join(text)
text = [stemmer.stem(word) for word in text.split(' ')]
text=" ".join(text)
return text
data["tweet"] = data["tweet"].apply(clean)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment