Skip to content

Instantly share code, notes, and snippets.

@dee-walia20
Last active August 31, 2020 18:46
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dee-walia20/69475529762ba273b16654058b1260ed to your computer and use it in GitHub Desktop.
Save dee-walia20/69475529762ba273b16654058b1260ed to your computer and use it in GitHub Desktop.
Data Cleaning
import nltk
import string
import re
from nltk.stem.snowball import SnowballStemmer
# English stopword lookup table. Stored as a set so the per-token
# `word not in stopwords` membership test in treat_text() is O(1)
# instead of O(n) over a list.
stopwords = set(nltk.corpus.stopwords.words('english'))

# Snowball ("Porter2") stemmer, created once at module load and reused
# for every call to treat_text().
snowball_stemmer = SnowballStemmer(language='english')
def treat_text(text):
    """Normalize a raw text string for NLP processing.

    Pipeline: replace non-word characters with spaces, lowercase,
    strip digits, drop English stopwords, and Snowball-stem the
    remaining tokens.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        Single space-joined string of stemmed, stopword-free tokens.
    """
    # Replace every non-word character (punctuation, symbols) with a
    # space. Raw string fixes the invalid-escape-sequence warning the
    # original non-raw '\W' pattern triggers on modern Python.
    edited_text = re.sub(r'\W', ' ', text)

    # Lowercase before the stopword test — the NLTK stopword list is
    # lowercase, so comparing un-lowercased tokens would miss matches.
    edited_text = edited_text.lower()

    # Remove digit runs entirely (not replaced by a space), matching the
    # original behavior.
    edited_text = re.sub(r'\d+', '', edited_text)

    # str.split() with no argument splits on any whitespace run and
    # discards empty strings — this replaces the original's broken
    # re.sub(" ", " ", ...) no-op (single space replaced by single
    # space) and its redundant split/join/split round trips.
    tokens = edited_text.split()

    # Drop stopwords, stem the rest, and rejoin into one string.
    return ' '.join(
        snowball_stemmer.stem(word) for word in tokens if word not in stopwords
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment