Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# English stop-word list, materialized once as a set so the per-token
# membership tests in text_cleaner are constant-time lookups.
# NOTE(review): `stopwords` is presumably nltk.corpus.stopwords -- confirm
# against the file's imports.
stop_words = set(stopwords.words("english"))
def text_cleaner(text):
    """Normalize a raw text string into a space-separated run of kept words.

    Processing steps, in order:

    1. Lower-case the input.
    2. Parse it with BeautifulSoup (``lxml`` parser) and keep only ``.text``.
    3. Remove parenthesized spans and double-quote characters.
    4. Replace every whitespace-delimited token that is a key of
       ``contraction_mapping`` with its mapped value.
    5. Remove ``'s`` suffixes, then turn every character outside ``a-zA-Z``
       into a space.
    6. Drop tokens that are in ``stop_words`` or shorter than three
       characters.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        no token survives.
    """
    cleaned = BeautifulSoup(text.lower(), "lxml").text
    cleaned = re.sub(r'\([^)]*\)', '', cleaned)
    cleaned = cleaned.replace('"', '')

    # Tokenize on any whitespace rather than on " " only: a contraction that
    # sits next to a newline or tab would otherwise stay glued to its
    # neighbour, miss the mapping lookup, and be shredded by the non-letter
    # substitution below. Whitespace runs are collapsed here, which is
    # harmless because the text is re-split on whitespace further down.
    # Assumes contraction_mapping is a dict-like mapping of str -> str.
    cleaned = " ".join(
        contraction_mapping.get(token, token) for token in cleaned.split()
    )

    cleaned = re.sub(r"'s\b", "", cleaned)
    cleaned = re.sub(r"[^a-zA-Z]", " ", cleaned)

    # Keep only non-stop-words of at least three characters.
    kept_words = [
        word
        for word in cleaned.split()
        if word not in stop_words and len(word) >= 3
    ]
    # Tokens from split() are non-empty and whitespace-free, so the joined
    # result never has leading or trailing whitespace.
    return " ".join(kept_words)
# Run text_cleaner over every entry of data['Text'], preserving order.
# A comprehension replaces the manual append loop and keeps the loop
# variable out of the module namespace.
cleaned_text = [text_cleaner(raw_text) for raw_text in data['Text']]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.