import re

from bs4 import BeautifulSoup
from nltk.corpus import stopwords  # requires nltk.download('stopwords')

# contraction_mapping (a dict of contraction -> expansion) and data (a DataFrame
# with a 'Text' column) are assumed to be defined earlier.

stop_words = set(stopwords.words('english'))

def text_cleaner(text):
    newString = text.lower()
    newString = BeautifulSoup(newString, "lxml").text        # strip any HTML markup
    newString = re.sub(r'\([^)]*\)', '', newString)          # drop text inside parentheses
    newString = re.sub('"', '', newString)                   # remove double quotes
    # expand contractions using the contraction_mapping dictionary
    newString = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in newString.split(" ")])
    newString = re.sub(r"'s\b", "", newString)               # remove possessive 's
    newString = re.sub("[^a-zA-Z]", " ", newString)          # keep alphabetic characters only
    tokens = [w for w in newString.split() if w not in stop_words]  # drop stopwords
    long_words = []
    for i in tokens:
        if len(i) >= 3:               # drop short words
            long_words.append(i)
    return (" ".join(long_words)).strip()

cleaned_text = []
for t in data['Text']:
    cleaned_text.append(text_cleaner(t))
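
A quick sanity check of text_cleaner on a made-up review string. The sample sentence and the minimal contraction_mapping stub below are illustrative assumptions, not part of the original gist; in practice the full mapping is defined earlier.

contraction_mapping = {"don't": "do not", "it's": "it is"}  # illustrative stub only
sample = 'I don\'t like this product (bought in 2019), it\'s "okay" at best!'
print(text_cleaner(sample))
# prints something like: like product okay best
# (parenthesized text, quotes, contractions, stopwords and short words are all removed)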