import re

from bs4 import BeautifulSoup
from nltk.corpus import stopwords  # requires nltk.download('stopwords')

# contraction_mapping (a dict of contraction -> expansion) and data (a DataFrame
# with a 'Text' column) are assumed to be defined earlier.

stop_words = set(stopwords.words('english'))

def text_cleaner(text):
    newString = text.lower()
    newString = BeautifulSoup(newString, "lxml").text        # strip any HTML markup
    newString = re.sub(r'\([^)]*\)', '', newString)          # drop text inside parentheses
    newString = re.sub('"', '', newString)                   # remove double quotes
    # expand contractions using the contraction_mapping dictionary
    newString = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in newString.split(" ")])
    newString = re.sub(r"'s\b", "", newString)               # remove possessive 's
    newString = re.sub("[^a-zA-Z]", " ", newString)          # keep alphabetic characters only
    tokens = [w for w in newString.split() if w not in stop_words]  # drop stopwords
    long_words = []
    for i in tokens:
        if len(i) >= 3:               # drop short words
            long_words.append(i)
    return (" ".join(long_words)).strip()

cleaned_text = []
for t in data['Text']:
    cleaned_text.append(text_cleaner(t))
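
A quick sanity check of text_cleaner on a made-up review string. The sample sentence and the minimal contraction_mapping stub below are illustrative assumptions, not part of the original gist; in practice the full mapping is defined earlier.

contraction_mapping = {"don't": "do not", "it's": "it is"}  # illustrative stub only
sample = 'I don\'t like this product (bought in 2019), it\'s "okay" at best!'
print(text_cleaner(sample))
# prints something like: like product okay best
# (parenthesized text, quotes, contractions, stopwords and short words are all removed)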