susanli2016/clean_text

## clean_text
# Characters that get replaced by a single space during cleaning.
# Raw string: the backslashes belong to the regex, not to Python string
# escapes (a non-raw '\[' is an invalid escape sequence and triggers a
# warning on recent Python versions). The compiled pattern is unchanged.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character that is not a digit, a lowercase a-z letter, a space,
# '#', '+' or '_' is deleted during cleaning.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Words returned by stopwords.words('english'), stored as a set for
# membership tests.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Return a normalised copy of *text*.

    Steps, in order:
      1. Parse *text* with BeautifulSoup (lxml parser) and keep only its
         ``.text`` content.
      2. Lowercase the result.
      3. Replace every character matched by ``REPLACE_BY_SPACE_RE`` with
         a space.
      4. Delete every character matched by ``BAD_SYMBOLS_RE``.
      5. Split on whitespace, drop tokens found in ``STOPWORDS`` and
         re-join the remaining tokens with single spaces.

    text: a string

    return: the cleaned string
    """
    # Extract the text content from the parsed markup, then lowercase it.
    lowered = BeautifulSoup(text, "lxml").text.lower()

    # Separator-like symbols become spaces so neighbouring words stay apart.
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)

    # Everything outside the allowed character set is removed outright.
    filtered = BAD_SYMBOLS_RE.sub('', spaced)

    # Keep only the tokens that are not stop words.
    kept_tokens = [token for token in filtered.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)

# Run clean_text over the values of df['post'] and assign the result back to
# the same column (df is defined elsewhere; presumably a pandas DataFrame).
df['post'] = df['post'].apply(clean_text)
# NOTE(review): print_plot is not defined in this snippet; presumably it
# displays the entry at index 10 after cleaning. Confirm against its definition.
print_plot(10)
	# Characters that get replaced by a single space during cleaning.
	# Raw string: the backslashes belong to the regex, not to Python string
	# escapes (a non-raw '\[' is an invalid escape sequence and triggers a
	# warning on recent Python versions). The compiled pattern is unchanged.
	REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
	# Any character that is not a digit, a lowercase a-z letter, a space,
	# '#', '+' or '_' is deleted during cleaning.
	BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
	# Words returned by stopwords.words('english'), stored as a set for
	# membership tests.
	STOPWORDS = set(stopwords.words('english'))

	def clean_text(text):
		"""Return a normalised copy of *text*.

		The input is parsed with BeautifulSoup (lxml parser) and reduced to
		its ``.text`` content, lowercased, stripped of unwanted symbols and
		finally filtered of stop words.

		text: a string

		return: the cleaned string
		"""
		# Keep only the text content of the parsed markup.
		text = BeautifulSoup(text, "lxml").text
		text = text.lower()
		# Characters matched by REPLACE_BY_SPACE_RE become spaces so that
		# neighbouring words are not glued together.
		text = REPLACE_BY_SPACE_RE.sub(' ', text)
		# Characters matched by BAD_SYMBOLS_RE are deleted outright.
		text = BAD_SYMBOLS_RE.sub('', text)
		# Drop stop words and re-join the remaining tokens with single spaces.
		text = ' '.join(word for word in text.split() if word not in STOPWORDS)
		return text

	# Run clean_text over the values of df['post'] and assign the result back to
	# the same column (df is defined elsewhere; presumably a pandas DataFrame).
	df['post'] = df['post'].apply(clean_text)
	# NOTE(review): print_plot is not defined in this snippet; presumably it
	# displays the entry at index 10 after cleaning. Confirm against its definition.
	print_plot(10)