Skip to content

Instantly share code, notes, and snippets.

@susanli2016
Created September 23, 2018 18:36
Show Gist options
  • Save susanli2016/29d4cf9b7db4c6804e174b8eac2891e8 to your computer and use it in GitHub Desktop.
Save susanli2016/29d4cf9b7db4c6804e174b8eac2891e8 to your computer and use it in GitHub Desktop.
# Separator punctuation: / ( ) { } [ ] | @ , ;
# Written as a raw string because "\[", "\]" and "\|" are not valid escape
# sequences in a normal string literal and trigger a warning on modern Python.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Matches every character that is NOT a digit, a lowercase ASCII letter,
# a space, '#', '+' or '_'.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stopword list, materialised once as a set so that the per-word
# `not in STOPWORDS` test in clean_text is a hash lookup.
# NOTE(review): `stopwords` is imported outside this excerpt, presumably
# nltk.corpus.stopwords (needs the NLTK "stopwords" corpus downloaded); confirm.
STOPWORDS = set(stopwords.words('english'))
def clean_text(text: str) -> str:
    """Normalise a raw post into lowercase, stopword-free plain text.

    The steps run in this order: HTML decoding, lowercasing, replacing
    separator punctuation (``REPLACE_BY_SPACE_RE``) with spaces, deleting
    every remaining character matched by ``BAD_SYMBOLS_RE``, and dropping
    words found in ``STOPWORDS``.

    Args:
        text: a string, possibly containing HTML markup.

    Returns:
        The modified string, with the surviving words joined by single
        spaces.
    """
    # HTML decoding: parse with the lxml backend and keep only the text.
    text = BeautifulSoup(text, "lxml").text
    # Lowercase before filtering: BAD_SYMBOLS_RE keeps only a-z, so any
    # uppercase letter left at that point would be deleted.
    text = text.lower()
    # Separators become spaces so the words around them stay distinct.
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    # Everything else outside the allowed character set is removed outright.
    text = BAD_SYMBOLS_RE.sub('', text)
    # Delete stopwords; split()/join also collapses runs of whitespace.
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text
# Run clean_text over every entry of the 'post' column and store the result
# back in the same column (the raw text is overwritten).
# NOTE(review): `df` is created outside this excerpt, presumably a pandas
# DataFrame; confirm.
df['post'] = df['post'].apply(clean_text)
# NOTE(review): print_plot is defined outside this excerpt; presumably it
# displays the post at index 10 so the cleaned text can be inspected. Confirm.
print_plot(10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment