Skip to content

Instantly share code, notes, and snippets.

@alinazhanguwo
Created April 24, 2019 15:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alinazhanguwo/a4b8825fb48bd54f072ec58b7e68f43d to your computer and use it in GitHub Desktop.
Save alinazhanguwo/a4b8825fb48bd54f072ec58b7e68f43d to your computer and use it in GitHub Desktop.
# Punctuation treated as a token separator: each match is replaced by a space
# so that the words on either side are not glued together.
# Raw string: '\[', '\]' and '\|' are invalid escapes in a normal string literal.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;#]')
# Everything except digits, lowercase ASCII letters, space, '+' and '_'.
# Matches are deleted outright (no space inserted).
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z +_]')
# English stopwords plus two extra tokens: 'rt' (re-tweet marker) and 'http'.
# Kept as a set so membership tests are O(1).
STOPWORDS = set(stopwords.words('english')) | {'rt', 'http'}
def text_prepare(text: str) -> str:
    """Normalize a raw string into space-separated, stopword-free tokens.

    The steps run in this order:
      1. lowercase the text;
      2. replace every character matched by REPLACE_BY_SPACE_RE with a space;
      3. delete every character matched by BAD_SYMBOLS_RE;
      4. split on whitespace and drop tokens found in STOPWORDS.

    text: a string
    return: the cleaned string, tokens joined by single spaces with no
        leading or trailing whitespace (empty string if nothing survives)
    """
    # Lowercase first: BAD_SYMBOLS_RE only keeps a-z, so uppercase letters
    # would otherwise be deleted instead of preserved.
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    # split() discards all surrounding/repeated whitespace, so the joined
    # result needs no further strip().
    return ' '.join(word for word in text.split() if word not in STOPWORDS)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment