Skip to content

Instantly share code, notes, and snippets.

@chapter09
Created May 9, 2019 17:53
Show Gist options
  • Save chapter09/7a4cf6b4cd516ad32a3767682d5f62a4 to your computer and use it in GitHub Desktop.
import re
from nltk.tokenize.treebank import TreebankWordTokenizer
def preprocess_tweet(tweet):
    """Normalize a raw tweet and tokenize it into a list of tokens.

    GloVe-style preprocessing: URLs, @-mentions, hashtags, emoticons and
    numbers are replaced with placeholder tokens (``<url>``, ``<user>``,
    ``<hashtag>``, ``<heart>``, ``<smile>``, ``<lolface>``, ``<sadface>``,
    ``<neutralface>``, ``<number>``); repeated punctuation is marked with
    ``<repeat>`` and elongated words (e.g. "hurrrryyyy") with ``<elong>``.
    The result is tokenized with NLTK's TreebankWordTokenizer.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Processed tokens.
    """
    # Lowercase first so every pattern below can assume lowercase input.
    tweet = tweet.lower()
    slash_regex = re.compile(r"/")
    user_regex = re.compile(r"@[\S]+")
    hash_regex = re.compile(r"#(\w+)")
    url_regex = re.compile(r"(http|https|ftp)://[a-zA-Z0-9\./]+")
    emoji_heart_regex = re.compile(r"<3")
    emoji_smile1_regex = re.compile(r"[8:=;]['`\-]?[)d]+|[)d]+['`\-]?[8:=;]")
    emoji_smile2_regex = re.compile(r"\^(_|\.)\^")
    emoji_lol_regex = re.compile(r"[8:=;]['`\-]?p+")
    emoji_sad1_regex = re.compile(r"[8:=;]['`\-]?\(+|\)+['`\-]?[8:=;]")
    # BUG FIX: the dot must be escaped, mirroring the ^_^ / ^.^ pattern
    # above; the original r">(_|.)<" matched ANY character between > and <
    # (so e.g. ">a<" was tagged as a sad face).
    emoji_sad2_regex = re.compile(r">(_|\.)<")
    emoji_neutral_regex = re.compile(r"[8:=;]['`\-]?[\/|l*]")
    number_regex = re.compile(r"[-+]?[.\d]*[\d]+[:,.\d]*")
    # Repeated punctuation such as "!!!" or "??" -> "! <repeat>".
    rpt_punc_regex = re.compile(r"([!?.])\1{1,}")
    # Elongated words such as "hurrrryyyyyy" -> "hurry <elong>".
    rpt_word_regex = re.compile(r"\b(\S*?)(.)\2{2,}\b", re.IGNORECASE)
    # URLs must be handled before the bare "/" substitution, otherwise the
    # slashes inside a URL would be split out first and the URL missed.
    tweet = re.sub(url_regex, ' <url> ', tweet)
    tweet = re.sub(slash_regex, ' / ', tweet)
    tweet = re.sub(user_regex, ' <user> ', tweet)
    tweet = re.sub(hash_regex, ' <hashtag> ', tweet)
    tweet = re.sub(emoji_heart_regex, ' <heart> ', tweet)
    tweet = re.sub(emoji_smile1_regex, ' <smile> ', tweet)
    tweet = re.sub(emoji_smile2_regex, ' <smile> ', tweet)
    tweet = re.sub(emoji_lol_regex, ' <lolface> ', tweet)
    tweet = re.sub(emoji_sad1_regex, ' <sadface> ', tweet)
    tweet = re.sub(emoji_sad2_regex, ' <sadface> ', tweet)
    tweet = re.sub(emoji_neutral_regex, ' <neutralface> ', tweet)
    tweet = re.sub(number_regex, ' <number> ', tweet)
    tweet = re.sub(rpt_punc_regex, r' \1' + ' <repeat> ', tweet)
    tweet = re.sub(rpt_word_regex, r'\1' + r'\2' + ' <elong> ', tweet)
    # Split trailing punctuation off words ("word!" -> "word !").
    # (Original class was "[.,!,?]" with a redundant duplicate comma.)
    word_bound_regex = re.compile(r"(\w+)([.,!?]+)")
    tweet = re.sub(word_bound_regex, r'\1' + r' \2', tweet)
    tokenizer = TreebankWordTokenizer()
    # Override the parens pattern so the <...> placeholder tokens produced
    # above survive tokenization as single tokens.
    tokenizer.PARENS_BRACKETS = (re.compile(r'[\]\[\(\)\{\}]'), r' \g<0> ')
    tweet_toks = tokenizer.tokenize(tweet, convert_parentheses=False)
    return tweet_toks
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment