Skip to content

Instantly share code, notes, and snippets.

@MandarGogate
Created June 22, 2020 15:37
Show Gist options
  • Save MandarGogate/96535ac95f8c4a6753aa25c9f7690d4a to your computer and use it in GitHub Desktop.
TweetProcessing
from nltk.tokenize import TweetTokenizer
from emoji import demojize
import re
# Module-level tokenizer instance; normalizeTweet calls its tokenize() method.
tokenizer = TweetTokenizer()
def normalizeToken(token):
    """Map a single tweet token to its normalized form.

    Args:
        token: One token string.

    Returns:
        "@USER" if the token starts with "@"; "HTTPURL" if it starts with
        "http" or "www" (case-insensitive); "'" for the typographic
        apostrophe; "..." for the ellipsis character; the result of
        ``demojize(token)`` for any other single-character token; otherwise
        the token unchanged.
    """
    if token.startswith("@"):
        return "@USER"
    if token.lower().startswith(("http", "www")):
        return "HTTPURL"
    # Both of these are single characters, so they have to be tested before
    # the length-1 branch; otherwise that branch catches them first and these
    # mappings can never run.
    if token == "’":
        return "'"
    if token == "…":
        return "..."
    if len(token) == 1:
        return demojize(token)
    return token
def normalizeTweet(tweet):
    """Normalize a raw tweet into a single space-separated string.

    The typographic apostrophe and ellipsis characters are first replaced by
    their ASCII forms, the text is split with the module-level ``tokenizer``,
    and each token is passed through ``normalizeToken``. The joined result
    then goes through a fixed, ordered series of literal and regex
    substitutions (contraction spacing, "a.m."/"p.m." re-joining, digit
    groups split around ",", "/" and "-") before whitespace is collapsed.

    Args:
        tweet: Raw tweet text.

    Returns:
        The normalized tweet, with runs of whitespace reduced to one space.
    """
    ascii_tweet = tweet.replace("’", "'").replace("…", "...")
    pieces = [normalizeToken(piece) for piece in tokenizer.tokenize(ascii_tweet)]
    text = " ".join(pieces)

    # Applied strictly in this order: later entries act on the output of
    # earlier ones (e.g. "ca n't" only exists after "n't " has been split off).
    literal_rewrites = (
        ("cannot ", "can not "),
        ("n't ", " n't "),
        ("n 't ", " n't "),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        ("'m ", " 'm "),
        ("'re ", " 're "),
        ("'s ", " 's "),
        ("'ll ", " 'll "),
        ("'d ", " 'd "),
        ("'ve ", " 've "),
        (" p . m .", " p.m."),
        (" p . m ", " p.m "),
        (" a . m .", " a.m."),
        (" a . m ", " a.m "),
    )
    for old, new in literal_rewrites:
        text = text.replace(old, new)

    # Re-join digit groups that tokenization separated with spaces.
    pattern_rewrites = (
        (r",([0-9]{2,4}) , ([0-9]{2,4})", r",\1,\2"),
        (r"([0-9]{1,3}) / ([0-9]{2,4})", r"\1/\2"),
        (r"([0-9]{1,3})- ([0-9]{2,4})", r"\1-\2"),
    )
    for pattern, replacement in pattern_rewrites:
        text = re.sub(pattern, replacement, text)

    return " ".join(text.split())
if __name__ == "__main__":
    # Demo: normalize one sample tweet containing a URL and a mention, then
    # print the result.
    sample_tweet = "SC has first two presumptive cases of coronavirus, DHEC confirms https://postandcourier.com/health/covid19/sc-has-first-two-presumptive-cases-of-coronavirus-dhec-confirms/article_bddfe4ae-5fd3-11ea-9ce4-5f495366cee6.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share… via @postandcourier"
    print(normalizeTweet(sample_tweet))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment