Complete tweet preprocessing file with all functions needed
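# Note: lemmatize() below relies on NLTK's WordNet corpus. If it is not
# already installed, a one-time download is needed (standard NLTK setup,
# not shown in the original gist):
#   import nltk
#   nltk.download('wordnet')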
import pandas as pd
import re
import gensim
from nltk.stem import WordNetLemmatizer
punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@' # define a string of punctuation symbols
# Functions to clean tweets
def remove_links(tweet):
    """Takes a string and removes web links from it"""
    tweet = re.sub(r'http\S+', '', tweet)  # remove http links
    tweet = re.sub(r'bit\.ly/\S+', '', tweet)  # remove bit.ly links
    tweet = re.sub(r'\[link\]', '', tweet)  # remove [link] markers
    tweet = re.sub(r'pic\.twitter\S+', '', tweet)  # remove pic.twitter.com links
    return tweet
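# Illustrative call (made-up tweet, not from the original gist):
#   remove_links('Great read http://bit.ly/2abc pic.twitter.com/xyz [link]')
#   -> 'Great read' plus leftover whitespace, which preprocess_tweet collapses later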
def remove_users(tweet):
    """Takes a string and removes retweet and @user information"""
    tweet = re.sub(r'(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)  # remove re-tweet header
    tweet = re.sub(r'(@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)  # remove @mentions
    return tweet
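# Illustrative call (made-up tweet):
#   remove_users('RT @nytimes: Breaking story by @jack')
#   -> ': Breaking story by ' (one-character handles are not matched, since
#   the pattern requires at least two characters after '@')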
def remove_hashtags(tweet):
    """Takes a string and removes any hash tags"""
    tweet = re.sub(r'(#[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)  # remove hash tags
    return tweet
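# Illustrative call (made-up tweet):
#   remove_hashtags('loving the #NLP2020 vibes')
#   -> 'loving the  vibes' (single-character tags are kept, as the pattern
#   requires at least two characters after '#')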
def remove_av(tweet):
    """Takes a string and removes AUDIO/VIDEO tags or labels"""
    tweet = re.sub('VIDEO:', '', tweet)  # remove 'VIDEO:' labels
    tweet = re.sub('AUDIO:', '', tweet)  # remove 'AUDIO:' labels
    return tweet
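# Illustrative call (made-up tweet):
#   remove_av('VIDEO: highlights from the panel')
#   -> ' highlights from the panel' (labels are removed anywhere they occur,
#   not only at the start)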
def tokenize(tweet):
    """Returns tokenized representation of words in lemma form excluding stopwords"""
    result = []
    for token in gensim.utils.simple_preprocess(tweet):
        if token not in gensim.parsing.preprocessing.STOPWORDS \
                and len(token) > 2:  # drop words with fewer than 3 characters
            result.append(lemmatize(token))
    return result
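# Illustrative call (made-up text; exact output depends on gensim's stopword
# list and the installed WordNet data):
#   tokenize('the dogs are running quickly')
#   -> typically ['dog', 'run', 'quickly']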
def lemmatize(token):
    """Returns lemmatization of a token"""
    return WordNetLemmatizer().lemmatize(token, pos='v')
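# Illustrative calls (pos='v' treats tokens as verbs; non-verbs pass through):
#   lemmatize('running') -> 'run'
#   lemmatize('quickly') -> 'quickly'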
def preprocess_tweet(tweet):
    """Main master function to clean tweets, stripping noisy characters and tokenizing using lemmatization"""
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = remove_hashtags(tweet)
    tweet = remove_av(tweet)
    tweet = tweet.lower()  # lower case
    tweet = re.sub('[' + punctuation + ']+', ' ', tweet)  # strip punctuation
    tweet = re.sub(r'\s+', ' ', tweet)  # remove double spacing
    tweet = re.sub(r'([0-9]+)', '', tweet)  # remove numbers
    tweet_token_list = tokenize(tweet)  # apply lemmatization and tokenization
    tweet = ' '.join(tweet_token_list)
    return tweet
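# Illustrative end-to-end call (made-up tweet; exact tokens depend on gensim's
# stopword list):
#   preprocess_tweet('RT @nytimes: Breaking!! 99 new cases http://nyti.ms/abc #covid')
#   -> roughly 'break new case'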
def basic_clean(tweet):
    """Main master function to clean tweets only, without tokenization or removal of stopwords"""
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = remove_hashtags(tweet)
    tweet = remove_av(tweet)
    tweet = tweet.lower()  # lower case
    tweet = re.sub('[' + punctuation + ']+', ' ', tweet)  # strip punctuation
    tweet = re.sub(r'\s+', ' ', tweet)  # remove double spacing
    tweet = re.sub(r'([0-9]+)', '', tweet)  # remove numbers
    tweet = re.sub('📝 …', '', tweet)  # remove '📝 …' truncation artifacts
    return tweet
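# Illustrative call (made-up tweet):
#   basic_clean('RT @nytimes: Breaking!! 99 new cases http://nyti.ms/abc #covid')
#   -> roughly ' breaking  new cases ' (stray spaces can remain because digits
#   are removed after whitespace is collapsed)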
def tokenize_tweets(df):
    """Main function to read in and return cleaned and preprocessed dataframe.

    This can be used in Jupyter notebooks by importing this module and calling the tokenize_tweets() function.

    Args:
        df = data frame object to apply cleaning to

    Returns:
        pandas data frame with cleaned tokens
    """
    df['tokens'] = df.tweet.apply(preprocess_tweet)
    num_tweets = len(df)
    print('Complete. Number of tweets that have been cleaned and tokenized: {}'.format(num_tweets))
    return df
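# Minimal smoke test, guarded so it only runs when this file is executed
# directly. The sample tweet and DataFrame are illustrative; tokenize_tweets()
# assumes the text lives in a 'tweet' column. WordNet data must be downloaded
# first (see the note at the top of the file).
if __name__ == '__main__':
    sample_df = pd.DataFrame(
        {'tweet': ['RT @user: Loving these results!! 99 replies http://bit.ly/2abc #nlp']}
    )
    tokenize_tweets(sample_df)
    print(sample_df[['tweet', 'tokens']])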