Complete tweet preprocessing file with all the functions needed to clean and tokenize tweets.
import re

import gensim
import pandas as pd
from nltk.stem import WordNetLemmatizer

punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'  # punctuation symbols to strip from tweets
# Functions to clean tweets
def remove_links(tweet):
    """Takes a string and removes web links from it"""
    tweet = re.sub(r'http\S+', '', tweet)  # remove http links
    tweet = re.sub(r'bit\.ly/\S+', '', tweet)  # remove bit.ly links
    tweet = re.sub(r'\[link\]', '', tweet)  # remove [link] placeholders (str.strip only trims characters at the ends)
    tweet = re.sub(r'pic\.twitter\S+', '', tweet)  # remove pic.twitter links
    return tweet
def remove_users(tweet):
    """Takes a string and removes retweet and @user information"""
    tweet = re.sub(r'(RT\s@[A-Za-z]+[A-Za-z0-9_-]+)', '', tweet)  # remove retweet header
    tweet = re.sub(r'(@[A-Za-z]+[A-Za-z0-9_-]+)', '', tweet)  # remove @mentions
    return tweet
def remove_hashtags(tweet):
    """Takes a string and removes any hashtags"""
    tweet = re.sub(r'(#[A-Za-z]+[A-Za-z0-9_-]+)', '', tweet)  # remove hashtags
    return tweet
def remove_av(tweet):
    """Takes a string and removes AUDIO/VIDEO tags or labels"""
    tweet = re.sub('VIDEO:', '', tweet)  # remove 'VIDEO:' labels
    tweet = re.sub('AUDIO:', '', tweet)  # remove 'AUDIO:' labels
    return tweet
def tokenize(tweet):
    """Returns tokenized representation of words in lemma form, excluding stopwords"""
    result = []
    for token in gensim.utils.simple_preprocess(tweet):
        if token not in gensim.parsing.preprocessing.STOPWORDS \
                and len(token) > 2:  # drop words with fewer than 3 characters
            result.append(lemmatize(token))
    return result

lemmatizer = WordNetLemmatizer()  # instantiate once instead of once per token

def lemmatize(token):
    """Returns lemmatization of a token"""
    return lemmatizer.lemmatize(token, pos='v')
def preprocess_tweet(tweet):
    """Main master function to clean tweets, stripping noisy characters and tokenizing using lemmatization"""
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = remove_hashtags(tweet)
    tweet = remove_av(tweet)
    tweet = tweet.lower()  # lowercase
    tweet = re.sub('[' + punctuation + ']+', ' ', tweet)  # strip punctuation
    tweet = re.sub(r'\s+', ' ', tweet)  # collapse repeated whitespace
    tweet = re.sub('([0-9]+)', '', tweet)  # remove numbers
    tweet_token_list = tokenize(tweet)  # apply lemmatization and tokenization
    tweet = ' '.join(tweet_token_list)
    return tweet
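# A quick, hypothetical sanity check (the exact output depends on gensim's
# stopword list filtering tokens such as 'this' and 'out'):
# >>> preprocess_tweet("RT @user Check this out! https://t.co/xyz #NLP 2023")
# 'check'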
def basic_clean(tweet):
    """Main master function to clean tweets only, without tokenization or removal of stopwords"""
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = remove_hashtags(tweet)
    tweet = remove_av(tweet)
    tweet = tweet.lower()  # lowercase
    tweet = re.sub('[' + punctuation + ']+', ' ', tweet)  # strip punctuation
    tweet = re.sub(r'\s+', ' ', tweet)  # collapse repeated whitespace
    tweet = re.sub('([0-9]+)', '', tweet)  # remove numbers
    tweet = re.sub('📝 …', '', tweet)  # remove memo-emoji/ellipsis artifact left by truncated tweets
    return tweet
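# For comparison, basic_clean keeps stopwords and word forms intact
# (hypothetical tweet; note the surrounding spaces are preserved):
# >>> basic_clean("RT @user Check this out! https://t.co/xyz #NLP 2023")
# ' check this out '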
def tokenize_tweets(df):
    """Main function to read in and return a cleaned and preprocessed dataframe.

    This can be used in Jupyter notebooks by importing this module and calling tokenize_tweets().

    Args:
        df: dataframe to apply the cleaning to; must have a 'tweet' column

    Returns:
        pandas dataframe with a new 'tokens' column of cleaned, lemmatized text
    """
    df['tokens'] = df.tweet.apply(preprocess_tweet)
    num_tweets = len(df)
    print('Complete. Number of tweets cleaned and tokenized: {}'.format(num_tweets))
    return df
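A minimal usage sketch, assuming this file is saved as tweet_preprocessing.py, NLTK's WordNet data has been downloaded, and the dataframe has a tweet column:

import nltk
import pandas as pd

nltk.download('wordnet')  # one-time download required by WordNetLemmatizer

from tweet_preprocessing import tokenize_tweets  # module name is an assumption

df = pd.DataFrame({'tweet': ['RT @user Check this out! https://t.co/xyz #NLP 2023']})
df = tokenize_tweets(df)
print(df.tokens[0])  # expected to print something like 'check'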