Complete tweet preprocessing file with all the functions needed to clean and tokenize tweets.
import re

import gensim
import pandas as pd
from nltk.stem import WordNetLemmatizer

punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'  # punctuation symbols to strip from tweets
# Functions to clean tweets
def remove_links(tweet):
    """Takes a string and removes web links from it"""
    tweet = re.sub(r'http\S+', '', tweet)  # remove http links
    tweet = re.sub(r'bit\.ly/\S+', '', tweet)  # remove bit.ly links
    tweet = re.sub(r'\[link\]', '', tweet)  # remove [link] placeholders (str.strip only trims characters at the ends)
    tweet = re.sub(r'pic\.twitter\S+', '', tweet)  # remove pic.twitter links
    return tweet
def remove_users(tweet):
    """Takes a string and removes retweet and @user information"""
    tweet = re.sub(r'(RT\s@[A-Za-z]+[A-Za-z0-9_-]+)', '', tweet)  # remove retweet header
    tweet = re.sub(r'(@[A-Za-z]+[A-Za-z0-9_-]+)', '', tweet)  # remove @mentions
    return tweet
def remove_hashtags(tweet):
    """Takes a string and removes any hashtags"""
    tweet = re.sub(r'(#[A-Za-z]+[A-Za-z0-9_-]+)', '', tweet)  # remove hashtags
    return tweet
def remove_av(tweet):
    """Takes a string and removes AUDIO/VIDEO tags or labels"""
    tweet = re.sub('VIDEO:', '', tweet)  # remove 'VIDEO:' labels
    tweet = re.sub('AUDIO:', '', tweet)  # remove 'AUDIO:' labels
    return tweet
def tokenize(tweet):
    """Returns tokenized representation of words in lemma form, excluding stopwords"""
    result = []
    for token in gensim.utils.simple_preprocess(tweet):
        if token not in gensim.parsing.preprocessing.STOPWORDS \
                and len(token) > 2:  # drop words with fewer than 3 characters
            result.append(lemmatize(token))
    return result

lemmatizer = WordNetLemmatizer()  # instantiate once instead of once per token

def lemmatize(token):
    """Returns lemmatization of a token"""
    return lemmatizer.lemmatize(token, pos='v')
def preprocess_tweet(tweet):
    """Main master function to clean tweets, stripping noisy characters and tokenizing using lemmatization"""
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = remove_hashtags(tweet)
    tweet = remove_av(tweet)
    tweet = tweet.lower()  # lowercase
    tweet = re.sub('[' + punctuation + ']+', ' ', tweet)  # strip punctuation
    tweet = re.sub(r'\s+', ' ', tweet)  # collapse repeated whitespace
    tweet = re.sub('([0-9]+)', '', tweet)  # remove numbers
    tweet_token_list = tokenize(tweet)  # apply lemmatization and tokenization
    tweet = ' '.join(tweet_token_list)
    return tweet
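# A quick, hypothetical sanity check (the exact output depends on gensim's
# stopword list filtering tokens such as 'this' and 'out'):
# >>> preprocess_tweet("RT @user Check this out! https://t.co/xyz #NLP 2023")
# 'check'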
def basic_clean(tweet):
    """Main master function to clean tweets only, without tokenization or removal of stopwords"""
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = remove_hashtags(tweet)
    tweet = remove_av(tweet)
    tweet = tweet.lower()  # lowercase
    tweet = re.sub('[' + punctuation + ']+', ' ', tweet)  # strip punctuation
    tweet = re.sub(r'\s+', ' ', tweet)  # collapse repeated whitespace
    tweet = re.sub('([0-9]+)', '', tweet)  # remove numbers
    tweet = re.sub('📝 …', '', tweet)  # remove memo-emoji/ellipsis artifact left by truncated tweets
    return tweet
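# For comparison, basic_clean keeps stopwords and word forms intact
# (hypothetical tweet; note the surrounding spaces are preserved):
# >>> basic_clean("RT @user Check this out! https://t.co/xyz #NLP 2023")
# ' check this out '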
def tokenize_tweets(df):
    """Main function to read in and return a cleaned and preprocessed dataframe.

    This can be used in Jupyter notebooks by importing this module and calling tokenize_tweets().

    Args:
        df: dataframe to apply the cleaning to; must have a 'tweet' column

    Returns:
        pandas dataframe with a new 'tokens' column of cleaned, lemmatized text
    """
    df['tokens'] = df.tweet.apply(preprocess_tweet)
    num_tweets = len(df)
    print('Complete. Number of tweets cleaned and tokenized: {}'.format(num_tweets))
    return df
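A minimal usage sketch, assuming this file is saved as tweet_preprocessing.py, NLTK's WordNet data has been downloaded, and the dataframe has a tweet column:

import nltk
import pandas as pd

nltk.download('wordnet')  # one-time download required by WordNetLemmatizer

from tweet_preprocessing import tokenize_tweets  # module name is an assumption

df = pd.DataFrame({'tweet': ['RT @user Check this out! https://t.co/xyz #NLP 2023']})
df = tokenize_tweets(df)
print(df.tokens[0])  # expected to print something like 'check'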