Last active
January 27, 2020 13:51
-
-
Save gracecarrillo/53ab0c64121514abe02a74e483fd29ce to your computer and use it in GitHub Desktop.
The objective of this step is to remove noise that is less relevant for finding the sentiment of tweets, such as punctuation, special characters, numbers, and terms which don’t carry much weight in the context of the text.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#----------------------------- ETL ---------------------------------------#
# Pipeline-like preprocessing with helper functions.

# Fetch the NLTK English stopword corpus (a no-op if already downloaded).
nltk.download('stopwords')
# English stopword list, built once at module import time.
stop_words = stopwords.words('english')

# cleaning helper function -----------------------------#
def processTweet(tweet):
    """
    Bulk-clean a raw tweet string.

    Performs, in order:
      1. Strips HTML character entities, @-mentions, cashtags,
         hyperlinks and hashtags.
      2. Lowercases the text and drops words of 2 or fewer letters.
      3. Collapses runs of whitespace and strips leading spaces.
      4. Removes characters outside Unicode's Basic Multilingual Plane.

    Returns the cleaned tweet as a single string.
    """
    # Remove HTML character entities (e.g. "&amp;", "&lt;").
    tweet = re.sub(r'&\w*;', '', tweet)
    # Remove @-mentions entirely (the "@" and the handle).
    tweet = re.sub(r'@[^\s]+', '', tweet)
    # Remove tickers/cashtags (e.g. "$AAPL").
    tweet = re.sub(r'\$\w*', '', tweet)
    # Normalise case before the remaining pattern matches.
    tweet = tweet.lower()
    # Remove hyperlinks, matching up to the next whitespace. The previous
    # greedy pattern r'https?:\/\/.*\/\w*' could swallow everything up to
    # the LAST "/" in the tweet, and missed URLs that have no path.
    tweet = re.sub(r'https?://\S+', '', tweet)
    # Remove hashtags (the "#" and the tag text).
    tweet = re.sub(r'#\w*', '', tweet)
    # Remove words with 2 or fewer letters (e.g. "rt", "at").
    tweet = re.sub(r'\b\w{1,2}\b', '', tweet)
    # Collapse runs of whitespace (including newlines) to a single space.
    tweet = re.sub(r'\s\s+', ' ', tweet)
    # Remove any single space remaining at the front of the tweet.
    tweet = tweet.lstrip(' ')
    # Remove characters beyond the Basic Multilingual Plane (e.g. emoji).
    tweet = ''.join(c for c in tweet if c <= '\uFFFF')
    return tweet
# tokenize helper function ------------------------------------#
def text_process(tweet):
    """
    Tokenize a cleaned tweet.

    1. Removes all punctuation characters.
    2. Lowercases the text and splits it on whitespace.
    3. Filters out English stopwords.

    Returns a list of the remaining tokens.
    """
    # Strip punctuation in a single C-level pass (the original built a
    # per-character list and joined it back together).
    nopunc = tweet.translate(str.maketrans('', '', string.punctuation))
    # Build the stopword set once per call: the original re-fetched the
    # corpus and linearly scanned the whole list for every single word.
    stop_set = set(stopwords.words('english'))
    # Lowercase once, split, and drop stopwords (set lookup is O(1); the
    # original also re-lowered each already-lowercased word).
    return [word for word in nopunc.lower().split() if word not in stop_set]
# Lexicon normalisation with Stemming ----------------------------------------------#
def stemming(tokens):
    """
    Normalise a list of tokens with the Porter stemmer.

    1. Replaces each word with its root form according to the Porter
       stemming rules.
    2. Returns the normalised text as a single space-joined string.
    """
    porter = PorterStemmer()
    return ' '.join(porter.stem(token) for token in tokens)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment