Last active
January 27, 2020 13:51
-
-
Save gracecarrillo/53ab0c64121514abe02a74e483fd29ce to your computer and use it in GitHub Desktop.
The objective of this step is to remove noise that is less relevant for finding the sentiment of tweets, such as punctuation, special characters, numbers, and terms which don’t carry much weight in the context of the text.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#----------------------------- ETL ---------------------------------------#
# Pipeline-like preprocessing with helper functions.

# Fetch the NLTK English stopword corpus (a no-op if already downloaded).
nltk.download('stopwords')
# English stopword list, built once at module import time.
stop_words = stopwords.words('english')

# cleaning helper function -----------------------------#
def processTweet(tweet):
    """
    Bulk-clean a raw tweet string.

    Performs, in order:
      1. Strips HTML character entities, @-mentions, cashtags,
         hyperlinks and hashtags.
      2. Lowercases the text and drops words of 2 or fewer letters.
      3. Collapses runs of whitespace and strips leading spaces.
      4. Removes characters outside Unicode's Basic Multilingual Plane.

    Returns the cleaned tweet as a single string.
    """
    # Remove HTML character entities (e.g. "&amp;", "&lt;").
    tweet = re.sub(r'&\w*;', '', tweet)
    # Remove @-mentions entirely (the "@" and the handle).
    tweet = re.sub(r'@[^\s]+', '', tweet)
    # Remove tickers/cashtags (e.g. "$AAPL").
    tweet = re.sub(r'\$\w*', '', tweet)
    # Normalise case before the remaining pattern matches.
    tweet = tweet.lower()
    # Remove hyperlinks, matching up to the next whitespace. The previous
    # greedy pattern r'https?:\/\/.*\/\w*' could swallow everything up to
    # the LAST "/" in the tweet, and missed URLs that have no path.
    tweet = re.sub(r'https?://\S+', '', tweet)
    # Remove hashtags (the "#" and the tag text).
    tweet = re.sub(r'#\w*', '', tweet)
    # Remove words with 2 or fewer letters (e.g. "rt", "at").
    tweet = re.sub(r'\b\w{1,2}\b', '', tweet)
    # Collapse runs of whitespace (including newlines) to a single space.
    tweet = re.sub(r'\s\s+', ' ', tweet)
    # Remove any single space remaining at the front of the tweet.
    tweet = tweet.lstrip(' ')
    # Remove characters beyond the Basic Multilingual Plane (e.g. emoji).
    tweet = ''.join(c for c in tweet if c <= '\uFFFF')
    return tweet
# tokenize helper function ------------------------------------#
def text_process(tweet):
    """
    Tokenize a cleaned tweet.

    1. Removes all punctuation characters.
    2. Lowercases the text and splits it on whitespace.
    3. Filters out English stopwords.

    Returns a list of the remaining tokens.
    """
    # Strip punctuation in a single C-level pass (the original built a
    # per-character list and joined it back together).
    nopunc = tweet.translate(str.maketrans('', '', string.punctuation))
    # Build the stopword set once per call: the original re-fetched the
    # corpus and linearly scanned the whole list for every single word.
    stop_set = set(stopwords.words('english'))
    # Lowercase once, split, and drop stopwords (set lookup is O(1); the
    # original also re-lowered each already-lowercased word).
    return [word for word in nopunc.lower().split() if word not in stop_set]
# Lexicon normalisation with Stemming ----------------------------------------------#
def stemming(tokens):
    """
    Normalise a list of tokens with the Porter stemmer.

    1. Replaces each word with its root form according to the Porter
       stemming rules.
    2. Returns the normalised text as a single space-joined string.
    """
    porter = PorterStemmer()
    return ' '.join(porter.stem(token) for token in tokens)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment