Last active
October 31, 2018 09:59
-
-
Save DFoly/5d335e6a28ac1b14bd9f439ca1268338 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def clean_tweets(self, df):
    """
    Clean the raw tweets in ``df['tweet']`` so we can carry out analysis.

    Strips non-letter characters and noise tokens (rt/RT, http, co),
    lowercases, removes English stopwords, and lemmatizes the
    remaining words.

    Adds two columns to ``df``:
      - ``clean_tweets``: the cleaned, space-joined words
      - ``len``: character length of each cleaned tweet

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``tweet`` column of raw tweet strings.

    Returns
    -------
    pandas.DataFrame
        The same DataFrame, mutated in place with the two new columns.
    """
    # set() gives O(1) stopword membership tests inside the loop
    stopword_list = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()

    # Compile once, outside the loop (the original rebuilt the pattern
    # every iteration).  \b word boundaries fix a bug where the bare
    # alternatives 'rt', 'http', 'co' were stripped out of the MIDDLE
    # of real words (e.g. 'start' -> 'sta t').
    exclusions = re.compile(r'[^a-zA-Z]|\brt\b|\bhttp\b|\bco\b|\bRT\b')

    cleaned = []
    # Iterate values directly instead of range(len(...)) indexing, so a
    # non-default DataFrame index cannot break the loop.
    for raw in df['tweet']:
        text = exclusions.sub(' ', raw).lower()
        words = [wordnet_lemmatizer.lemmatize(word)
                 for word in text.split()
                 if word not in stopword_list]
        cleaned.append(' '.join(words))

    # Whole-column assignment avoids the chained-assignment
    # (SettingWithCopyWarning) pattern df['clean_tweets'][i] = ...
    df['clean_tweets'] = cleaned
    # Bug fix: the original read from an undefined name `data` here,
    # which raises NameError; it must read back from `df`.
    df['len'] = np.array([len(tweet) for tweet in df['clean_tweets']])
    return df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment