Skip to content

Instantly share code, notes, and snippets.

@DFoly
Last active October 31, 2018 09:59
Show Gist options
  • Save DFoly/5d335e6a28ac1b14bd9f439ca1268338 to your computer and use it in GitHub Desktop.
Save DFoly/5d335e6a28ac1b14bd9f439ca1268338 to your computer and use it in GitHub Desktop.
def clean_tweets(self, df):
    """
    Normalise the raw text in df['tweet'] for downstream analysis.

    For every tweet: replace each non-letter character with a space,
    lower-case the text, split on whitespace, drop English stopwords
    and Twitter/URL noise tokens ('rt', 'http', 'https', 'co'), and
    lemmatize the remaining words with WordNet.

    Two columns are written to ``df`` in place:
      * 'clean_tweets' - the cleaned words joined by single spaces
      * 'len'          - character length of each cleaned tweet

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'tweet' column. Cells that are not strings
        (e.g. NaN) are treated as empty tweets.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two columns added/overwritten.
    """
    # Noise is matched as whole tokens, after splitting, so that ordinary
    # words merely containing these letters ("comment", "party") are kept
    # intact. 'https' is listed explicitly because it is no longer being
    # truncated by a substring match on 'http'.
    noise_tokens = {'rt', 'http', 'https', 'co'}
    # A set gives O(1) membership tests inside the per-word filter.
    dropped_words = set(stopwords.words('english')) | noise_tokens
    wordnet_lemmatizer = WordNetLemmatizer()
    # Compiled once: anything that is not an ASCII letter becomes a space.
    non_letters = re.compile(r'[^a-zA-Z]')

    cleaned = []
    # Iterate over the values directly instead of df['tweet'][i]; the latter
    # is label-based and breaks on a non-default index.
    for raw in df['tweet']:
        text = raw if isinstance(raw, str) else ''
        text = non_letters.sub(' ', text).lower()
        words = [
            wordnet_lemmatizer.lemmatize(word)
            for word in text.split()
            if word not in dropped_words
        ]
        cleaned.append(' '.join(words))

    # Assign whole columns at once; chained item assignment
    # (df[col][i] = ...) may write to a temporary copy.
    df['clean_tweets'] = cleaned
    df['len'] = [len(tweet) for tweet in cleaned]
    return df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment