Skip to content

Instantly share code, notes, and snippets.

@ravikiranj
Last active December 9, 2020 02:57
Show Gist options
  • Save ravikiranj/2639031 to your computer and use it in GitHub Desktop.
Save ravikiranj/2639031 to your computer and use it in GitHub Desktop.
preprocess tweets
#import regex
import re
#start process_tweet
def processTweet(tweet):
    """Normalize a raw tweet string.

    Steps, in order: lowercase the text, replace links with the token
    ``URL``, replace @mentions with the token ``AT_USER``, collapse every
    run of whitespace to a single space, drop the leading ``#`` from
    hashtags, and trim surrounding spaces and quote characters.

    Args:
        tweet: The raw tweet text; may carry a trailing newline when it
            comes straight from a file.

    Returns:
        The normalized tweet string.
    """
    # Lowercasing happens before the placeholder substitutions, so the
    # URL and AT_USER tokens remain upper-case in the output.
    tweet = tweet.lower()
    # Convert www.* or https?://* to URL
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    # Convert @username to AT_USER
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)
    # Collapse whitespace runs (newlines and tabs included) to one space
    tweet = re.sub(r'[\s]+', ' ', tweet)
    # Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # Trim spaces together with quotes: a trailing newline has already
    # become a space above, and stripping quotes alone would leave both
    # that space and the quote hidden behind it in place.
    tweet = tweet.strip(' \'"')
    return tweet
#end
# Read the tweets one line at a time, normalize each one, and print it.
# The with-block closes the file even if processing or printing raises.
with open('data/sampleTweets.txt', 'r') as fp:
    for line in fp:
        processedTweet = processTweet(line)
        # Parenthesized single-argument print is valid on Python 2 and 3.
        print(processedTweet)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment