Skip to content

Instantly share code, notes, and snippets.

@vickyqian
Last active April 10, 2020 10:48
Show Gist options
  • Save vickyqian/24d2fb7dde4328f3c2ed868511209f70 to your computer and use it in GitHub Desktop.
Save vickyqian/24d2fb7dde4328f3c2ed868511209f70 to your computer and use it in GitHub Desktop.
Preprocess Tweets
###Preprocess tweets
def processTweet2(tweet):
    """Normalize a raw tweet string for downstream text processing.

    The steps are applied in this order:

    1. Lower-case the whole tweet.
    2. Replace ``www.*`` and ``http(s)://*`` tokens with the literal ``URL``.
    3. Replace ``@username`` tokens with the literal ``AT_USER``.
    4. Collapse every run of whitespace into a single space.
    5. Drop the leading ``#`` from hashtags, keeping the word itself.
    6. Strip single- and double-quote characters from both ends.

    Args:
        tweet: The tweet text as a string.

    Returns:
        The normalized tweet. Only quote characters are stripped from the
        ends; a leading or trailing space (after collapsing) is kept.
    """
    # Imported locally because no module-level ``import re`` is visible in
    # this snippet; this keeps the function usable on its own.
    import re

    tweet = tweet.lower()
    # Raw strings so that ``\.`` and ``\s`` reach the regex engine instead of
    # being parsed as (invalid) string escape sequences.
    # Lower-casing happens first, so the upper-case placeholders survive.
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)
    # Collapse whitespace runs (spaces, tabs, newlines) into one space.
    tweet = re.sub(r'[\s]+', ' ', tweet)
    # "#word" -> "word"
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # Trim surrounding quote characters only (not whitespace).
    return tweet.strip('\'"')
###get stopword list
def getStopWordList(stopWordListFileName):
    """Read a stop-word file and return the stop words as a list.

    The list always starts with the placeholder tokens ``'AT_USER'`` and
    ``'URL'``, followed by one entry per line of the file with surrounding
    whitespace stripped. Blank lines therefore contribute an empty string.

    Args:
        stopWordListFileName: Path of a text file with one stop word per line.

    Returns:
        A list of stop-word strings.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    stopWords = ['AT_USER', 'URL']
    # ``with`` guarantees the handle is closed even if reading fails.
    with open(stopWordListFileName, 'r') as fp:
        stopWords.extend(line.strip() for line in fp)
    return stopWords
# Load the stop-word list once when the module is executed. The relative path
# means 'stopwords.txt' is looked up in the current working directory.
# getStopWordList opens and closes the file itself, so no separate handle is
# opened here.
stopWords = getStopWordList('stopwords.txt')
def replaceTwoOrMore(s):
    """Squeeze every run of a repeated character down to two occurrences.

    A run of two or more identical characters is replaced by exactly two
    copies of that character (e.g. ``'hellooooo'`` -> ``'helloo'``); runs of
    length two are left as they are. Newlines count as characters too,
    because the pattern is matched with ``re.DOTALL``.

    Args:
        s: The input string.

    Returns:
        The string with long character runs shortened to length two.
    """
    # Imported locally because no module-level ``import re`` is visible in
    # this snippet; this keeps the function usable on its own.
    import re

    # (.)   captures any single character;
    # \1{1,} requires at least one more copy of it right after.
    return re.sub(r"(.)\1{1,}", r"\1\1", s, flags=re.DOTALL)
#end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment