Skip to content

Instantly share code, notes, and snippets.

@mjcreativeventures
Created February 15, 2016 05:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save mjcreativeventures/63db5fe6e70b3e468ad3 to your computer and use it in GitHub Desktop.
Save mjcreativeventures/63db5fe6e70b3e468ad3 to your computer and use it in GitHub Desktop.
Supervised Classification of Tweets
import nltk
import random
import re
# Names of MRT stations. The entries are joined into a single alternation
# pattern (see mrt_station_re) so that any station mentioned in a tweet can be
# collapsed into one placeholder token.
STATIONS = [
    'Admiralty MRT',
    'Aljunied MRT',
    'Ang Mo Kio MRT',
    'Bartley MRT',
    'Bayfront MRT',
    'Bedok MRT',
    'Bishan MRT',
    'Bras Basah MRT',
    'Botanic Gardens MRT',
    'Braddell MRT',
    'Bukit Batok MRT',
    'Bukit Gombak MRT',
    'Caldecott MRT',
    'Choa Chu Kang MRT',
    'Boon Keng MRT',
    'Boon Lay MRT',
    'Buangkok MRT',
    'Bugis MRT',
    'Buona Vista MRT',
    'Changi Airport MRT',
    'Chinatown MRT',
    'Clarke Quay MRT',
    'Chinese Garden MRT',
    'City Hall MRT',
    'Clementi MRT',
    'Commonwealth MRT',
    'Dakota MRT',
    'Dhoby Ghaut MRT',
    'Dover MRT',
    'Esplanade MRT',
    'Eunos MRT',
    'Expo MRT',
    'Farrer Park MRT',
    'Farrer Road MRT',
    'HarbourFront MRT',
    'Haw Par Villa MRT',
    'Holland Village MRT',
    'Hougang MRT',
    'Joo Koon MRT',
    'Jurong East MRT',
    'Kallang MRT',
    'Kovan MRT',
    'Kembangan MRT',
    'Kent Ridge MRT',
    'Khatib MRT',
    'Kranji MRT',
    'Lakeside MRT',
    'Labrador Park MRT',
    'Lavender MRT',
    'Little India MRT',
    'Lorong Chuan MRT',
    'Marina Bay MRT',
    'Marsiling MRT',
    'MacPherson MRT',
    'Marymount MRT',
    'Mountbatten MRT',
    'Newton MRT',
    'Nicoll Highway MRT',
    'one-north MRT',
    'Novena MRT',
    'Orchard MRT',
    'Outram Park MRT',
    'Pasir Ris MRT',
    'Pasir Panjang MRT',
    'Paya Lebar MRT',
    'Pioneer MRT',
    'Potong Pasir MRT',
    'Promenade MRT',
    'Punggol MRT',
    'Queenstown MRT',
    'Raffles Place MRT',
    'Redhill MRT',
    'Sembawang MRT',
    'Sengkang MRT',
    'Serangoon MRT',
    'Simei MRT',
    'Somerset MRT',
    'Stadium MRT',
    'Tampines MRT',
    'Tai Seng MRT',
    'Tanah Merah MRT',
    'Tanjong Pagar MRT',
    'Tiong Bahru MRT',
    'Telok Blangah MRT',
    'Toa Payoh MRT',
    'Woodlands MRT',
    'Woodleigh MRT',
    'Yew Tree MRT',
    'Yio Chu Kang MRT',
    'Yishun MRT',
]
# Regular expressions used to clean up the tweet data.
# normalize_tweet() lower-cases the text before applying them, which is why
# the station names are lower-cased here as well.

# Any known station name. Each name is escaped so that it is always matched
# literally, even if it contains a regex metacharacter.
mrt_station_re = re.compile(
    '|'.join(re.escape(name.lower()) for name in STATIONS)
)
# A link preceded by whitespace; both http:// and https:// are recognised.
http_re = re.compile(r'\s+https?://[^\s]*')
# A literal three-dot ellipsis.
remove_ellipsis_re = re.compile(r'\.\.\.')
# An @-mention: '@' followed by a run of non-whitespace characters.
at_sign_re = re.compile(r'\@\S+')
# Individual punctuation characters to strip.
punct_re = re.compile(r"[\"'\[\],.:;()\-&!]")
# A decimal amount with exactly two digits after the point, e.g. 3.50.
price_re = re.compile(r"\d+\.\d\d")
# Any run of digits.
number_re = re.compile(r"\d+")
# converts to lower case and clean up the text
def normalize_tweet(tweet):
    """Return a lower-cased copy of *tweet* with noisy spans collapsed.

    The substitutions run in a fixed order, and that order matters: prices,
    ellipses and links are rewritten before punctuation is stripped (their
    patterns rely on '.' and ':' still being present), and bare numbers are
    replaced last so that the digits inside a price are not consumed first.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    steps = (
        (price_re, 'PRICE'),
        (remove_ellipsis_re, ''),
        (mrt_station_re, 'MRT_STATION'),
        (http_re, ' LINK'),
        (punct_re, ''),
        (at_sign_re, '@'),
        (number_re, 'NUM'),
    )
    cleaned = tweet.lower()
    for pattern, replacement in steps:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned
def tweet_features(tweet_data):
    """Build the bigram feature dict for one labelled tweet.

    *tweet_data* is a mapping with a 'tweet' entry holding the raw text.
    The text is normalized, split into whitespace-separated tokens, and every
    pair of adjacent tokens becomes a feature named 'contains(a,b)' with the
    value True. A tweet with fewer than two tokens yields an empty dict.
    """
    features = {}
    tweet = normalize_tweet(tweet_data['tweet'])
    # split() with no argument collapses runs of whitespace. Splitting on a
    # single ' ' produced empty-string tokens wherever two spaces met (for
    # example after punctuation between two spaces was removed), which turned
    # into meaningless features such as 'contains(a,)'.
    for bigrams in nltk.bigrams(tweet.split()):
        features['contains(%s)' % ','.join(bigrams)] = True
    return features
# reads three lines of text from a file
def read3lines(x):
    """Read the next three lines from the file object *x*.

    Returns a three-element list. The first two lines have surrounding
    whitespace stripped; the third is returned exactly as read. Lines past
    the end of the file come back as empty strings.
    """
    first = x.readline().strip()
    second = x.readline().strip()
    third = x.readline()
    return [first, second, third]
# Load the labelled tweets. The file is read three lines at a time: the tweet
# text, its label, and a third line that is read but not used.
data = []
with open('labelled_tweets.data') as f:
    while True:
        tweet, label, newline = read3lines(f)
        # An empty tweet line (end of file, or a blank line where a tweet
        # was expected) ends the loading.
        if not tweet:
            break
        data.append({'tweet': tweet, 'label': label})
# NOTE(review): shuffling is disabled, so the split below follows the order of
# the records in the input file -- confirm that this is intended.
#random.shuffle(data)

# we split the data into two parts
# the first part (90% of the data) is for training
# the remaining 10% of the data is for testing
size = int(len(data) * 0.9)
train_data = data[:size]
test_data = data[size:]

# generate (features, label) pairs for every tweet
train_set = [(tweet_features(d), d['label']) for d in train_data]
test_set = [(tweet_features(d), d['label']) for d in test_data]

# train a Naive Bayes classifier on the training set, then print its
# 20 most informative features
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.show_most_informative_features(20)
# collect tweets that were wrongly classified
errors = []
for d in test_data:
    label = d['label']
    guess = classifier.classify(tweet_features(d))
    if guess != label:
        errors.append((label, guess, d))

# Report the mistakes ordered by correct label, then guessed label, then tweet
# text. The key is explicit because sorting the raw tuples falls back to
# comparing the dicts when label and guess tie, which raises TypeError on
# Python 3.
for (label, guess, d) in sorted(errors, key=lambda e: (e[0], e[1], e[2]['tweet'])):
    # print(...) with one pre-formatted string gives the same output on
    # Python 2 and Python 3.
    print('correct label: %s\nguessed label: %s\ntweet=%s\n' % (label, guess, d['tweet']))
print('Total errors: %d' % len(errors))
# Two spaces after the colon reproduce the output of the original
# "print 'Accuracy: ', value" statement.
print('Accuracy:  %s' % nltk.classify.accuracy(classifier, test_set))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment