mjcreativeventures/tweet_classifier.py

## tweet_classifier.py
import nltk
import random
import re

# MRT station names, each suffixed with " MRT".  The names are lower-cased and
# joined into one alternation regex (mrt_station_re), so every entry has to be
# spelled exactly as it should be matched in a tweet: single spaces only.
STATIONS = [
    'Admiralty MRT',
    'Aljunied MRT',
    'Ang Mo Kio MRT',
    'Bartley MRT',
    'Bayfront MRT',
    'Bedok MRT',
    'Bishan MRT',
    'Bras Basah MRT',
    'Botanic Gardens MRT',
    'Braddell MRT',
    'Bukit Batok MRT',
    'Bukit Gombak MRT',
    'Caldecott MRT',
    'Choa Chu Kang MRT',
    'Boon Keng MRT',
    'Boon Lay MRT',
    'Buangkok MRT',
    'Bugis MRT',
    'Buona Vista MRT',
    'Changi Airport MRT',
    'Chinatown MRT',
    'Clarke Quay MRT',
    'Chinese Garden MRT',
    'City Hall MRT',
    'Clementi MRT',
    'Commonwealth MRT',
    'Dakota MRT',
    'Dhoby Ghaut MRT',
    'Dover MRT',
    'Esplanade MRT',
    'Eunos MRT',
    'Expo MRT',
    'Farrer Park MRT',
    'Farrer Road MRT',
    'HarbourFront MRT',
    'Haw Par Villa MRT',
    'Holland Village MRT',
    'Hougang MRT',
    'Joo Koon MRT',
    'Jurong East MRT',
    'Kallang MRT',
    'Kovan MRT',
    'Kembangan MRT',
    'Kent Ridge MRT',
    'Khatib MRT',
    'Kranji MRT',
    'Lakeside MRT',
    'Labrador Park MRT',
    'Lavender MRT',
    'Little India MRT',
    'Lorong Chuan MRT',
    'Marina Bay MRT',
    'Marsiling MRT',
    'MacPherson MRT',
    'Marymount MRT',
    'Mountbatten MRT',
    'Newton MRT',
    'Nicoll Highway MRT',
    'one-north MRT',
    'Novena MRT',
    'Orchard MRT',
    'Outram Park MRT',
    'Pasir Ris MRT',
    'Pasir Panjang MRT',
    'Paya Lebar MRT',
    'Pioneer MRT',
    'Potong Pasir MRT',
    'Promenade MRT',
    'Punggol MRT',
    'Queenstown MRT',
    'Raffles Place MRT',
    'Redhill MRT',
    'Sembawang MRT',
    'Sengkang MRT',
    'Serangoon MRT',
    'Simei MRT',
    'Somerset MRT',
    'Stadium MRT',
    'Tampines MRT',
    'Tai Seng MRT',
    'Tanah Merah MRT',
    'Tanjong Pagar MRT',
    'Tiong Bahru MRT',
    'Telok Blangah MRT',
    'Toa Payoh MRT',
    'Woodlands MRT',
    'Woodleigh MRT',
    'Yew Tree MRT',
    'Yio Chu Kang MRT',
    'Yishun MRT',
]

# regular expressions used to clean up the tweet data

# One alternative per station name.  Each name is escaped so it is always
# matched literally, whatever characters it happens to contain.
mrt_station_re = re.compile('|'.join(re.escape(name.lower()) for name in STATIONS))
# A link preceded by whitespace.  Both http:// and https:// are accepted;
# a link that is not replaced here has its ":" and "." stripped by punct_re
# and ends up as a token unique to that tweet.
http_re = re.compile(r'\s+https?://[^\s]*')
# a literal "..."
remove_ellipsis_re = re.compile(r'\.\.\.')
# "@" followed by a run of non-whitespace characters
at_sign_re = re.compile(r'\@\S+')
# single punctuation characters that are dropped from the text
punct_re = re.compile(r"[\"'\[\],.:;()\-&!]")
# digits, a dot and exactly two more digits, e.g. 12.50
price_re = re.compile(r"\d+\.\d\d")
# any run of digits
number_re = re.compile(r"\d+")

# lower-cases a tweet and swaps variable tokens for fixed placeholders
def normalize_tweet(tweet):
    """Return a cleaned-up, lower-cased copy of `tweet`.

    The substitutions are applied in a fixed order: prices become PRICE,
    "..." is removed, station names become MRT_STATION, links become
    " LINK", punctuation is stripped, @mentions collapse to "@" and any
    remaining digit runs become NUM.
    """
    # (pattern, replacement) pairs, listed in the order they must run
    substitutions = (
        (price_re, 'PRICE'),
        (remove_ellipsis_re, ''),
        (mrt_station_re, 'MRT_STATION'),
        (http_re, ' LINK'),
        (punct_re, ''),
        (at_sign_re, '@'),
        (number_re, 'NUM'),
    )

    cleaned = tweet.lower()
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned

def tweet_features(tweet_data):
    """Build the feature dict for one tweet record.

    `tweet_data` is a dict whose 'tweet' entry holds the raw text.  The text
    is normalized and split on single spaces (so consecutive spaces yield
    empty-string tokens); every pair of adjacent tokens becomes a
    'contains(first,second)' key mapped to True.
    """
    tokens = normalize_tweet(tweet_data['tweet']).split(' ')
    return dict(
        ('contains(%s)' % ','.join(pair), True)
        for pair in nltk.bigrams(tokens)
    )

# reads three lines of text from a file
def read3lines(x):
    """Read the next three lines from the open file object `x`.

    Returns a three-item list: the first two lines with surrounding
    whitespace stripped, and the third line exactly as it was read.
    At end of file the missing lines come back as empty strings.
    """
    first = x.readline().strip()
    second = x.readline().strip()
    third = x.readline()
    return [first, second, third]

# load the labelled tweets; the file is read in groups of three lines:
# tweet text, label, and a third line that is read but not used
data = []
with open('labelled_tweets.data') as f:
    while True:
        tweet, label, newline = read3lines(f)
        # an empty tweet line (or end of file) ends the input
        if len(tweet) == 0:
            break
        data.append({ 'tweet': tweet, 'label': label })

# shuffling is disabled, so the split below follows the order of the file
#random.shuffle(data)

# hold out the tail of the data for testing:
# the first 90% of the records are used for training,
# the remaining 10% are used for testing
size = int(len(data) * 0.9)
train_data, test_data = data[:size], data[size:]


def _labelled_features(records):
    """Pair each record's feature dict with its label."""
    return [(tweet_features(rec), rec['label']) for rec in records]


# generate (features, label) pairs for both parts
train_set = _labelled_features(train_data)
test_set = _labelled_features(test_data)

# pick a classifier; change this one line to try a different classifier class
classifier_type = nltk.NaiveBayesClassifier

# train the chosen classifier on the training set
# (previously `classifier` was bound to the class and then immediately
# overwritten by a hard-coded NaiveBayesClassifier.train call, so the
# "pick" line had no effect)
classifier = classifier_type.train(train_set)

# report the 20 most informative features of the trained model
classifier.show_most_informative_features(20)

# collect tweets that were wrongly classified
errors = []
for d in test_data:
    label = d['label']
    guess = classifier.classify(tweet_features(d))
    if guess != label:
        errors.append( (label, guess, d) )

# Sort on an explicit key instead of on the raw tuples: each tuple ends with
# a dict, and dicts cannot be ordered on Python 3, so a tie on (label, guess)
# would raise TypeError there.  Breaking ties on the tweet text keeps the
# report in a deterministic order.
for (label, guess, d) in sorted(errors, key=lambda e: (e[0], e[1], e[2]['tweet'])):
    print('correct label: %s\nguessed label: %s\ntweet=%s\n' % (label, guess, d['tweet']))

# print(...) with one pre-formatted string gives the same output on
# Python 2 and Python 3
print('Total errors: %d' % len(errors))

# the two spaces after the colon reproduce the separator that the old
# two-item print statement emitted between its items
print('Accuracy:  %s' % nltk.classify.accuracy(classifier, test_set))
	import nltk
	import random
	import re

	# MRT station names, each suffixed with " MRT"; they are lower-cased and
	# joined into the mrt_station_re pattern
	STATIONS = [
		'Admiralty MRT',
		'Aljunied MRT',
		'Ang Mo Kio MRT',
		'Bartley MRT',
		'Bayfront MRT',
		'Bedok MRT',
		'Bishan MRT',
		'Bras Basah MRT',
		'Botanic Gardens MRT',
		'Braddell MRT',
		'Bukit Batok MRT',
		'Bukit Gombak MRT',
		'Caldecott MRT',
		'Choa Chu Kang MRT',
		'Boon Keng MRT',
		'Boon Lay MRT',
		'Buangkok MRT',
		'Bugis MRT',
		'Buona Vista MRT',
		'Changi Airport MRT',
		'Chinatown MRT',
		'Clarke Quay MRT',
		'Chinese Garden MRT',
		'City Hall MRT',
		'Clementi MRT',
		'Commonwealth MRT',
		'Dakota MRT',
		'Dhoby Ghaut MRT',
		'Dover MRT',
		'Esplanade MRT',
		'Eunos MRT',
		'Expo MRT',
		'Farrer Park MRT',
		'Farrer Road MRT',
		'HarbourFront MRT',
		'Haw Par Villa MRT',
		'Holland Village MRT',
		'Hougang MRT',
		'Joo Koon MRT',
		'Jurong East MRT',
		'Kallang MRT',
		'Kovan MRT',
		'Kembangan MRT',
		'Kent Ridge MRT',
		'Khatib MRT',
		'Kranji MRT',
		'Lakeside MRT',
		'Labrador Park MRT',
		'Lavender MRT',
		'Little India MRT',
		'Lorong Chuan MRT',
		'Marina Bay MRT',
		'Marsiling MRT',
		'MacPherson MRT',
		'Marymount MRT',
		'Mountbatten MRT',
		'Newton MRT',
		'Nicoll Highway MRT',
		'one-north MRT',
		'Novena MRT',
		'Orchard MRT',
		'Outram Park MRT',
		'Pasir Ris MRT',
		'Pasir Panjang MRT',
		'Paya Lebar MRT',
		'Pioneer MRT',
		'Potong Pasir MRT',
		'Promenade MRT',
		'Punggol MRT',
		'Queenstown MRT',
		'Raffles Place MRT',
		'Redhill MRT',
		'Sembawang MRT',
		'Sengkang MRT',
		'Serangoon MRT',
		'Simei MRT',
		'Somerset MRT',
		'Stadium MRT',
		'Tampines MRT',
		'Tai Seng MRT',
		'Tanah Merah MRT',
		'Tanjong Pagar MRT',
		'Tiong Bahru MRT',
		'Telok Blangah MRT',
		'Toa Payoh MRT',
		'Woodlands MRT',
		'Woodleigh MRT',
		'Yew Tree MRT',
		'Yio Chu Kang MRT',
		'Yishun MRT',
	]

	# regular expressions used to clean up the tweet data

	# One alternative per station name, joined with a bare "|".  The previous
	# '\|' separator is an escaped pipe, which made the whole pattern one long
	# literal string that could never match a tweet.  Each name is escaped so
	# it is always matched literally.
	mrt_station_re = re.compile('|'.join(re.escape(name.lower()) for name in STATIONS))
	# A link preceded by whitespace.  Both http:// and https:// are accepted;
	# a link that is not replaced here has its ":" and "." stripped by punct_re
	# and ends up as a token unique to that tweet.
	http_re = re.compile(r'\s+https?://[^\s]*')
	# a literal "..."
	remove_ellipsis_re = re.compile(r'\.\.\.')
	# "@" followed by a run of non-whitespace characters
	at_sign_re = re.compile(r'\@\S+')
	# single punctuation characters that are dropped from the text
	punct_re = re.compile(r"[\"'\[\],.:;()\-&!]")
	# digits, a dot and exactly two more digits, e.g. 12.50
	price_re = re.compile(r"\d+\.\d\d")
	# any run of digits
	number_re = re.compile(r"\d+")

	# converts to lower case and cleans up the text
	def normalize_tweet(tweet):
		"""Return a cleaned-up, lower-cased copy of `tweet`.

		The substitutions run in a fixed order: prices become PRICE, "..." is
		removed, station names become MRT_STATION, links become " LINK",
		punctuation is stripped, @mentions collapse to "@" and any remaining
		digit runs become NUM.
		"""
		t = tweet.lower()
		t = price_re.sub('PRICE', t)
		t = remove_ellipsis_re.sub('', t)
		t = mrt_station_re.sub('MRT_STATION', t)
		t = http_re.sub(' LINK', t)
		t = punct_re.sub('', t)
		t = at_sign_re.sub('@', t)
		t = number_re.sub('NUM', t)
		return t

	def tweet_features(tweet_data):
		"""Build the feature dict for one tweet record.

		`tweet_data` is a dict whose 'tweet' entry holds the raw text.  The
		text is normalized and split on single spaces; every pair of adjacent
		tokens becomes a 'contains(first,second)' key mapped to True.
		"""
		features = {}

		tweet = normalize_tweet(tweet_data['tweet'])
		for bigrams in nltk.bigrams(tweet.split(' ')):
			features['contains(%s)' % ','.join(bigrams)] = True

		return features

	# reads three lines of text from a file
	def read3lines(x):
		"""Read the next three lines from the open file object `x`.

		Returns a three-item list: the first two lines with surrounding
		whitespace stripped, and the third line exactly as it was read.
		At end of file the missing lines come back as empty strings.
		"""
		first = x.readline().strip()
		second = x.readline().strip()
		third = x.readline()
		return [first, second, third]

	# load the labelled tweets; the file is read in groups of three lines:
	# tweet text, label, and a third line that is read but not used
	data = []
	with open('labelled_tweets.data') as f:
		tweet, label, newline = read3lines(f)

		# an empty tweet line (or end of file) ends the input
		while len(tweet) > 0:
			data.append({ 'tweet': tweet, 'label': label })
			tweet, label, newline = read3lines(f)

	# shuffling is disabled, so the split below follows the order of the file
	#random.shuffle(data)

	# hold out the tail of the data for testing:
	# the first 90% of the records are used for training,
	# the remaining 10% are used for testing
	size = int(len(data) * 0.9)
	train_data, test_data = data[:size], data[size:]


	def _labelled_features(records):
		"""Pair each record's feature dict with its label."""
		return [(tweet_features(rec), rec['label']) for rec in records]


	# generate (features, label) pairs for both parts
	train_set = _labelled_features(train_data)
	test_set = _labelled_features(test_data)

	# pick a classifier; change this one line to try a different classifier class
	classifier_type = nltk.NaiveBayesClassifier

	# train the chosen classifier on the training set
	# (previously `classifier` was bound to the class and then immediately
	# overwritten by a hard-coded NaiveBayesClassifier.train call, so the
	# "pick" line had no effect)
	classifier = classifier_type.train(train_set)

	# report the 20 most informative features of the trained model
	classifier.show_most_informative_features(20)

	# collect tweets that were wrongly classified
	errors = []
	for d in test_data:
		label = d['label']
		guess = classifier.classify(tweet_features(d))
		if guess != label:
			errors.append( (label, guess, d) )

	# Sort on an explicit key instead of on the raw tuples: each tuple ends
	# with a dict, and dicts cannot be ordered on Python 3, so a tie on
	# (label, guess) would raise TypeError there.  Breaking ties on the tweet
	# text keeps the report in a deterministic order.
	for (label, guess, d) in sorted(errors, key=lambda e: (e[0], e[1], e[2]['tweet'])):
		print('correct label: %s\nguessed label: %s\ntweet=%s\n' % (label, guess, d['tweet']))

	# print(...) with one pre-formatted string gives the same output on
	# Python 2 and Python 3
	print('Total errors: %d' % len(errors))

	# the two spaces after the colon reproduce the separator that the old
	# two-item print statement emitted between its items
	print('Accuracy:  %s' % nltk.classify.accuracy(classifier, test_set))