Twitter account spam/ham classifier
#!/usr/bin/env python
"""
Simple Naive Bayes tweet classifier.
It analyses a number of tweets from a given user and determines whether
that user is a spammer.
"""
from __future__ import division
import json
import re
import os
import pickle
import requests
import numpy
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import StratifiedKFold
from sklearn import cross_validation


def get_user_data(user, count=2):
    """Retrieve the latest tweets of a user, given a username."""
    url = 'https://api.twitter.com/1/statuses/user_timeline.json' \
          '?include_entities=true' \
          '&include_rts=true' \
          '&screen_name=%s' \
          '&count=%d' % (user, count)
    req = requests.get(url)
    # requests < 1.0 exposes the decoded JSON body as a property; on newer
    # versions this would be the method call req.json().
    data = req.json
    assert req.status_code == 200, data['error']
    return data
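
# Each element returned by get_user_data is a tweet dict; the fields used
# below look roughly like this (abridged, illustrative values):
#   {'text': '...', 'entities': {'urls': [...]},
#    'in_reply_to_screen_name': None,
#    'user': {'screen_name': '...', 'followers_count': 0, 'friends_count': 0}}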


def analyse(tweet):
    """Analyse characteristics of a tweet, converting them into numerical values."""
    tweetSize = len(tweet['text'])
    nUrls = len(tweet['entities']['urls'])
    if nUrls == 0:
        ratio = 0
    else:
        if tweet['in_reply_to'] is not None:
            # Fraction of the tweet taken up by an "@reply <url>" prefix
            ratio = len('@' + tweet['in_reply_to'] + ' ' +
                        tweet['entities']['urls'][0]['url']) / tweetSize
        else:
            ratio = 0
    return [
        ratio,
        nUrls,
        len(tweet['screen_name']),
        tweet['screen_name'].count('x'),
        len(re.findall(r'\d', tweet['screen_name'])),
        tweet['followers_count'],
        tweet['friends_count']
    ]
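
# Illustration (hypothetical values): a tweet "@friend http://t.co/abc123"
# (26 characters, one URL) posted by 'spamx99' (7 characters, one 'x', two
# digits) with 10 followers and 500 friends yields
#   [1.0, 1, 7, 1, 2, 10, 500]
# since '@friend ' plus the URL spans the whole tweet (26/26 = 1.0).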


def main(user, count, verbose=False):
    def extract(x):
        """Extract relevant characteristics from a tweet."""
        return {
            u'screen_name': x['user']['screen_name'],
            u'text': x['text'],
            u'in_reply_to': x['in_reply_to_screen_name'],
            u'entities': x['entities'],
            u'followers_count': x['user']['followers_count'],
            u'friends_count': x['user']['friends_count']
        }

    user_data = get_user_data(user, count)
    user_data = map(extract, user_data)
    user_data = map(analyse, user_data)

    model_file = 'curbstomp_model.pkl'
    if os.path.exists(model_file):
        with open(model_file) as f:
            model = pickle.load(f)
    else:
        with open('spam.json') as f:
            spam = map(extract, json.loads(f.read()))
        spam = map(analyse, spam)
        with open('ham.json') as f:
            ham = map(extract, json.loads(f.read()))
        ham = map(analyse, ham)

        training_data = numpy.array(spam + ham)
        classes = numpy.array(len(spam) * [1] + len(ham) * [0])

        # Old scikit-learn API: StratifiedKFold takes the label array and the
        # number of folds, preserving the spam/ham proportions in each fold.
        skf = StratifiedKFold(classes, 2)
        gnb = GaussianNB()
        for train_index, test_index in skf:
            x_train, x_test = training_data[train_index], training_data[test_index]
            y_train, y_test = classes[train_index], classes[test_index]
            model = gnb.fit(x_train, y_train)
            pred = model.predict(x_test)
            print "Number of mislabeled points : %d" % (y_test != pred).sum()
# print "Verdict: %s" % model.predict(user_data).sum()
# Export model
with open(model_file, 'w') as f:
pickle.dump(model, f)

        if verbose:
            print "Verdict (spam): %s" % model.predict(spam)
            print "Verdict (ham): %s" % model.predict(ham)

        # cross_val_score refits a fresh copy of the estimator on each fold
        # and returns one accuracy score per fold.
        scores = cross_validation.cross_val_score(gnb, training_data, classes, cv=2)
        print scores
        print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)

    # Classify the requested user's tweets; the reported "probability" is
    # really the fraction of those tweets that the model labels as spam.
    if verbose:
        print "Analysis result: %s" % model.predict(user_data)
    print "Verdict: {:.2%} probability of being a spammer".format(
        sum(model.predict(user_data)) / len(user_data))


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('username',
                        help="Twitter user whose tweets will be analysed")
    parser.add_argument('-v', '--verbose', action='store_true',
                        help="show each tweet's probability of being spam")
    parser.add_argument('-n', type=int, default=50, dest='count',
                        help="how many tweets should be used for analysis")
    args = parser.parse_args()
    main(args.username, args.count, args.verbose)
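
A typical invocation (the script filename here is illustrative; the gist does
not name the file):

    python classifier.py exampleuser -n 100 -v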