Twitter account spam/ham classifier
#!/usr/bin/env python
"""
Simple Naive Bayes tweet classifier.
It analyses a number of tweets from a given user and determines whether
that user is a spammer.
"""
from __future__ import division
import json
import re
import os
import pickle
import requests
import numpy
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import StratifiedKFold
from sklearn import cross_validation


def get_user_data(user, count=2):
    """Retrieve the latest tweets of a user, given a username."""
    url = 'https://api.twitter.com/1/statuses/user_timeline.json' \
          '?include_entities=true' \
          '&include_rts=true' \
          '&screen_name=%s' \
          '&count=%d' % (user, count)
    req = requests.get(url)
    # requests < 1.0 exposes the decoded JSON body as a property; on newer
    # versions this would be the method call req.json().
    data = req.json
    assert req.status_code == 200, data['error']
    return data
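
# Each element returned by get_user_data is a tweet dict; the fields used
# below look roughly like this (abridged, illustrative values):
#   {'text': '...', 'entities': {'urls': [...]},
#    'in_reply_to_screen_name': None,
#    'user': {'screen_name': '...', 'followers_count': 0, 'friends_count': 0}}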


def analyse(tweet):
    """Analyse characteristics of a tweet, converting them into numerical values."""
    tweetSize = len(tweet['text'])
    nUrls = len(tweet['entities']['urls'])
    if nUrls == 0:
        ratio = 0
    else:
        if tweet['in_reply_to'] is not None:
            # Fraction of the tweet taken up by an "@reply <url>" prefix
            ratio = len('@' + tweet['in_reply_to'] + ' ' +
                        tweet['entities']['urls'][0]['url']) / tweetSize
        else:
            ratio = 0
    return [
        ratio,
        nUrls,
        len(tweet['screen_name']),
        tweet['screen_name'].count('x'),
        len(re.findall(r'\d', tweet['screen_name'])),
        tweet['followers_count'],
        tweet['friends_count']
    ]
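
# Illustration (hypothetical values): a tweet "@friend http://t.co/abc123"
# (26 characters, one URL) posted by 'spamx99' (7 characters, one 'x', two
# digits) with 10 followers and 500 friends yields
#   [1.0, 1, 7, 1, 2, 10, 500]
# since '@friend ' plus the URL spans the whole tweet (26/26 = 1.0).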


def main(user, count, verbose=False):
    def extract(x):
        """Extract relevant characteristics from a tweet."""
        return {
            u'screen_name': x['user']['screen_name'],
            u'text': x['text'],
            u'in_reply_to': x['in_reply_to_screen_name'],
            u'entities': x['entities'],
            u'followers_count': x['user']['followers_count'],
            u'friends_count': x['user']['friends_count']
        }

    user_data = get_user_data(user, count)
    user_data = map(extract, user_data)
    user_data = map(analyse, user_data)

    model_file = 'curbstomp_model.pkl'
    if os.path.exists(model_file):
        with open(model_file) as f:
            model = pickle.load(f)
    else:
        with open('spam.json') as f:
            spam = map(extract, json.loads(f.read()))
        spam = map(analyse, spam)
        with open('ham.json') as f:
            ham = map(extract, json.loads(f.read()))
        ham = map(analyse, ham)

        training_data = numpy.array(spam + ham)
        classes = numpy.array(len(spam) * [1] + len(ham) * [0])

        # Old scikit-learn API: StratifiedKFold takes the label array and the
        # number of folds, preserving the spam/ham proportions in each fold.
        skf = StratifiedKFold(classes, 2)
        gnb = GaussianNB()
        for train_index, test_index in skf:
            x_train, x_test = training_data[train_index], training_data[test_index]
            y_train, y_test = classes[train_index], classes[test_index]
            model = gnb.fit(x_train, y_train)
            pred = model.predict(x_test)
            print "Number of mislabeled points : %d" % (y_test != pred).sum()
# print "Verdict: %s" % model.predict(user_data).sum()
# Export model
with open(model_file, 'w') as f:
pickle.dump(model, f)

        if verbose:
            print "Verdict (spam): %s" % model.predict(spam)
            print "Verdict (ham): %s" % model.predict(ham)

        # cross_val_score refits a fresh copy of the estimator on each fold
        # and returns one accuracy score per fold.
        scores = cross_validation.cross_val_score(gnb, training_data, classes, cv=2)
        print scores
        print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)

    # Classify the requested user's tweets; the reported "probability" is
    # really the fraction of those tweets that the model labels as spam.
    if verbose:
        print "Analysis result: %s" % model.predict(user_data)
    print "Verdict: {:.2%} probability of being a spammer".format(
        sum(model.predict(user_data)) / len(user_data))


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('username',
                        help="Twitter user whose tweets will be analysed")
    parser.add_argument('-v', '--verbose', action='store_true',
                        help="show each tweet's probability of being spam")
    parser.add_argument('-n', type=int, default=50, dest='count',
                        help="how many tweets should be used for analysis")
    args = parser.parse_args()
    main(args.username, args.count, args.verbose)
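
A typical invocation (the script filename here is illustrative; the gist does
not name the file):

    python classifier.py exampleuser -n 100 -v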