seanmckaybeck/analyze.py

## analyze.py
'''
A script for analyzing twitter stats on Ferguson
'''
import json
import re

import tweepy

def get_api():
    '''
    Build the tweepy OAuth handler used to authenticate the stream.

    The credentials are read from a file named ``config`` in the
    current working directory. Its first four lines are, in order:
    API key, API secret, access token, access token secret.
    Surrounding whitespace on each line is stripped.

    Returns the ``tweepy.OAuthHandler`` with the access token set.
    '''
    with open('config') as config_file:
        # readline() yields '' past end-of-file, so a short file gives
        # empty credentials rather than raising here.
        credentials = [config_file.readline().strip() for _ in range(4)]

    api_key, api_secret, access_token, access_token_secret = credentials
    handler = tweepy.OAuthHandler(api_key, api_secret)
    handler.set_access_token(access_token, access_token_secret)
    return handler


class CustomStreamListener(tweepy.StreamListener):
    '''
    Sub class of StreamListener that tallies word frequencies
    across the Ferguson tweets it receives.

    Attributes:
        count: number of statuses received so far.
        common: set of stop words, one per line of the ``common`` file
            in the current working directory.
        all_words: dict mapping each counted word to its occurrence count.
        pattern: compiled regex matching every character that is not a
            word character, an apostrophe or ``#``.
    '''
    def __init__(self, *args, **kwargs):
        '''
        Forward all arguments to StreamListener and load the stop words.
        '''
        super(CustomStreamListener, self).__init__(*args, **kwargs)
        self.count = 0
        with open('common') as f:
            self.common = set(line.strip() for line in f)
        self.all_words = {}
        # Raw string so the \w escape reaches the regex engine untouched.
        self.pattern = re.compile(r"[^\w'#]")

    def on_status(self, status):
        '''
        Count the words of one incoming status.

        ``status.text`` is lower-cased and split on whitespace. Tokens
        containing ``http`` or ``@`` (links and mentions) are dropped
        whole. The remaining tokens have punctuation other than
        apostrophes and ``#`` replaced by spaces, and every resulting
        word is counted in ``all_words`` unless it is two characters or
        shorter, purely numeric, or listed in ``common``.
        '''
        print('Got a tweet')
        self.count += 1
        for token in status.text.lower().split():
            # Links and mentions have to be rejected while the token is
            # still intact: once the pattern has replaced ':', '/', '.'
            # and '@' with spaces, their fragments look like ordinary words.
            if 'http' in token or '@' in token:
                continue
            for word in self.pattern.sub(' ', token).split():
                # split() never yields empty or whitespace-only strings,
                # so only length, digits and stop words need checking.
                if len(word) <= 2 or word.isdigit() or word in self.common:
                    continue
                self.all_words[word] = self.all_words.get(word, 0) + 1


if __name__ == '__main__':
    # Stream tweets matching 'Ferguson' until the user presses Ctrl-C,
    # then print the tweet total and dump the word counts as JSON.
    listener = CustomStreamListener()
    try:
        auth = get_api()
        streaming_api = tweepy.Stream(auth, listener)
        streaming_api.filter(track=['Ferguson'])
    except KeyboardInterrupt:
        # Call-form print with a single argument behaves the same on
        # Python 2 and Python 3.
        print('----TOTAL TWEETS----')
        print(listener.count)
        print('--------------------')
        json_data = json.dumps(listener.all_words, indent=4)
        with open('word_data.json', 'w') as f:
            # Same bytes as the old "print >> f" form: the JSON text
            # followed by one newline.
            f.write(json_data + '\n')

## common
the
be
to
of
and
a
in
that
have
I
it
for
not
on
with
he
as
you
do
at
this
but
his
by
from
they
we
say
her
she
or
an
will
my
one
all
would
there
their
what
so
up
out
if
about
who
get
which
go
me
when
make
can
like
time
no
just
him
know
take
person
into
year
your
good
some
could
them
see
other
than
then
now
look
only
come
its
over
think
also
back
after
use
two
how
our
work
first
well
way
even
new
want
because
any
these
give
day
most
us
i'll
i'm
until
ha
haha
hahaha
hahahaha
hahahahaha
hi
rt
re
omg
omgg
omggg
omgggg
omggggg
oh
ohh
ohhh
was
wtf
said
done
else
else's
le
such
via
que
let
still
real

## convert.py
'''
Convert the JSON data into a
large block of text for parsing
'''
import json

# Load the word -> count mapping from word_data.json.
# The context manager closes the file even if json.load raises.
with open('word_data.json') as f:
    data = json.load(f)

# Emit every word `count` times, each followed by a single space.
# One join avoids the quadratic cost of repeated string concatenation;
# a count of zero or less contributes nothing.
# Counts are expected to be integers.
final_str = ''.join((word + ' ') * count for word, count in data.items())

with open('word_block.txt', 'w') as f:
    f.write(final_str)
	'''
	A script for analyzing twitter stats on Ferguson
	'''
	import json
	import re

	import tweepy

	def get_api():
	    '''
	    Build the tweepy OAuth handler used to authenticate the stream.

	    The credentials are read from a file named ``config`` in the
	    current working directory. Its first four lines are, in order:
	    API key, API secret, access token, access token secret.
	    Surrounding whitespace on each line is stripped.

	    Returns the ``tweepy.OAuthHandler`` with the access token set.
	    '''
	    with open('config') as f:
	        api_key = f.readline().strip()
	        api_secret = f.readline().strip()
	        access_token = f.readline().strip()
	        access_token_secret = f.readline().strip()
	    auth = tweepy.OAuthHandler(api_key, api_secret)
	    auth.set_access_token(access_token, access_token_secret)
	    return auth


	class CustomStreamListener(tweepy.StreamListener):
	    '''
	    Sub class of StreamListener that tallies word frequencies
	    across the Ferguson tweets it receives.

	    Attributes:
	        count: number of statuses received so far.
	        common: set of stop words, one per line of the ``common`` file
	            in the current working directory.
	        all_words: dict mapping each counted word to its occurrence count.
	        pattern: compiled regex matching every character that is not a
	            word character, an apostrophe or ``#``.
	    '''
	    def __init__(self, *args, **kwargs):
	        '''
	        Forward all arguments to StreamListener and load the stop words.
	        '''
	        # *args/**kwargs let the listener be built with no arguments.
	        super(CustomStreamListener, self).__init__(*args, **kwargs)
	        self.count = 0
	        with open('common') as f:
	            self.common = set(line.strip() for line in f)
	        self.all_words = {}
	        # Raw string so the \w escape reaches the regex engine untouched.
	        self.pattern = re.compile(r"[^\w'#]")

	    def on_status(self, status):
	        '''
	        Count the words of one incoming status.

	        ``status.text`` is lower-cased and split on whitespace. Tokens
	        containing ``http`` or ``@`` (links and mentions) are dropped
	        whole. The remaining tokens have punctuation other than
	        apostrophes and ``#`` replaced by spaces, and every resulting
	        word is counted in ``all_words`` unless it is two characters or
	        shorter, purely numeric, or listed in ``common``.
	        '''
	        print('Got a tweet')
	        self.count += 1
	        for token in status.text.lower().split():
	            # Links and mentions have to be rejected while the token is
	            # still intact: once the pattern has replaced ':', '/', '.'
	            # and '@' with spaces, their fragments look like ordinary words.
	            if 'http' in token or '@' in token:
	                continue
	            for word in self.pattern.sub(' ', token).split():
	                # split() never yields empty or whitespace-only strings,
	                # so only length, digits and stop words need checking.
	                if len(word) <= 2 or word.isdigit() or word in self.common:
	                    continue
	                self.all_words[word] = self.all_words.get(word, 0) + 1


	if __name__ == '__main__':
	    # Stream tweets matching 'Ferguson' until the user presses Ctrl-C,
	    # then print the tweet total and dump the word counts as JSON.
	    listener = CustomStreamListener()
	    try:
	        auth = get_api()
	        streaming_api = tweepy.Stream(auth, listener)
	        streaming_api.filter(track=['Ferguson'])
	    except KeyboardInterrupt:
	        # Call-form print with a single argument behaves the same on
	        # Python 2 and Python 3.
	        print('----TOTAL TWEETS----')
	        print(listener.count)
	        print('--------------------')
	        json_data = json.dumps(listener.all_words, indent=4)
	        with open('word_data.json', 'w') as f:
	            # Same bytes as the old "print >> f" form: the JSON text
	            # followed by one newline.
	            f.write(json_data + '\n')
	the
	be
	to
	of
	and
	a
	in
	that
	have
	I
	it
	for
	not
	on
	with
	he
	as
	you
	do
	at
	this
	but
	his
	by
	from
	they
	we
	say
	her
	she
	or
	an
	will
	my
	one
	all
	would
	there
	their
	what
	so
	up
	out
	if
	about
	who
	get
	which
	go
	me
	when
	make
	can
	like
	time
	no
	just
	him
	know
	take
	person
	into
	year
	your
	good
	some
	could
	them
	see
	other
	than
	then
	now
	look
	only
	come
	its
	over
	think
	also
	back
	after
	use
	two
	how
	our
	work
	first
	well
	way
	even
	new
	want
	because
	any
	these
	give
	day
	most
	us
	i'll
	i'm
	until
	ha
	haha
	hahaha
	hahahaha
	hahahahaha
	hi
	rt
	re
	omg
	omgg
	omggg
	omgggg
	omggggg
	oh
	ohh
	ohhh
	was
	wtf
	said
	done
	else
	else's
	le
	such
	via
	que
	let
	still
	real
	'''
	Convert the JSON data into a
	large block of text for parsing
	'''
	import json

	# Load the word -> count mapping from word_data.json.
	# The context manager closes the file even if json.load raises.
	with open('word_data.json') as f:
	    data = json.load(f)

	# Emit every word `count` times, each followed by a single space.
	# One join avoids the quadratic cost of repeated string concatenation;
	# a count of zero or less contributes nothing.
	# Counts are expected to be integers.
	final_str = ''.join((word + ' ') * count for word, count in data.items())

	with open('word_block.txt', 'w') as f:
	    f.write(final_str)