Skip to content

Instantly share code, notes, and snippets.

@byss
Last active August 22, 2017 21:59
Show Gist options
  • Save byss/1fbe907177c1837f951d to your computer and use it in GitHub Desktop.
Save byss/1fbe907177c1837f951d to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import re
import csv
import sys
import urlparse
import dateutil.parser
class Counter (object):
    """Tally how many times each key has been seen.

    The counts are kept in the ``info`` dict, mapping key -> occurrences.
    """

    def __init__ (self):
        self.info = {}

    def incValue (self, key):
        """Record one more occurrence of *key* (a new key starts at zero)."""
        self.info [key] = self.info.get (key, 0) + 1

    def printTopElements (self, count = 20):
        """Print the *count* most frequent keys, one ``key : n`` line each.

        Keys whose count ties with the last listed key are printed too, so
        more than *count* lines may appear.  Nothing is printed when the
        counter is empty or *count* is not positive.
        """
        if count <= 0:
            return
        sortedKeys = sorted (self.info, key = self.info.get, reverse = True)
        # Move the cut-off forward while the next key ties with the last one kept.
        while (count < len (sortedKeys)) and (self.info [sortedKeys [count - 1]] == self.info [sortedKeys [count]]):
            count += 1
        topKeys = sortedKeys [:count]
        if not topKeys:
            # max () below would raise ValueError on an empty sequence.
            return
        # Keys may be text or numbers; render each one as text once.
        labels = [u'%s' % (key,) for key in topKeys]
        width = max (len (label) for label in labels)
        for key, label in zip (topKeys, labels):
            # One pre-built string per line: the output is the same whether
            # print is the statement or the function.
            print (u'\t\t%s: %s' % (label.ljust (width + 1), self.info [key]))

    def __len__ (self):
        """Return the number of distinct keys seen."""
        return len (self.info)
class TweetsStatStorage (object):
    """Accumulate statistics over tweets fed in one at a time.

    Plain totals are kept in ``tweetsCount``, ``totalTweetsLength``,
    ``repliesCount`` and ``retweetsCount``; per-key tallies are kept in the
    ``words``, ``hours``, ``linkHosts``, ``mentions`` and ``retweets``
    counters.
    """

    def __init__ (self):
        self.tweetsCount = 0
        self.totalTweetsLength = 0
        self.repliesCount = 0
        self.retweetsCount = 0
        self.words = Counter ()
        self.hours = Counter ()
        self.linkHosts = Counter ()
        self.mentions = Counter ()
        self.retweets = Counter ()

    def processTweetData (self, tweetData):
        """Fold one tweet into the running statistics.

        *tweetData* maps column names to values.  The keys read here are
        'in_reply_to_user_id', 'retweeted_status_user_id', 'text',
        'timestamp' and 'expanded_urls'; only the presence of the first two
        is checked.  A tweet without 'text' is counted in the totals and
        otherwise ignored.
        """
        isReply = 'in_reply_to_user_id' in tweetData
        isRetweet = 'retweeted_status_user_id' in tweetData

        self.tweetsCount += 1
        if isReply:
            self.repliesCount += 1
        if isRetweet:
            self.retweetsCount += 1

        if 'text' not in tweetData:
            return
        text = tweetData ['text']
        # Byte strings are decoded as UTF-8; already-decoded text is used as is.
        if isinstance (text, bytes):
            text = text.decode ('utf8')
        self.totalTweetsLength += len (text)
        # Drop t.co short links (either scheme) so they do not show up as
        # words or as linked hosts below.
        text = re.sub (r'https?://t\.co/[a-z0-9]+', '', text, flags = re.I | re.U)

        if isRetweet:
            # Only a leading "RT @name:" identifies the retweeted account;
            # handles are matched case-insensitively, as for mentions.
            source = re.match (r'RT (@[a-z0-9_]+):', text, re.I | re.U)
            if source:
                self.retweets.incValue (source.group (1))
        else:
            # Words are tallied for the author's own tweets only.
            for word in text.split ():
                if re.match (r'@[a-z0-9_]+|https?://', word, re.I | re.U):
                    continue
                # Trim punctuation around the word; a token with no word
                # characters at all is left unchanged.
                word = re.sub (r'^\W*(\w|\w.*\w)\W*$', r'\1', word, flags = re.U).lower ()
                self.words.incValue (word)

        # NOTE(review): the hour, link and mention tallies below are assumed
        # to apply to every tweet with text, retweets included -- confirm.
        if 'timestamp' in tweetData:
            timestamp = dateutil.parser.parse (tweetData ['timestamp'])
            self.hours.incValue (timestamp.hour)

        if 'expanded_urls' in tweetData:
            urls = tweetData ['expanded_urls'].split (',')
            hosts = [urlparse.urlparse (url.strip ()).netloc for url in urls]
        else:
            # Fall back to links left in the text; the host is everything
            # up to the first slash or whitespace after the scheme.
            hosts = re.findall (r'https?://([^/\s]+)', text, re.I | re.U)
        for host in hosts:
            if host:
                self.linkHosts.incValue (host)

        for person in re.findall (r'@[a-z0-9_]+', text, re.I | re.U):
            self.mentions.incValue (person)
def _read_tweet_stats (csvFile):
    """Feed every data row of an open tweets CSV into a new TweetsStatStorage."""
    tweets = TweetsStatStorage ()
    rows = csv.reader (csvFile)
    # The first row names the columns; a completely empty input has none.
    fields = next (rows, None)
    if fields is None:
        return tweets
    for row in rows:
        # zip () stops at the shorter of header and row.  Empty cells are
        # left out because processTweetData tests for key presence.
        tweetData = dict ((field, value) for field, value in zip (fields, row) if value)
        tweets.processTweetData (tweetData)
    return tweets


def _print_tweet_stats (displayFilename, tweets):
    """Print the statistics report for one input to standard output."""
    total = tweets.tweetsCount
    print (displayFilename + ':')
    print ('\tTotal tweets: %s' % (total,))
    if not total:
        # Averages and percentages are undefined without any tweets.
        return
    print ('\tTotal tweets length: %s K' % (round (tweets.totalTweetsLength / 1000.0, 2),))
    print ('\tAverage tweet length: %s' % (round (1.0 * tweets.totalTweetsLength / total, 2),))
    print ('\tReply tweets count: %s (%s%%)' % (tweets.repliesCount, round (100.0 * tweets.repliesCount / total, 1)))
    print ('\tRetweets count: %s (%s%%)' % (tweets.retweetsCount, round (100.0 * tweets.retweetsCount / total, 1)))
    print ('')
    print ('\tTop-100 used words:')
    tweets.words.printTopElements (100)
    print ('\t(%s total)' % (len (tweets.words),))
    print ('')
    print ('\tTop-10 hours for tweeting:')
    tweets.hours.printTopElements (10)
    print ('')
    print ('\tTop-30 most linked websites:')
    tweets.linkHosts.printTopElements (30)
    print ('\t(%s total)' % (len (tweets.linkHosts),))
    print ('')
    # The count is passed explicitly so the listing matches its heading.
    print ('\tTop-15 mentioned:')
    tweets.mentions.printTopElements (15)
    print ('')
    print ('\tTop-15 retweeted:')
    tweets.retweets.printTopElements (15)


def process_tweets_csv (filename):
    """Read a tweets CSV and print statistics about it.

    *filename* is a path, or '-' to read standard input.  The first CSV row
    is taken as the column names.  If the file cannot be opened, an error
    line is printed and the function returns without raising.
    """
    readsStdin = (filename == '-')
    displayFilename = '<stdin>' if readsStdin else filename
    if readsStdin:
        f = sys.stdin
    else:
        try:
            f = open (filename)
        except IOError:
            print ('Error: cannot open %s for reading!' % (displayFilename,))
            return
    try:
        tweets = _read_tweet_stats (f)
    finally:
        # Standard input is not ours to close.
        if not readsStdin:
            f.close ()
    _print_tweet_stats (displayFilename, tweets)
def main (argv):
    """Print tweet statistics for every CSV path in *argv*.

    Each entry is handed to process_tweets_csv; '-' stands for standard
    input.  With no entries there is nothing to process, so a short error
    line is printed instead of trying to open an empty file name.
    """
    if not argv:
        print ('Error: no input files given; pass one or more tweets CSV paths ("-" reads standard input).')
        return
    for filename in argv:
        process_tweets_csv (filename)


if __name__ == '__main__':
    main (sys.argv [1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment