Last active
August 22, 2017 21:59
-
-
Save byss/1fbe907177c1837f951d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import re | |
import csv | |
import sys | |
import urlparse | |
import dateutil.parser | |
class Counter:
    """Tally how many times each key has been seen."""

    def __init__(self):
        # Maps key -> number of incValue() calls made with that key.
        self.info = {}

    def incValue(self, key):
        """Record one more occurrence of ``key``."""
        self.info[key] = self.info.get(key, 0) + 1

    def printTopElements(self, count=20):
        """Print the most frequent keys, one ``key : tally`` line each.

        At least ``count`` keys are shown when that many exist; the list is
        extended past ``count`` while the following keys tie with the last
        one shown, so a tie is never cut arbitrarily. Nothing is printed for
        an empty counter or a non-positive ``count``.
        """
        if count <= 0 or not self.info:
            # The original called max() on an empty list here and crashed.
            return

        ranked = sorted(self.info, key=self.info.get, reverse=True)
        while (count < len(ranked)
               and self.info[ranked[count - 1]] == self.info[ranked[count]]):
            count += 1

        # u'%s' % (key,) yields text for any key type (words, hours, hosts)
        # and needs no reference to the Python-2-only ``unicode`` builtin.
        rows = [(u'%s' % (key,), self.info[key]) for key in ranked[:count]]
        width = max(len(label) for label, _ in rows) + 1
        for label, tally in rows:
            # One pre-built string, so the output is the same whether
            # ``print`` is a statement or a function.
            print(u'\t\t%s: %s' % (label.ljust(width), tally))

    def __len__(self):
        """Return the number of distinct keys seen."""
        return len(self.info)
class TweetsStatStorage:
    """Running statistics over tweets, fed one row at a time.

    Scalar totals are kept in ``tweetsCount``, ``totalTweetsLength``,
    ``repliesCount`` and ``retweetsCount``. Frequency tallies are kept in
    ``Counter`` objects: ``words``, ``hours``, ``linkHosts``, ``mentions``
    and ``retweets``.
    """

    # Shortened t.co links carry no information of their own, so they are
    # removed before words and hosts are extracted.
    _SHORT_LINK = re.compile(r'https?://t\.co/[a-z0-9]+', re.I | re.U)
    # "RT @user:" prefix that names the account a retweet came from.
    _RETWEET_SOURCE = re.compile(r'^RT (@[a-z0-9_]+):', re.I | re.U)
    # Tokens that are mentions or links rather than words.
    _NOT_A_WORD = re.compile(r'@[a-z0-9_]+|https?://', re.I | re.U)
    # Strips leading/trailing non-word characters (punctuation, quotes).
    _WORD_CORE = re.compile(r'^\W*(\w|\w.*\w)\W*$', re.U)
    # Host part of a URL written inline in the tweet text.
    _INLINE_HOST = re.compile(r'https?://([^/\s?#]+)', re.I | re.U)
    _MENTION = re.compile(r'@[a-z0-9_]+', re.I | re.U)

    def __init__(self):
        self.tweetsCount = 0
        self.totalTweetsLength = 0
        self.repliesCount = 0
        self.retweetsCount = 0
        self.words = Counter()
        self.hours = Counter()
        self.linkHosts = Counter()
        self.mentions = Counter()
        self.retweets = Counter()

    def processTweetData(self, tweetData):
        """Fold one tweet into the statistics.

        ``tweetData`` is a mapping of field name to value. A field counts as
        set when its key is present. The keys read are
        ``in_reply_to_user_id``, ``retweeted_status_user_id``, ``text``,
        ``timestamp`` and ``expanded_urls`` (comma-separated URLs).

        A row without ``text`` only updates the tweet/reply/retweet totals.
        ``text`` may be UTF-8 bytes or already-decoded text.
        """
        isReply = 'in_reply_to_user_id' in tweetData
        isRetweet = 'retweeted_status_user_id' in tweetData

        self.tweetsCount += 1
        if isReply:
            self.repliesCount += 1
        if isRetweet:
            self.retweetsCount += 1

        if 'text' not in tweetData:
            return

        text = tweetData['text']
        if isinstance(text, bytes):
            # Decode once at the boundary; everything below works on text.
            text = text.decode('utf8')
        # Length is measured before short links are stripped.
        self.totalTweetsLength += len(text)
        text = self._SHORT_LINK.sub('', text)

        if isRetweet:
            # A retweet's words belong to someone else: record only whom it
            # was retweeted from.
            source = self._RETWEET_SOURCE.match(text)
            if source:
                self.retweets.incValue(source.group(1))
        else:
            for word in text.split():
                if self._NOT_A_WORD.match(word):
                    continue
                self.words.incValue(self._WORD_CORE.sub(r'\1', word).lower())

        if 'timestamp' in tweetData:
            posted = dateutil.parser.parse(tweetData['timestamp'])
            self.hours.incValue(posted.hour)

        if 'expanded_urls' in tweetData:
            # Uses the module-level ``import urlparse``.
            hosts = [urlparse.urlparse(url).netloc
                     for url in tweetData['expanded_urls'].split(',')]
        else:
            # Fall back to links written out in the text. The original
            # pattern captured a single character and so never matched a
            # real host name.
            hosts = self._INLINE_HOST.findall(text)
        for host in hosts:
            if host:  # malformed or comma-split URLs yield an empty netloc
                self.linkHosts.incValue(host)

        # Mentions are tallied for retweets as well as original tweets.
        for person in self._MENTION.findall(text):
            self.mentions.incValue(person)
def process_tweets_csv(filename):
    """Read a tweets CSV and print usage statistics for it.

    ``filename`` is a path, or ``'-'`` to read standard input. The first CSV
    row supplies the column names. Every later row becomes a dict of its
    non-empty cells and is handed to ``TweetsStatStorage.processTweetData``.

    If the file cannot be opened, an error line is printed and the function
    returns without producing a report.
    """
    if filename == '-':
        source = sys.stdin
        displayFilename = '<stdin>'
    else:
        displayFilename = filename
        try:
            source = open(filename)
        except (IOError, OSError):
            # open() raises rather than returning None, so the original
            # ``f is None`` check could never report this.
            print('Error: cannot open %s for reading!' % (displayFilename,))
            return

    tweets = TweetsStatStorage()
    try:
        rows = csv.reader(source)
        fields = next(rows, [])  # an empty input simply has no columns
        for row in rows:
            # zip() stops at the shorter of header/row; empty cells are
            # dropped so that "key present" means "field set".
            tweets.processTweetData(
                {name: value for name, value in zip(fields, row) if value})
    finally:
        if source is not sys.stdin:
            source.close()

    total = tweets.tweetsCount

    def percent(part):
        # Share of all tweets; 0.0 when there are none to divide by.
        return round(100.0 * part / total, 1) if total else 0.0

    def show_top(counter, limit):
        # Skip empty tallies so the report survives sparse archives.
        if len(counter):
            counter.printTopElements(limit)

    average = round(1.0 * tweets.totalTweetsLength / total, 2) if total else 0.0

    # Each line is built as a single string so the output is identical
    # whether ``print`` is a statement or a function.
    print(displayFilename + ':')
    print('\tTotal tweets: %s' % (total,))
    print('\tTotal tweets length: %s K'
          % (round(tweets.totalTweetsLength / 1000.0, 2),))
    print('\tAverage tweet length: %s' % (average,))
    print('\tReply tweets count: %s (%s%%)'
          % (tweets.repliesCount, percent(tweets.repliesCount)))
    print('\tRetweets count: %s (%s%%)'
          % (tweets.retweetsCount, percent(tweets.retweetsCount)))

    print('\tTop-100 used words:')
    show_top(tweets.words, 100)
    print('\t(%s total)' % (len(tweets.words),))

    print('\tTop-10 hours for tweeting:')
    show_top(tweets.hours, 10)

    print('\tTop-30 most linked websites:')
    show_top(tweets.linkHosts, 30)
    print('\t(%s total)' % (len(tweets.linkHosts),))

    # The limit is passed explicitly so it matches the "Top-15" headings.
    print('\tTop-15 mentioned:')
    show_top(tweets.mentions, 15)
    print('\tTop-15 retweeted:')
    show_top(tweets.retweets, 15)
def main(argv):
    """Print statistics for every CSV named in ``argv``.

    With no arguments, standard input is read: ``'-'`` is the name
    ``process_tweets_csv`` treats as stdin. The original fell back to the
    empty string, which can never be opened.
    """
    for filename in (argv or ['-']):
        process_tweets_csv(filename)
# Script entry point: pass along the command-line arguments without the
# program name.
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    main(cli_args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment