Skip to content

Instantly share code, notes, and snippets.

Created October 26, 2013 14:07
Show Gist options
  • Save djagdish/7169862 to your computer and use it in GitHub Desktop.
Save djagdish/7169862 to your computer and use it in GitHub Desktop.
Calculating Top Collaborators from email metadata
# sample email header in JSON format
{ 'fromField': ['Deepak Jagdish', ''],
'toField': [['Daniel Smilkov', '']],
'dateField': 1372743719,
'isSent': False,
'threadid': '1439426117975266137' }
import json
def filterEmails(emails):
    """Return only the usable email records from *emails*.

    A record is kept when it is not ``None`` and both its ``'toField'`` and
    ``'fromField'`` entries are truthy (i.e. non-empty). The relative order
    of the surviving records is preserved.

    Raises:
        KeyError: if a non-``None`` record lacks ``'toField'``, or has a
            truthy ``'toField'`` but lacks ``'fromField'``.
    """
    return [
        email
        for email in emails
        if email is not None and email['toField'] and email['fromField']
    ]
# Load the email metadata dump and drop unusable records. The context
# manager guarantees the file handle is closed once parsing is done.
with open('allemails.json') as f:
    emails = filterEmails(json.load(f))
from collections import Counter
from datetime import datetime
def getSentRcvCounters(emails, year=None):
    """Count emails sent to, and received from, each contact.

    Args:
        emails: iterable of email header dicts. Each dict is read for
            ``'dateField'`` (a Unix timestamp), ``'isSent'``, ``'toField'``
            (a list of person entries) and ``'fromField'`` (one person
            entry). Element ``[1]`` of a person entry is used as the
            contact key -- presumably the e-mail address.
        year: when truthy, only emails whose timestamp falls in this
            calendar year (local time, per ``datetime.fromtimestamp``) are
            counted; otherwise every email is counted.

    Returns:
        A ``(sentCounter, rcvCounter)`` tuple of ``Counter`` objects keyed
        by contact. A sent email adds one to ``sentCounter`` for every
        recipient; any other email adds one to ``rcvCounter`` for its
        sender.
    """
    sentCounter, rcvCounter = Counter(), Counter()
    for email in emails:
        # Skip emails that fall outside the requested year.
        if year and datetime.fromtimestamp(email['dateField']).year != year:
            continue
        if email['isSent']:
            for person in email['toField']:
                sentCounter[person[1]] += 1
        else:
            # Only non-sent mail counts as received; for sent mail the
            # 'fromField' would be the mailbox owner rather than a contact.
            person = email['fromField']
            rcvCounter[person[1]] += 1
    return sentCounter, rcvCounter
def genMean(xx, p):
    """Return the generalized (power) mean of *xx* with exponent *p*.

    Computes ``(sum(x**p for x in xx) / len(xx)) ** (1/p)``. A negative
    *p* pulls the result toward the smallest value in *xx*. For ``p == 0``
    the formula itself is undefined, so the geometric mean (the limit of
    the power mean as ``p`` approaches 0) is returned instead.

    Args:
        xx: sized collection of numbers (``len(xx)`` must work).
        p: exponent of the mean.

    Returns:
        The mean as a float.

    Raises:
        ZeroDivisionError: if *xx* is empty, or if *p* is negative and
            *xx* contains a zero.
        ValueError: if ``p == 0`` and *xx* contains a non-positive value.
    """
    count = float(len(xx))
    if p == 0:
        import math
        return math.exp(sum(math.log(x) for x in xx) / count)
    # Start from 0.0 so the accumulation stays in floating point even when
    # every x**p is an integer.
    total = sum((x ** p for x in xx), 0.0)
    return (total / count) ** (1.0 / p)
def getTopcontacts(year=None, p=-5):
    """Rank contacts by how balanced and how heavy the two-way traffic is.

    Only contacts with at least one sent and at least one received email
    qualify. Each qualifying contact is scored with
    ``genMean([sent, received], p)``.

    Args:
        year: forwarded to ``getSentRcvCounters`` to restrict the emails
            considered; ``None`` means all emails.
        p: exponent passed to ``genMean``. The default of -5 keeps the
            score close to the smaller of the two counts.

    Returns:
        A list of ``(contact, score)`` tuples sorted by score, highest
        first.
    """
    sentCounter, rcvCounter = getSentRcvCounters(emails, year)
    # Indexing a Counter with a missing key yields 0 and does not insert.
    topcontacts = [
        (contact, genMean([sent, rcvCounter[contact]], p))
        for contact, sent in sentCounter.items()
        if sent > 0 and rcvCounter[contact] > 0
    ]
    topcontacts.sort(key=lambda pair: pair[1], reverse=True)
    return topcontacts
def getRank(topcontacts, email):
    """Return the 1-based position of *email* in *topcontacts*.

    Args:
        topcontacts: sequence of ``(contact, score)`` pairs, in rank order.
        email: contact key to look up.

    Returns:
        The rank of the first pair whose contact equals *email* (1 for the
        first entry), or 0 when the contact is not present.
    """
    for rank, (contact, _score) in enumerate(topcontacts, 1):
        if contact == email:
            return rank
    return 0
# Seed one record per overall top-10 contact, then append that contact's
# rank for each year from 2005 through 2013 (0 means unranked that year).
results = [{'email': contact, 'ranks': []} for contact, score in getTopcontacts()[:10]]
for year in range(2005, 2014):
    topcontacts = getTopcontacts(year)
    for record in results:
        record['ranks'].append(getRank(topcontacts, record['email']))
# Parenthesized single-argument print behaves the same on Python 2 and 3.
print(json.dumps(results))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment