Skip to content

Instantly share code, notes, and snippets.

@ptwobrussell
Created February 8, 2014 01:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ptwobrussell/8875470 to your computer and use it in GitHub Desktop.
Save ptwobrussell/8875470 to your computer and use it in GitHub Desktop.
Improvements to Example 6-13 that use regular expressions to enable searching by an email address as opposed to an exact string match on the From: field of a JSONified mbox
import json
import pymongo # pip install pymongo
from bson import json_util # Comes with pymongo
import re
# The basis of our query: a bare email address, as opposed to a full header
# value like "Coursera <noreply@coursera.org>"
FROM = "noreply@coursera.org"

# Escape the address so that regex metacharacters in it (notably ".") match
# literally instead of matching any character. MongoDB regex matching is
# unanchored, so no leading/trailing ".*" is needed to find the address
# anywhere inside the From: field. Compiled once and shared by both queries.
FROM_PATTERN = re.compile(re.escape(FROM), re.IGNORECASE)

client = pymongo.MongoClient()
db = client.enron
mbox = db.mbox


def _aggregate_recipients(collection, pipeline):
    """Run ``pipeline`` and return the 'recipients' value of its first group.

    PyMongo 2.x returns a dict holding the documents under 'result', while
    PyMongo 3+ returns a cursor; both shapes are normalized to a list here.
    Returns an empty list when no message matched, instead of raising
    IndexError on the missing first group.
    """
    response = collection.aggregate(pipeline)
    if isinstance(response, dict):
        groups = response['result']
    else:
        groups = list(response)
    return groups[0]['recipients'] if groups else []


# Get the recipient lists for each message.
#
# The regex can match several distinct From: values (for example
# "noreply@coursera.org" and "Coursera <noreply@coursera.org>"), so group on
# a constant _id to merge all of them into one group rather than producing
# one group per spelling and then looking only at the first.
#
# NOTE: $addToSet collapses messages whose "To" values are identical, so
# this holds one entry per *distinct* recipient list.
recipients_per_message = _aggregate_recipients(mbox, [
    {"$match": {"From": FROM_PATTERN}},
    {"$project": {"From": 1, "To": 1}},
    {"$group": {"_id": None, "recipients": {"$addToSet": "$To"}}},
])

# Collapse the lists of recipients into a single list
all_recipients = [recipient
                  for message in recipients_per_message
                  for recipient in message]

# Calculate the number of recipients per sent message and sort
recipients_per_message_totals = sorted(
    len(recipients) for recipients in recipients_per_message)

# Demonstrate how to use $unwind followed by $group to collapse
# the recipient lists into a single list (with no duplicates
# per the $addToSet operator)
unique_recipients = _aggregate_recipients(mbox, [
    {"$match": {"From": FROM_PATTERN}},
    {"$project": {"From": 1, "To": 1}},
    {"$unwind": "$To"},
    {"$group": {"_id": None, "recipients": {"$addToSet": "$To"}}},
])

# Single-argument print(...) with %-formatting produces the same output
# under both Python 2 (print statement) and Python 3 (print function).
print(all_recipients)
print("Num total recipients on all messages: %d" % len(all_recipients))
print("Num recipients for each message: %s" % (recipients_per_message_totals,))
print("Num unique recipients %d" % len(unique_recipients))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment