-
-
Save mjbommar/5f2e3feea9766159ec00 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
@date Nov 27, 2009
@author: Michael Bommarito
@contact michael.bommarito@gmail.com
This is provided for purely academic purposes.
'''
import datetime
import glob
import re

import dateutil.parser
# Loose address matcher: a run of characters free of spaces, commas and
# angle brackets on either side of an '@'. Compiled once at import time.
reAddress = re.compile('([^ <,]+@[^ >,]+)', re.IGNORECASE)


def parseEmail(buffer):
    '''
    Parse the header of a raw email message.

    The first line of the (stripped) text is treated as the From: line; the
    following lines are scanned, up to the first blank line, for Date:, To:,
    Cc: and Bcc: headers. Folded continuation lines (lines starting with
    whitespace) that follow a recipient header are scanned for addresses too.

    @param buffer: full text of the email.
    @return: tuple (sender, recipients, date). sender is the lowercased
        sender address, recipients is a sorted list of unique lowercased
        addresses, and date is the value dateutil.parser.parse produced for
        the Date: header, or None when the message has no Date: header.
    @raise Exception: if no address can be found on the first line, or if
        the Date: value cannot be parsed.
    '''
    # Read in the email and properly split it into lines.
    lines = buffer.strip().splitlines()

    # Parse the From address with the regular expression. An IndexError means
    # either the text was empty or the first line held no address.
    try:
        senderAddress = reAddress.findall(lines[0])[0].lower()
    except IndexError:
        # Report the offending line itself; the function has no file name.
        firstLine = lines[0] if lines else ''
        raise Exception("%r has invalid From: header." % (firstLine,))

    recipientHeaders = ('to:', 'cc:', 'bcc:')
    recipientAddress = set()
    emailDate = None            # stays None when there is no Date: header
    inRecipientHeader = False   # True while inside a To:/Cc:/Bcc: header

    # Start from the second line and iterate until the header ends, which is
    # denoted by an empty line.
    for line in lines[1:]:
        # Clean up the line, just in case.
        leftToken = line.strip().lower()

        # This means the header has ended and we should stop looking.
        if not leftToken:
            break

        if leftToken.startswith('date:'):
            # Note that FOIA/mail/0853426848.txt has a goofy NZ timestamp
            # with +-1300. I changed this to +1300.
            inRecipientHeader = False
            dateString = line.split(':', 1)[1]
            # Drop any trailing "(TZ name)" comment and the commas.
            dateString = dateString.split('(')[0].replace(',', '').strip()
            try:
                emailDate = dateutil.parser.parse(dateString)
            except Exception:
                raise Exception("bad datestring detected - %s" % (dateString))
        elif leftToken.startswith(recipientHeaders):
            inRecipientHeader = True
            recipientAddress.update(
                address.strip().lower() for address in reAddress.findall(line))
        elif inRecipientHeader and line[:1].isspace():
            # Folded continuation of the previous To:/Cc:/Bcc: header.
            recipientAddress.update(
                address.strip().lower() for address in reAddress.findall(line))
        else:
            # Some other header (or its continuation): not a recipient list.
            inRecipientHeader = False

    # Guard against an address that was nothing but whitespace.
    recipientAddress.discard('')
    return (senderAddress, sorted(recipientAddress), emailDate)
# Walk every message in the corpus and print one comma-separated row per
# email: path, date, sender, then each recipient. sorted() keeps the output
# order reproducible, since glob returns files in a platform-dependent order.
for email in sorted(glob.glob('FOIA/mail/*.txt')):
    # Parse the email into the addresses. The with-block closes each file as
    # soon as it has been read instead of leaking the handle.
    with open(email) as emailFile:
        senderAddress, recipientAddress, emailDate = parseEmail(emailFile.read())

    # Check to make sure someone didn't send the email without an actual
    # address. I have gone through and manually corrected these files by
    # finding the matching address from elsewhere in the database or from
    # the Internet.
    # Mike Hulme needs compliant mail client!!!
    #  * FOIA/mail/0837197800.txt: Neil Loader <N.J.Loader@swansea.ac.uk>
    #  * FOIA/mail/0973374325.txt: barker <Terry.Barker@econ.cam.ac.uk>
    #  * FOIA/mail/0973374325.txt: vira <sonia.seneviratne@env.ethz.ch>
    #  * FOIA/mail/0942448792.txt: wigley <wigley@ucar.edu>
    #  * FOIA/mail/0957536665.txt: t.d.davies <t.d.davies@uea.ac.uk>
    #  * FOIA/mail/0986486371.txt: s.torok <simon.torok@csiro.au>
    #  * FOIA/mail/1006983600.txt: All BCC? Removed
    #  * FOIA/mail/1021757151.txt: s.torok <simon.torok@csiro.au>
    #  * FOIA/mail/1048799107.txt: All BCC? Removed
    #  * FOIA/mail/1164120712.txt: Malcolm Hughes <mhughes@ltrr.arizona.edu>
    if not recipientAddress:
        raise Exception("%s has no recipient - something is wrong." % (email))

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("%s,%s,%s,%s" % (email, emailDate, senderAddress, ','.join(recipientAddress)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment