Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save tsl-lindsay/46fa475879b5f15e570e13c1a9598dd4 to your computer and use it in GitHub Desktop.
Save tsl-lindsay/46fa475879b5f15e570e13c1a9598dd4 to your computer and use it in GitHub Desktop.
# This sample code will get data from excel files that are sent as gmail attachments by scraping the embedded "sheet1.xml" files, if the files are formatted the same way.
# The data in each attachment (in this example, arrest records) will be saved a csv file in the "outpath" subdirectory. The csvs can later be combined using csvkit or another tool if so desired.
# Code originally written in Python 2.7 (sorry!)
import os
import re
import csv
import imaplib
import pyemail
import zipfile
from bs4 import BeautifulSoup
# replace EMAIL_ACCOUNT and EMAIL_PASSWORD
m = imaplib.IMAP4_SSL("imap.gmail.com")
m.login('EMAIL_ACCOUNT@gmail.com', 'EMAIL_PASSWORD')
m.select("SELECTION") # replace SELECTION with folder name to select emails in a specific folder
resp, items = m.search(None, "ALL")
# getting the mails id
items = items[0].split()
mainpath = 'email_attachments/'
attachpath = os.path.join(mainpath + '/attachments/')
outpath = os.path.join(mainpath + '/output/')
def createFolder(directory):
try:
if not os.path.exists(directory):
os.makedirs(directory)
except OSError:
print ('Error: Creating directory. ' + directory)
createFolder(attachpath)
createFolder(outpath) # this is where the csvs will be saved
for emailid in items[:2]: # for example, here are a couple of emails
# fetch the mail. "`(RFC822)`" means "get everything"
resp, data = m.fetch(emailid, "(RFC822)")
# get the mail content
email_body = data[0][1]
# parse the mail content
mail = pyemail.parse(email_body)
mail['id'] = emailid
for index, attachment in enumerate(mail.get("attachments")):
# make a new attachment record for each attachment
path = os.path.join('email_attachments', "%s-%s" % (emailid, index))
out = open(path, 'w')
zipped = attachment.read()
out.write(zipped)
out.close()
# treat the excel file as a zip file and extract the contents
t = zipfile.ZipFile(path, 'r')
t.extractall(attachpath)
os.rename(os.path.join(attachpath, 'sheet1.xml'), "%s-%s" % (emailid, index))
# now, use BeautifulSoup to "scrape" the xml and write to a csv
with open(os.path.join("%s-%s" % (emailid, index))) as infile:
blob = infile.read()
f = csv.writer(open(os.path.join(outpath, 'excel_file_%s-%s.csv' % (emailid, index)), 'w'))
f.writerow(['booking_num', 'booking_date', 'arrestee_address'])
soup = BeautifulSoup(blob, 'xml')
r = soup.find_all('row')
for row in r[3: ]:
booking_num = row.get_text().split('\n')[1]
booking_date = row.get_text().split('\n')[2]
arrestee_address = re.sub('\s+',' ',row.get_text().split("\n")[19]) # I'm only pulling out 3 fields here since this is just an example
f.writerow([booking_num, booking_date, arrestee_address])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment