tsl-lindsay/gist:46fa475879b5f15e570e13c1a9598dd4

## gistfile1.txt
# This sample code will get data from excel files that are sent as gmail attachments by scraping the embedded "sheet1.xml" files, if the files are formatted the same way.

# The data in each attachment (in this example, arrest records) will be saved a csv file in the "outpath" subdirectory. The csvs can later be combined using csvkit or another tool if so desired.

# Code originally written in Python 2.7 (sorry!)

import os
import re
import csv
import imaplib
import pyemail
import zipfile
from bs4 import BeautifulSoup

# replace EMAIL_ACCOUNT and EMAIL_PASSWORD
m = imaplib.IMAP4_SSL("imap.gmail.com")
m.login('EMAIL_ACCOUNT@gmail.com', 'EMAIL_PASSWORD')
m.select("SELECTION") # replace SELECTION with folder name to select emails in a specific folder
resp, items = m.search(None, "ALL")

# getting the mails id
items = items[0].split()

mainpath = 'email_attachments/'
attachpath = os.path.join(mainpath + '/attachments/')
outpath = os.path.join(mainpath + '/output/')

def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print ('Error: Creating directory. ' +  directory)

createFolder(attachpath)
createFolder(outpath) # this is where the csvs will be saved

for emailid in items[:2]: # for example, here are a couple of emails
# fetch the mail. "`(RFC822)`" means "get everything"
    resp, data = m.fetch(emailid, "(RFC822)")

# get the mail content
    email_body = data[0][1]

# parse the mail content
    mail = pyemail.parse(email_body)
    mail['id'] = emailid

    for index, attachment in enumerate(mail.get("attachments")):
        # make a new attachment record for each attachment
        path = os.path.join('email_attachments', "%s-%s" % (emailid, index))
        out = open(path, 'w')
        zipped = attachment.read()
        out.write(zipped)
        out.close()

        # treat the excel file as a zip file and extract the contents
        t = zipfile.ZipFile(path, 'r')
        t.extractall(attachpath)

        os.rename(os.path.join(attachpath, 'sheet1.xml'), "%s-%s" % (emailid, index))

        # now, use BeautifulSoup to "scrape" the xml and write to a csv
        with open(os.path.join("%s-%s" % (emailid, index))) as infile:
            blob = infile.read()

        f = csv.writer(open(os.path.join(outpath, 'excel_file_%s-%s.csv' % (emailid, index)), 'w'))
        f.writerow(['booking_num', 'booking_date', 'arrestee_address'])

        soup = BeautifulSoup(blob, 'xml')
        r = soup.find_all('row')

        for row in r[3: ]:
            booking_num = row.get_text().split('\n')[1]
            booking_date = row.get_text().split('\n')[2]
            arrestee_address = re.sub('\s+',' ',row.get_text().split("\n")[19]) # I'm only pulling out 3 fields here since this is just an example

            f.writerow([booking_num, booking_date, arrestee_address])
	# This sample code will get data from excel files that are sent as gmail attachments by scraping the embedded "sheet1.xml" files, if the files are formatted the same way.

	# The data in each attachment (in this example, arrest records) will be saved a csv file in the "outpath" subdirectory. The csvs can later be combined using csvkit or another tool if so desired.

	# Code originally written in Python 2.7 (sorry!)

	import os
	import re
	import csv
	import imaplib
	import pyemail
	import zipfile
	from bs4 import BeautifulSoup

	# replace EMAIL_ACCOUNT and EMAIL_PASSWORD
	m = imaplib.IMAP4_SSL("imap.gmail.com")
	m.login('EMAIL_ACCOUNT@gmail.com', 'EMAIL_PASSWORD')
	m.select("SELECTION") # replace SELECTION with folder name to select emails in a specific folder
	resp, items = m.search(None, "ALL")

	# getting the mails id
	items = items[0].split()

	mainpath = 'email_attachments/'
	attachpath = os.path.join(mainpath + '/attachments/')
	outpath = os.path.join(mainpath + '/output/')

	def createFolder(directory):
	try:
	if not os.path.exists(directory):
	os.makedirs(directory)
	except OSError:
	print ('Error: Creating directory. ' + directory)

	createFolder(attachpath)
	createFolder(outpath) # this is where the csvs will be saved

	for emailid in items[:2]: # for example, here are a couple of emails
	# fetch the mail. "`(RFC822)`" means "get everything"
	resp, data = m.fetch(emailid, "(RFC822)")

	# get the mail content
	email_body = data[0][1]

	# parse the mail content
	mail = pyemail.parse(email_body)
	mail['id'] = emailid

	for index, attachment in enumerate(mail.get("attachments")):
	# make a new attachment record for each attachment
	path = os.path.join('email_attachments', "%s-%s" % (emailid, index))
	out = open(path, 'w')
	zipped = attachment.read()
	out.write(zipped)
	out.close()

	# treat the excel file as a zip file and extract the contents
	t = zipfile.ZipFile(path, 'r')
	t.extractall(attachpath)

	os.rename(os.path.join(attachpath, 'sheet1.xml'), "%s-%s" % (emailid, index))

	# now, use BeautifulSoup to "scrape" the xml and write to a csv
	with open(os.path.join("%s-%s" % (emailid, index))) as infile:
	blob = infile.read()

	f = csv.writer(open(os.path.join(outpath, 'excel_file_%s-%s.csv' % (emailid, index)), 'w'))
	f.writerow(['booking_num', 'booking_date', 'arrestee_address'])

	soup = BeautifulSoup(blob, 'xml')
	r = soup.find_all('row')

	for row in r[3: ]:
	booking_num = row.get_text().split('\n')[1]
	booking_date = row.get_text().split('\n')[2]
	arrestee_address = re.sub('\s+',' ',row.get_text().split("\n")[19]) # I'm only pulling out 3 fields here since this is just an example

	f.writerow([booking_num, booking_date, arrestee_address])