Skip to content

Instantly share code, notes, and snippets.

@jeffThompson
Last active August 29, 2015 14:17
Show Gist options
  • Save jeffThompson/99583e4bfc6d1c111b4f to your computer and use it in GitHub Desktop.
Save jeffThompson/99583e4bfc6d1c111b4f to your computer and use it in GitHub Desktop.
import os, re, urllib
'''
BLANK EMAIL PIXELS
Jeff Thompson | 2015 | www.jeffreythompson.org
12,383 pixels = 111 x 112 (with 62px in the last row)
'''
# ---- settings ----------------------------------------------------------------
download_images = False  # try to download? (very slow)
mail_folder = '/Users/JeffThompson/Library/Mail/V2'  # where are emails stored?
log_file = 'log.csv'  # save to log file
ignore_fake_images = False  # ignore sketchy .php, etc "images"?
image_ext = ['gif', 'jpg', 'jpeg', 'png']  # valid image extensions (no dot)
download_folder = 'DownloadedImages'  # downloaded images are saved here
max_ext_length = 4  # longest believable extension, NOT counting the dot

# ---- patterns (compiled once, reused for every email) --------------------------
# the email text is searched in its raw form, where attribute assignments
# show up as '=3D' (quoted-printable for '='), hence the odd-looking patterns
IMG_TAG = re.compile(r'<img.*?>')
IMG_SRC = re.compile(r'src=3D"(.*?)"')
HIDDEN_DISPLAY = re.compile(r'display:\s*none')
HIDDEN_VISIBILITY = re.compile(r'visibility:\s*hidden')
TINY_WIDTH = re.compile(r'width=3D"([0-1])"')


def find_emails(folder):
    """Return the paths of all '.emlx' files found anywhere below folder."""
    found = []
    for root, _dirs, filenames in os.walk(folder):
        for filename in filenames:
            if filename.endswith('.emlx'):
                found.append(os.path.join(root, filename))
    return found


def classify_image(image):
    """Decide whether a single <img ...> tag looks like a tracking image.

    Returns a tuple (is_tracking, reason, width):
      is_tracking - True if the tag is hidden or has a width of 0 or 1
      reason      - 'display:none', 'visibility:hidden', 'width', or '-'
                    (when several checks hit, the last one listed wins)
      width       - '0' or '1' as a string, or 'n/a' if no such width found
    """
    is_tracking = False
    reason = '-'
    width = 'n/a'

    # hidden images?
    if HIDDEN_DISPLAY.search(image) is not None:
        is_tracking = True
        reason = 'display:none'
    if HIDDEN_VISIBILITY.search(image) is not None:
        is_tracking = True
        reason = 'visibility:hidden'

    # width of 0 or 1?
    tiny = TINY_WIDTH.search(image)
    if tiny is not None:
        is_tracking = True
        reason = 'width'
        width = tiny.group(1)

    return is_tracking, reason, width


def extract_src(image):
    """Return the cleaned-up src of an <img ...> tag, or None if it has none."""
    found = IMG_SRC.search(image)
    if found is None:
        return None
    # newlines are removed from the email before searching, which leaves a
    # stray '=' wherever a line was wrapped mid-word; take those out again
    # NOTE: [A-z] also covers the few punctuation characters between 'Z' and
    # 'a' (such as '_'); kept as-is so existing results do not change
    src = re.sub(r'([A-z])=([A-z])', '\\1\\2', found.group(1))  # like ht=tp
    src = re.sub(r'(\.)=([A-z])', '\\1\\2', src)  # and .=gif
    return src


def split_extension(src):
    """Split src into (name, ext), with ext reduced to a bare '.xyz' form.

    Anything from the first '?', '&' or '%' onwards is dropped from the
    extension, as is any remaining '='. An extension longer than
    max_ext_length characters (the dot is not counted) is treated as fake
    and returned as ''.
    """
    name, ext = os.path.splitext(src)
    ext = re.sub(r'[?&%].*$', '', ext)  # strip args from extension
    ext = ext.replace('=', '')  # any remaining =
    # the original test was len(ext) > 4, which counted the dot and so
    # threw away '.jpeg' even though 'jpeg' is listed in image_ext
    if len(ext) - 1 > max_ext_length:
        ext = ''
    return name, ext


def is_real_image(ext):
    """Return True if ext (with or without its dot) is listed in image_ext."""
    # image_ext holds extensions without the dot, so drop it before looking;
    # comparing '.gif' against 'gif' made every image count as fake
    return ext.lstrip('.').lower() in image_ext


def download_target(name, index, ext=''):
    """Return the local path a downloaded image should be saved to.

    Only the last path segment of name is used, and anything that is not a
    letter, digit, '.', '_' or '-' becomes '_'. The full URL cannot be used
    as a filename because of its scheme and slashes.
    """
    base = os.path.basename(name.rstrip('/'))
    base = re.sub(r'[^A-Za-z0-9._-]', '_', base)
    if not base:
        base = 'image'
    return os.path.join(download_folder, base + '_' + format(index, '08') + ext)


def download_image(url, target):
    """Fetch url and save it as target, creating target's folder if needed."""
    try:
        from urllib import urlretrieve  # Python 2
    except ImportError:
        from urllib.request import urlretrieve  # Python 3
    folder = os.path.dirname(target)
    if folder and not os.path.isdir(folder):
        os.makedirs(folder)
    urlretrieve(url, target)


def format_log_row(index, url, email, width, reason, ext):
    """Build one line of the log file, including its trailing newline.

    Columns: index, filename, email file, width, why recorded, extension
    (lowercase, without the dot). The filename and email file are wrapped in
    double quotes, with any double quote inside them doubled so the row
    stays valid CSV.
    """
    def quoted(value):
        return '"' + value.replace('"', '""') + '"'

    columns = [str(index), quoted(url), quoted(email), width, reason,
               ext[1:].lower()]
    return ','.join(columns) + '\n'


def scan_email(index, email):
    """Log (and optionally download) every tracking image in one email file."""
    with open(email) as handle:
        text = handle.read().replace('\n', '')

    for image in IMG_TAG.findall(text):
        is_tracking, reason, width = classify_image(image)
        if not is_tracking:
            continue
        print(' - ' + reason)

        src = extract_src(image)
        if src is None:
            print(' - no src found, skipping...')
            continue

        # skip if .php or other non-image extensions (sneaky!)
        name, ext = split_extension(src)
        if ignore_fake_images and not is_real_image(ext):
            continue

        # the address fetched is name + ext, i.e. without any arguments that
        # followed the extension (same address that goes into the log)
        if download_images:
            print(' - attempting to download...')
            try:
                download_image(name + ext, download_target(name, index, ext))
            except (IOError, OSError, ValueError):
                print(' - ERROR, skipping...')

        # write details to log file
        print(' - storing to log...\n')
        with open(log_file, 'a') as log:
            log.write(format_log_row(index, name + ext, email, width, reason, ext))


def main():
    """Find every email below mail_folder and log its tracking images."""
    # well hello
    os.system('cls' if os.name == 'nt' else 'clear')
    print('BLANK EMAIL PIXELS')

    # create log file
    with open(log_file, 'w') as log:
        log.write('index,filename,email_file,width,why_recorded,extension\n')

    # get list of all emails
    print('Getting list of all emails (may take a while)...')
    emails = find_emails(mail_folder)
    print('- found ' + str(len(emails)) + ' emails')

    # iterate, read, extract images
    print('\nFinding tracking images...')
    for i, email in enumerate(emails):
        print(' ' + str(i + 1) + '/' + str(len(emails)))
        scan_email(i, email)

    # ALL DONE!
    print('\n' + 'DONE!')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment