Last active
August 29, 2015 14:17
-
-
Save jeffThompson/99583e4bfc6d1c111b4f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function

import os
import re
import urllib
'''
BLANK EMAIL PIXELS
Jeff Thompson | 2015 | www.jeffreythompson.org
12,383 pixels = 111 x 112 (with 62px in the last row)
'''
# ---------------------------------------------------------------------------
# settings
# ---------------------------------------------------------------------------
download_images = False                               # try to download? (very slow)
mail_folder = '/Users/JeffThompson/Library/Mail/V2'   # where are emails stored?
log_file = 'log.csv'                                  # save to log file
ignore_fake_images = False                            # ignore sketchy .php, etc "images"?
image_ext = ['gif', 'jpg', 'jpeg', 'png']             # valid image extensions (no dot)
download_folder = 'DownloadedImages'                  # where downloaded images are saved

# ---------------------------------------------------------------------------
# patterns (compiled once, reused for every email)
# ---------------------------------------------------------------------------
# attribute patterns look for '=3D', the quoted-printable spelling of '='
IMG_TAG_RE = re.compile(r'<img.*?>')
DISPLAY_NONE_RE = re.compile(r'display:\s*none')
VISIBILITY_HIDDEN_RE = re.compile(r'visibility:\s*hidden')
WIDTH_RE = re.compile(r'width=3D"([0-1])"')
SRC_RE = re.compile(r'src=3D"(.*?)"')
SOFT_BREAK_RE = re.compile(r'=\r?\n')                 # quoted-printable soft line break
EXT_ARGS_RE = re.compile(r'[?&%].*$')                 # url arguments glued to an extension
UNSAFE_FILENAME_RE = re.compile(r'[^A-Za-z0-9._-]+')  # chars not wanted in a local filename


def find_emails(folder):
    '''Return the paths of all files ending in .emlx found anywhere under folder.'''
    found = []
    for root, _dirs, filenames in os.walk(folder):
        for filename in filenames:
            if filename.endswith('.emlx'):
                found.append(os.path.join(root, filename))
    return found


def join_lines(text):
    '''
    Collapse an email onto a single line so tags that span lines can match.

    A trailing '=' followed by a line break is a quoted-printable soft break:
    it is removed together with the break, so a url wrapped as 'ht=\\ntp'
    comes back as 'http' instead of keeping a stray mid-word '='.
    All remaining newlines are then removed.
    '''
    return SOFT_BREAK_RE.sub('', text).replace('\n', '')


def read_email(path):
    '''Read the email file at path and return its text joined onto one line.'''
    with open(path, 'rb') as f:
        raw = f.read()
    # Python 3 hands back bytes; latin-1 maps every byte to a character, so
    # decoding can never fail. Python 2 already returns str and is left alone.
    if not isinstance(raw, str):
        raw = raw.decode('latin-1')
    return join_lines(raw)


def classify_image(image):
    '''
    Decide whether an <img> tag looks like a tracking image.

    Returns (reason, width):
      reason - 'width', 'visibility:hidden', 'display:none', or None when the
               tag is not flagged (a width of 0/1 wins over visibility, which
               wins over display)
      width  - '0' or '1' when such a width attribute is present, else 'n/a'
    '''
    width_match = WIDTH_RE.search(image)
    if width_match is not None:
        return 'width', width_match.group(1)
    if VISIBILITY_HIDDEN_RE.search(image) is not None:
        return 'visibility:hidden', 'n/a'
    if DISPLAY_NONE_RE.search(image) is not None:
        return 'display:none', 'n/a'
    return None, 'n/a'


def clean_extension(ext):
    '''
    Tidy the extension returned by os.path.splitext (leading dot included).

    Everything from the first '?', '&' or '%' onwards is dropped, any
    remaining '=' is removed, and an extension longer than four characters
    (not counting the dot) is treated as fake and replaced by ''.
    '''
    ext = EXT_ARGS_RE.sub('', ext).replace('=', '')
    # measure without the dot so that '.jpeg' is kept
    if len(ext[1:]) > 4:
        return ''
    return ext


def is_real_image(ext):
    '''Return True if ext (leading dot included) is one of the image_ext types.'''
    # image_ext holds bare extensions, so drop the dot before comparing
    return ext[1:].lower() in image_ext


def download_target(name, ext, index):
    '''
    Build the local path a downloaded image is saved to.

    Only the last path segment of the url is used, characters that are
    awkward in filenames become '_', and the zero-padded email index and the
    extension are appended.
    '''
    filename = os.path.basename(name) + '_' + format(index, '08') + ext
    return os.path.join(download_folder, UNSAFE_FILENAME_RE.sub('_', filename))


def download_image(url, target):
    '''Download url to target, creating the folder if needed; return True on success.'''
    # imported here so the script runs on both Python 3 and Python 2
    try:
        from urllib.request import urlretrieve      # Python 3
    except ImportError:
        from urllib import urlretrieve              # Python 2
    folder = os.path.dirname(target)
    if folder and not os.path.isdir(folder):
        os.makedirs(folder)
    try:
        urlretrieve(url, target)
    except (IOError, ValueError):                   # network failure or unusable url
        return False
    return True


def csv_quote(value):
    '''Wrap value in double quotes, doubling any quotes it already contains.'''
    return '"' + value.replace('"', '""') + '"'


def format_log_row(index, url, email, width, reason, ext):
    '''
    Build one line of the log file.

    Columns: index, filename, email file, width, why recorded,
    extension (lowercase, without the dot).
    '''
    columns = [str(index), csv_quote(url), csv_quote(email), width, reason, ext[1:].lower()]
    return ','.join(columns) + '\n'


def main():
    '''Scan every email under mail_folder and log the tracking images found.'''

    # well hello
    os.system('cls' if os.name == 'nt' else 'clear')
    print('BLANK EMAIL PIXELS')

    # get list of all emails
    print('Getting list of all emails (may take a while)...')
    emails = find_emails(mail_folder)
    print('- found', len(emails), 'emails')

    # iterate, read, extract images
    print('\nFinding tracking images...')
    with open(log_file, 'w') as log:
        log.write('index,filename,email_file,width,why_recorded,extension\n')

        for i, email in enumerate(emails):
            print(' ' + str(i + 1) + '/' + str(len(emails)))

            for image in IMG_TAG_RE.findall(read_email(email)):
                how_did_we_know, width = classify_image(image)
                if how_did_we_know is None:
                    continue
                print(' - ' + how_did_we_know)

                src = SRC_RE.search(image)
                if src is None:
                    print(' - no src found, skipping...')
                    continue

                # split off the extension and strip url arguments from it
                name, ext = os.path.splitext(src.group(1))
                ext = clean_extension(ext)

                # ignore fake images (.php, etc), if specified
                if ignore_fake_images and not is_real_image(ext):
                    continue

                if download_images:
                    print(' - attempting to download...')
                    if not download_image(name + ext, download_target(name, ext, i)):
                        print(' - ERROR, skipping...')

                # write details to log file
                print(' - storing to log...\n')
                log.write(format_log_row(i, name + ext, email, width, how_did_we_know, ext))

    # ALL DONE!
    print('\n' + 'DONE!')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment