jeffThompson/EveryTrackingPixel

## EveryTrackingPixel
import os, re, urllib

'''
BLANK EMAIL PIXELS
Jeff Thompson | 2015 | www.jeffreythompson.org

12,383 pixels = 111 x 112 (with 62px in the last row)

'''

# --- settings ---

# attempt to download every tracking image that is found? (very slow)
download_images = False

# skip "images" whose extension is not in image_ext (sketchy .php, etc)?
ignore_fake_images = False

# extensions (without the dot) that count as real images
image_ext = ['gif', 'jpg', 'jpeg', 'png']

# folder that is walked to find the stored .emlx emails
mail_folder = '/Users/JeffThompson/Library/Mail/V2'

# CSV file that receives the results
log_file = 'log.csv'


# start with a clean terminal ('cls' on Windows, 'clear' elsewhere), then say hello
if os.name == 'nt':
	clear_command = 'cls'
else:
	clear_command = 'clear'
os.system(clear_command)
print('BLANK EMAIL PIXELS')

# start a fresh log holding only the column names
# (mode 'w' discards whatever an earlier run left in the file)
with open(log_file, 'w') as log_header:
	log_header.write('index,filename,email_file,width,why_recorded,extension\n')


# walk the mail folder (including subfolders) and keep the full path
# of every file whose name ends in .emlx
print('Getting list of all emails (may take a while)...')
emails = [
	os.path.join(folder, filename)
	for folder, subfolders, filenames in os.walk(mail_folder)
	for filename in filenames
	if filename.endswith('.emlx')
]
print(' '.join(['- found', str(len(emails)), 'emails']))


# iterate, read, extract images
download_folder = 'DownloadedImages'	# downloaded images are saved here


def tracking_evidence(image):
	'''
	Inspect one <img ...> tag for signs of a tracking pixel.

	Returns (reason, width):
	- reason is None when nothing suspicious was found, otherwise one of
	  'display:none', 'visibility:hidden' or 'width'; when several signs
	  are present 'width' wins over 'visibility:hidden', which wins over
	  'display:none'
	- width is the matched width as a string ('0' or '1'), or 'n/a' when
	  the tag has no such width attribute
	'''
	reason = None
	if re.search(r'display:\s*none', image) is not None:
		reason = 'display:none'
	if re.search(r'visibility:\s*hidden', image) is not None:
		reason = 'visibility:hidden'

	# attributes are matched in quoted-printable form, where '=' is written =3D
	widths = re.findall(r'width=3D"([0-1])"', image)
	if widths:
		return 'width', str(widths[0])
	return reason, 'n/a'


def clean_extension(ext):
	'''
	Reduce the raw extension of an image URL to a plain '.xyz' form.

	Everything from the first '?', '&' or '%' onward is dropped, then any
	remaining '='. Returns '' for long (fake) extensions.
	'''
	ext = re.sub(r'[?&%].*$', '', ext)
	ext = ext.replace('=', '')

	# ext still carries its leading dot, so the longest accepted
	# extension ('.jpeg') is five characters long
	if len(ext) > 5:
		return ''
	return ext


print('\nFinding tracking images...')

# urlretrieve does not create missing folders, so make sure the target exists
if download_images and not os.path.isdir(download_folder):
	os.makedirs(download_folder)

for i, email in enumerate(emails):
	print('  ' + str(i + 1) + '/' + str(len(emails)))

	# read everything up front so the file is closed before any slow work
	with open(email) as e:
		text = e.read()

	# quoted-printable wraps long lines with a trailing '=' (soft line break);
	# remove it together with its newline so words split across lines
	# (ht=tp, .=gif) are whole again, then join the remaining lines
	text = re.sub(r'=\r?\n', '', text).replace('\n', '')

	for image in re.findall(r'<img.*?>', text):
		how_did_we_know, width = tracking_evidence(image)
		if how_did_we_know is None:
			continue			# nothing suspicious about this image
		print('  - ' + how_did_we_know)

		src = re.findall(r'src=3D"(.*?)"', image)
		if len(src) < 1:
			print('  - no src found, skipping...')
			continue

		# undo the quoted-printable escape for '=' inside the url
		src = src[0].replace('=3D', '=')

		# split off the extension and strip args from it
		name, ext = os.path.splitext(src)
		ext = clean_extension(ext)

		# ignore fake images, if specified
		# (image_ext entries have no dot, so compare without it)
		if ignore_fake_images and ext[1:].lower() not in image_ext:
			continue

		if download_images:
			print('  - attempting to download...')
			# name is a full url; only its last path segment is usable
			# as a filename
			destination = os.path.join(
				download_folder,
				os.path.basename(name) + '_' + format(i, '08') + ext
			)
			try:
				urllib.urlretrieve(name + ext, destination)
			except IOError:
				# a failed download is reported, the image is still logged
				print('  - ERROR, skipping...')

		# write details to log file
		# index, filename, email file, width, why recorded, extension (without .)
		print('  - storing to log...\n')
		row = str(i) + ',"' + name + ext + '","' + email + '",' + width + ',' + how_did_we_know + ',' + ext[1:].lower() + '\n'
		with open(log_file, 'a') as log:
			log.write(row)


# ALL DONE!
print('\n' + 'DONE!')
	import os, re, urllib

	'''
	BLANK EMAIL PIXELS
	Jeff Thompson | 2015 | www.jeffreythompson.org

	12,383 pixels = 111 x 112 (with 62px in the last row)

	'''

	# --- settings ---

	# attempt to download every tracking image that is found? (very slow)
	download_images = False

	# skip "images" whose extension is not in image_ext (sketchy .php, etc)?
	ignore_fake_images = False

	# extensions (without the dot) that count as real images
	image_ext = ['gif', 'jpg', 'jpeg', 'png']

	# folder that is walked to find the stored .emlx emails
	mail_folder = '/Users/JeffThompson/Library/Mail/V2'

	# CSV file that receives the results
	log_file = 'log.csv'


	# start with a clean terminal ('cls' on Windows, 'clear' elsewhere), then say hello
	os.system('cls' if os.name == 'nt' else 'clear')
	print('BLANK EMAIL PIXELS')

	# start a fresh log holding only the column names
	# (mode 'w' discards whatever an earlier run left in the file)
	with open(log_file, 'w') as f:
		f.write('index,filename,email_file,width,why_recorded,extension\n')


	# walk the mail folder (including subfolders) and keep the full path
	# of every file whose name ends in .emlx
	print('Getting list of all emails (may take a while)...')
	emails = [ ]
	for root, dirs, files in os.walk(mail_folder):
		for filename in files:
			if filename.endswith('.emlx'):
				emails.append(os.path.join(root, filename))
	print(' '.join(['- found', str(len(emails)), 'emails']))


	# iterate, read, extract images
	download_folder = 'DownloadedImages'	# downloaded images are saved here


	def tracking_evidence(image):
		'''
		Inspect one <img ...> tag for signs of a tracking pixel.

		Returns (reason, width):
		- reason is None when nothing suspicious was found, otherwise one of
		  'display:none', 'visibility:hidden' or 'width'; when several signs
		  are present 'width' wins over 'visibility:hidden', which wins over
		  'display:none'
		- width is the matched width as a string ('0' or '1'), or 'n/a' when
		  the tag has no such width attribute
		'''
		reason = None
		if re.search(r'display:\s*none', image) is not None:
			reason = 'display:none'
		if re.search(r'visibility:\s*hidden', image) is not None:
			reason = 'visibility:hidden'

		# attributes are matched in quoted-printable form, where '=' is written =3D
		widths = re.findall(r'width=3D"([0-1])"', image)
		if widths:
			return 'width', str(widths[0])
		return reason, 'n/a'


	def clean_extension(ext):
		'''
		Reduce the raw extension of an image URL to a plain '.xyz' form.

		Everything from the first '?', '&' or '%' onward is dropped, then any
		remaining '='. Returns '' for long (fake) extensions.
		'''
		ext = re.sub(r'[?&%].*$', '', ext)
		ext = ext.replace('=', '')

		# ext still carries its leading dot, so the longest accepted
		# extension ('.jpeg') is five characters long
		if len(ext) > 5:
			return ''
		return ext


	print('\nFinding tracking images...')

	# urlretrieve does not create missing folders, so make sure the target exists
	if download_images and not os.path.isdir(download_folder):
		os.makedirs(download_folder)

	for i, email in enumerate(emails):
		print(' ' + str(i + 1) + '/' + str(len(emails)))

		# read everything up front so the file is closed before any slow work
		with open(email) as e:
			text = e.read()

		# quoted-printable wraps long lines with a trailing '=' (soft line break);
		# remove it together with its newline so words split across lines
		# (ht=tp, .=gif) are whole again, then join the remaining lines
		text = re.sub(r'=\r?\n', '', text).replace('\n', '')

		for image in re.findall(r'<img.*?>', text):
			how_did_we_know, width = tracking_evidence(image)
			if how_did_we_know is None:
				continue			# nothing suspicious about this image
			print(' - ' + how_did_we_know)

			src = re.findall(r'src=3D"(.*?)"', image)
			if len(src) < 1:
				print(' - no src found, skipping...')
				continue

			# undo the quoted-printable escape for '=' inside the url
			src = src[0].replace('=3D', '=')

			# split off the extension and strip args from it
			name, ext = os.path.splitext(src)
			ext = clean_extension(ext)

			# ignore fake images, if specified
			# (image_ext entries have no dot, so compare without it)
			if ignore_fake_images and ext[1:].lower() not in image_ext:
				continue

			if download_images:
				print(' - attempting to download...')
				# name is a full url; only its last path segment is usable
				# as a filename
				destination = os.path.join(
					download_folder,
					os.path.basename(name) + '_' + format(i, '08') + ext
				)
				try:
					urllib.urlretrieve(name + ext, destination)
				except IOError:
					# a failed download is reported, the image is still logged
					print(' - ERROR, skipping...')

			# write details to log file
			# index, filename, email file, width, why recorded, extension (without .)
			print(' - storing to log...\n')
			row = str(i) + ',"' + name + ext + '","' + email + '",' + width + ',' + how_did_we_know + ',' + ext[1:].lower() + '\n'
			with open(log_file, 'a') as log:
				log.write(row)


	# ALL DONE!
	print('\n' + 'DONE!')