# jimaples/ifttt_movies.py

## ifttt_movies.py
"""Lifehack to speed up going through IFTTT emails of new movies posted to Reddit
"""

import re
from itertools import count
from operator import itemgetter

import requests
from lxml import html
import webbrowser

# HTML <head> emitted verbatim at the top of the generated report.
# It contains three things:
#   * the page title;
#   * a small stylesheet that shrinks the font, hides elements with class
#     "tagline"/"flat-list" and all <h3> headings (presumably Reddit chrome
#     that comes along with the copied comment markup -- TODO confirm), and
#     draws a separator line under every ".ifttt_movie" block;
#   * two JavaScript helpers, showSource() and showContent(), that toggle
#     between the rendered movie list (#pagecontent) and an escaped,
#     one-movie-per-line view of its markup (#pagesource), swapping the
#     visibility of the two buttons (#btn_source / #btn_content) accordingly.
# NOTE: this is a non-raw Python string, so "\\n" below reaches the browser
# as the two characters "\n" inside the JavaScript regex/string literals.
s_header = '''<head>
<title>IFTTT Movies</title>
<style type="text/css">
	body {
		font-size: 75%;
	}
	.tagline, .flat-list, h3 {
		display:none;
	}
	p {
		margin: 4px 0;
	}
	blockquote {
		margin: 0 20px;
	}
	.ifttt_movie {
		border-bottom: 1px solid black;
	}
</style>
<script type="text/javascript">
function showSource(){
    var s = document.getElementById('pagecontent');
    var source = s.innerHTML;
    //now we need to escape the html special chars, javascript has escape
    //but this does not do what we want
    source = source.replace(/</g, "&lt;").replace(/>/g, "&gt;");
    //now we remove newlines and add <pre> tags to preserve whitespace
    source = "<pre>"+source.replace(/\\n/g,'')+"</pre>";
    //now populate the source div
    var e = document.getElementById('pagesource');
    //1 movie per line
    e.innerHTML = source.replace(/&lt;div class="ifttt_movie"/g,'\\n&lt;div class="ifttt_movie"');
    e.style.display = 'block';
    s.style.display = 'none';
	document.getElementById('btn_content').style.display = 'block';
	document.getElementById('btn_source').style.display = 'none';
	return source;
}
function showContent(){
	document.getElementById('pagecontent').style.display = 'block';
	document.getElementById('pagesource').style.display = 'none';
	document.getElementById('btn_content').style.display = 'none';
	document.getElementById('btn_source').style.display = 'block';
}
</script>
</head>'''


# Matches an IMDb score written as "<digit>.<digit>/10", optionally wrapped in
# <strong> tags (e.g. "<strong>7.5</strong>/10"); group 1 is the score itself.
# Written as a raw string so that \d and \. reach the regex engine unchanged
# instead of relying on Python passing unknown string escapes through.
re_imdb = re.compile(r'(?:<strong>)*(\d\.\d)(?:</strong>)*/10')

# Running 1-based counter used to number the movies in processing order.
processMovieIdx = count(1)

def processMovie(name, link):
	"""Fetch one Reddit post and build the report snippet for its movie.

	Args:
		name: Movie title as found in the email (treated as plain text).
		link: URL that followed the title in the email.

	Returns:
		dict with the keys
		``name``/``link`` (the arguments, unchanged),
		``index`` (1-based running number taken from ``processMovieIdx``),
		``id`` (HTML element id, ``ifttt_movieNNN``),
		``status`` (HTTP status code, or 0 when the request itself failed),
		``rating`` (score matched by ``re_imdb``, e.g. ``'7.5'``, or ``'???'``
		when no score was found) and
		``html`` (the complete ``<div class="ifttt_movie">`` snippet).
	"""
	# Local import: the module-level name ``html`` is lxml's, so the escaping
	# helper comes from xml.sax (available on both Python 2 and 3).
	from xml.sax.saxutils import escape

	i = next(processMovieIdx)
	o = {'name':name,'link':link,'index':i,'rating':'???'}
	o['id'] = 'ifttt_movie{:03d}'.format(i)
	# follow the link (the user-agent is to get around the bot-catcher)
	try:
		# A timeout keeps one stalled server from hanging the whole run.
		site = requests.get(link, headers={'user-agent':'ifttt-email/0.0.'+str(i)}, timeout=30)
		o['status'] = site.status_code
	except requests.RequestException:
		# One unreachable link should not abort the batch; the movie is
		# still listed, just without the bot post and rating.
		site = None
		o['status'] = 0
	print('{index:2d} ({status:d}) : {name:s}'.format(**o))
	# Header of the snippet: remove button plus the link from the email.
	# Title and URL are plain text, so escape them before embedding in HTML.
	s = '''<div class="ifttt_movie" id="{id:s}">
<button onclick="document.getElementById('{id:s}').outerHTML = '';">X</button>
<a href="{link:s}">{name:s}</a><br>'''.format(
		id=o['id'],
		link=escape(link, {'"': '&quot;'}),
		name=escape(name))
	if site is not None and site.status_code == 200: # 200 = OK
		# parse the HTML response
		tree = html.fromstring(site.text)
		# find the IMDb bot post: grandparent of a link mentioning MovieGuide
		div = tree.xpath("//a[contains(.,'MovieGuide')]/../..")
		if div:
			post = html.tostring(div[0])
			if not isinstance(post, str):
				# Python 3: tostring() returns ASCII-encoded bytes by default.
				post = post.decode('ascii')
			s += post
			# Search only the bot post, so that a score-like fragment in
			# the title or URL cannot be mistaken for the rating.
			score = re_imdb.findall(post)
			if score:
				o['rating'] = score[0]
	o['html'] = s+'</div>'
	return o

if __name__ == '__main__':
	# Script entry point: read the IFTTT email, look up every linked movie,
	# write an HTML report (plus a debug dump) next to the email file and
	# open the report in the browser.
	# Local imports keep this block self-contained.
	import os.path
	from xml.sax.saxutils import escape

	email = '/home/admin/Documents/Python/IPython/ifttt_email.txt'
	# abspath() makes a bare file name resolve to the current directory, so
	# the outputs always land in the same folder as the email.
	folder = os.path.dirname(os.path.abspath(email))
	output = os.path.join(folder, 'ifttt_movies.html')
	debug = os.path.join(folder, 'ifttt_debug.txt')

	with open(email, 'r') as fp:
		s = fp.read()

	# An entry is a title line, a blank line, then a line starting with http.
	links = re.findall('(.*)\n\n(http[^\n]+)', s)
	print('{:d} links found'.format(len(links)))

	# process the links
	movies = [processMovie(title, url) for title, url in links]
	with open(debug, 'w') as fp:
		fp.write(repr(movies) + '\n')

	# sort by rating, best first; ratings are strings, and '???' compares
	# greater than any digit, so movies without a rating are listed first
	movies.sort(key=itemgetter('rating'), reverse=True)

	with open(output, 'w') as fp:
		fp.write('<html>' + s_header + '<body><ol>\n')
		# ToC (titles are plain text from the email, so escape them)
		for m in movies:
			fp.write('<li>IMDb {rating:s} : <a href="#{id:s}">{name:s}</a></li>\n'.format(
				rating=m['rating'], id=m['id'], name=escape(m['name'])))
		fp.write('''</ol>
<button id="btn_content" style="display:none;" onclick="showContent();">Show Content</button>
<button id="btn_source" onclick="showSource();">Show Source</button>
<hr><div id="pagecontent">
''')
		# MovieGuide posts
		for m in movies:
			fp.write(m['html'] + '\n')
		fp.write('</div><div id="pagesource"></div></body></html>\n')

	# open file in new tab
	print('Opening results in browser: '+output)
	webbrowser.open('file://'+output, 2)