# dado3212/downloadPearls.py

## downloadPearls.py
import datetime, requests, re, urllib
from StringIO import StringIO
from PIL import Image
import pytesseract
from cgi import escape
import json

# Root of the per-day comic pages; the crawl appends the date as YYYY/MM/DD.
base_url = "http://www.gocomics.com/pearlsbeforeswine/"

# Browser-like User-Agent sent with every page request.
headers = {
	"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
}

# First date requested; `date` is the cursor the crawl advances from there.
start_date = datetime.datetime(2002, 1, 7)
date = start_date

# One dict per downloaded strip is appended here as the crawl progresses.
comics = []

def write_to_file(comics):
	"""Write the collected comics to ``index.html`` and ``json.txt``.

	Both files are created in the current working directory and are
	overwritten if they already exist. ``index.html`` gets one
	``<div class="comic">`` per entry; ``json.txt`` gets ``json.dumps(comics)``.

	Args:
		comics: List of dicts, one per strip. Each needs ``'date'`` and
			``'small_url'`` text entries; an optional ``'ocr'`` entry is
			HTML-escaped and rendered as a paragraph below the image.

	Raises:
		KeyError: If an entry lacks ``'date'`` or ``'small_url'``. Nothing is
			written in that case, because both payloads are built in memory
			before either file is opened.
	"""
	def as_bytes(text):
		# The files are opened in binary mode. Unicode text holding non-ASCII
		# characters cannot be written to them implicitly, so encode it
		# explicitly and pass already-encoded byte strings through untouched.
		return text if isinstance(text, bytes) else text.encode('utf-8')

	print('Writing')

	pieces = ['''
		<html>
			<head>
				<style>
					.comic {
						width: 750px;
						margin-bottom: 15px;
					}

					.comic img {
						max-width: 100%;
					}

					.comic span {
						font-weight: bold;
						font-family: sans-serif;
						font-size: 1.2em;
					}
				</style>
			</head>
			<body>

	''']

	for comic in comics:
		pieces.append('<div class="comic">')
		pieces.append('<span>' + comic['date'] + '</span>')
		# The URL is scraped from a remote page and lands inside a quoted
		# attribute, so escape quotes as well as &, < and >.
		pieces.append('<img src="' + escape(comic['small_url'], True) + '" />')
		if 'ocr' in comic:
			pieces.append('<p>' + escape(comic['ocr']) + '</p>')
		pieces.append('</div>')

	pieces.append('</body></html>')

	# Encode piece by piece: joining byte strings and unicode directly would
	# force an implicit ASCII decode of the byte strings.
	html_payload = b''.join(as_bytes(piece) for piece in pieces)
	json_payload = as_bytes(json.dumps(comics))

	# Context managers close the handles even if a write fails part-way.
	with open('index.html', 'wb') as f:
		f.write(html_payload)

	with open('json.txt', 'wb') as t:
		t.write(json_payload)

	print('Done')

# URL of the strip fetched on the previous iteration. Seeing the same URL on
# two consecutive dates ends the crawl (presumably the site serves the latest
# strip for dates past the end of the archive -- TODO confirm).
last_url = ""

cont = True
try:
	while cont:
		try:
			with requests.Session() as c:
				comic = c.get(base_url + date.strftime('%Y/%m/%d'), verify=False, headers=headers) # initializes the headers, cookies
				small_url = re.search('<img alt="Pearls Before Swine" class="strip" src="(.*?)"', comic.text).group(1)

				# Prefer the zoomed image when the page offers one; otherwise
				# fall back to the regular strip image.
				zoom = re.search('zoom_link.*?src="(.*?)"', comic.text)
				url = zoom.group(1) if zoom else small_url

				if url == last_url:
					cont = False
				else:
					data = urllib.urlopen(url).read()
					img = Image.open(StringIO(data))

					comics.append({'url': url, 'small_url': small_url, 'date': date.strftime('%m/%d/%Y'), 'ocr': pytesseract.image_to_string(img)})
				last_url = url

			date = date + datetime.timedelta(days=1)
			print(date.strftime('%Y/%m/%d'))
		except Exception as e:
			# Any failure (network error, page layout change, unreadable
			# image) stops the crawl; whatever was collected is still saved
			# by the finally clause below.
			print(e)
			cont = False
except KeyboardInterrupt:
	# Ordinary errors are handled inside the loop, so what reaches here is a
	# manual interrupt; swallow it so the script ends quietly after saving.
	pass
finally:
	# Persist results exactly once, however the loop ended.
	write_to_file(comics)