Skip to content

Instantly share code, notes, and snippets.

@dado3212
Created October 12, 2018 04:08
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save dado3212/e0347f38475cc374a91f019381c0568c to your computer and use it in GitHub Desktop.
Save dado3212/e0347f38475cc374a91f019381c0568c to your computer and use it in GitHub Desktop.
A script that downloads every Pearls Before Swine comic strip from GoComics and runs each one through OCR
import datetime, requests, re, urllib
from StringIO import StringIO
from PIL import Image
import pytesseract
from cgi import escape
import json
# Root of the GoComics archive; the crawl appends a YYYY/MM/DD path to it.
base_url = "http://www.gocomics.com/pearlsbeforeswine/"

# First date the crawl requests.
start_date = datetime.datetime(year=2002, month=1, day=7)

# Browser-like User-Agent sent with every comic page request.
headers = {
    "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
}

# Crawl state: the records collected so far and the next date to fetch.
comics = list()
date = start_date
def write_to_file(comics):
    """Write the collected comics to ``index.html`` and ``json.txt``.

    ``index.html`` receives one ``<div class="comic">`` per entry holding the
    date, the strip image and, when the entry has one, its OCR text.
    ``json.txt`` receives the whole list serialised with ``json.dumps``.
    Both files are created in the current working directory and are
    overwritten if they already exist.

    Args:
        comics: list of dicts with the keys ``'date'`` and ``'small_url'``
            and, optionally, ``'ocr'``. The list must be JSON-serialisable.
    """
    def encoded(text):
        # Both files are opened in binary mode. On Python 2, writing unicode
        # text with non-ASCII characters (OCR output often has some) would
        # go through an implicit ASCII encode and raise UnicodeEncodeError,
        # so encode explicitly. Byte strings pass through untouched.
        return text if isinstance(text, bytes) else text.encode('utf-8')

    print('Writing')
    # `with` guarantees the handle is closed even if a write fails part-way.
    with open('index.html', 'wb') as page:
        # The charset declaration matches the UTF-8 encoding applied above.
        page.write(encoded('''
<html>
<head>
<meta charset="utf-8">
<style>
.comic {
width: 750px;
margin-bottom: 15px;
}
.comic img {
max-width: 100%;
}
.comic span {
font-weight: bold;
font-family: sans-serif;
font-size: 1.2em;
}
</style>
</head>
<body>
'''))
        for comic in comics:
            page.write(encoded('<div class="comic">'))
            page.write(encoded('<span>' + escape(comic['date']) + '</span>'))
            # The URL is scraped from a remote page: escape it (quotes
            # included) so it cannot break out of the src attribute.
            page.write(encoded('<img src="' + escape(comic['small_url'], True) + '" />'))
            if 'ocr' in comic:
                page.write(encoded('<p>' + escape(comic['ocr']) + '</p>'))
            page.write(encoded('</div>'))
        page.write(encoded('</body></html>'))

    with open('json.txt', 'wb') as dump:
        dump.write(encoded(json.dumps(comics)))
    print('Done')
# Crawl the archive one day at a time, starting from `date`. Each page is
# fetched, the strip image URL is pulled out with a regex, and the image is
# downloaded and run through OCR. The crawl ends when a page yields the same
# image URL as the previous day (presumably the site serving its latest strip
# for dates past the end of the archive -- TODO confirm) or when any step
# raises. Whatever was collected is always written out at the end.
last_url = ""
cont = True
try:
    while cont:
        try:
            with requests.Session() as c:
                # NOTE(review): verify=False switches off TLS certificate
                # checking for this request -- confirm it is still needed.
                comic = c.get(base_url + date.strftime('%Y/%m/%d'), verify=False, headers=headers)  # initializes the headers, cookies
                small_url = re.search('<img alt="Pearls Before Swine" class="strip" src="(.*?)"', comic.text).group(1)
                try:
                    url = re.search('zoom_link.*?src="(.*?)"', comic.text).group(1)
                except AttributeError:
                    # re.search returned None: the page has no zoom link, so
                    # fall back to the regular strip image.
                    url = small_url
                if url == last_url:
                    cont = False
                else:
                    response = urllib.urlopen(url)
                    try:
                        data = response.read()
                    finally:
                        # Release the connection even if the read fails.
                        response.close()
                    img = Image.open(StringIO(data))
                    comics.append({'url': url, 'small_url': small_url, 'date': date.strftime('%m/%d/%Y'), 'ocr': pytesseract.image_to_string(img)})
                    last_url = url
                    date = date + datetime.timedelta(days=1)
                    print(date.strftime('%Y/%m/%d'))
        except Exception as e:
            # Any fetch/parse/OCR failure ends the crawl; report it and fall
            # through to the save below instead of discarding the results.
            print(e)
            cont = False
except KeyboardInterrupt:
    # Ctrl-C stops the crawl early; progress is still saved below.
    pass
finally:
    write_to_file(comics)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment