Skip to content

Instantly share code, notes, and snippets.

@poizan42
Last active January 30, 2018 18:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save poizan42/71b7f49072596ddfc2b3117d81b27a48 to your computer and use it in GitHub Desktop.
Save poizan42/71b7f49072596ddfc2b3117d81b27a48 to your computer and use it in GitHub Desktop.
Update links to Spinnerette comics to current format
import sys
from datetime import datetime
import re
import urllib2
from bs4 import BeautifulSoup
# Prefix of the Wayback Machine snapshot through which old-format pages are
# fetched; the original URL is appended to it.
wayback_prefix = 'https://web.archive.org/web/20160824022930/'
# Base of the current comic URL format; a per-comic suffix is appended to it.
newformat_prefix = 'http://www.spinnyverse.com/comic/'


def load_cache(lines):
    """Parse cache lines of the form ``<id> <suffix>`` into a dict.

    Args:
        lines: iterable of text lines (for example an open file).

    Returns:
        A dict mapping each id to its suffix.

    Lines that do not consist of exactly two whitespace-separated fields
    (blank lines, or a line cut short by an interrupted run) are skipped
    rather than aborting start-up with a ValueError.
    """
    entries = {}
    for line in lines:
        fields = line.split()
        if len(fields) == 2:
            entries[fields[0]] = fields[1]
    return entries


# 'a+' creates the cache file on first use and keeps it open for appending
# (update_url writes new entries to it); rewind first so the existing
# entries can be read.
cachef = open('spinny-cache', 'a+')
cachef.seek(0, 0)
cache = load_cache(cachef)
def _fetch_new_suffix(org_uri, comic_id):
    """Work out the new-format URL suffix for one comic from its archived page.

    Args:
        org_uri: the old-format URL; it is appended to ``wayback_prefix``.
        comic_id: the numeric id from the old URL (used in messages only).

    Returns:
        The suffix to append to ``newformat_prefix``.

    Raises:
        ValueError: if the archived page has neither of the expected comic
            images, or the image path does not have the expected shape.
    """
    sys.stderr.write('Fetching page for ' + comic_id + '\n')
    wayback_page = urllib2.urlopen(wayback_prefix + org_uri)
    try:
        soup = BeautifulSoup(wayback_page, 'html.parser')
    finally:
        # The response is fully parsed at this point; release the connection.
        wayback_page.close()

    img = soup.find('img', attrs={'id': 'cc-comic'})
    if img is not None:
        # Slug from the image title: lower-case, spaces become hyphens,
        # runs of hyphens are collapsed to one.
        slug = img['title'].lower().replace(' ', '-')
        return re.sub('-+', '-', slug)

    # No 'cc-comic' image: fall back to the 'comic' image, whose file name
    # is expected to be a YYYY-MM-DD date.
    img = soup.find('img', attrs={'id': 'comic'})
    if img is None:
        raise ValueError('No comic image found on archived page for id '
                         + comic_id)
    src_match = re.match(
        r'/web/\d+im_/http://www\.spinnyverse\.com:80/comics/(.+)\.jpg',
        img['src'])
    if src_match is None:
        raise ValueError('Unexpected comic image path for id ' + comic_id
                         + ': ' + img['src'])
    comicdate = datetime.strptime(src_match.group(1), '%Y-%m-%d')
    return comicdate.strftime('%m-%d-%Y')


def update_url(m):
    """Return the new-format URL for an old id-based link (re.sub callback).

    Args:
        m: regex match whose group(0) is the whole old URL and whose
            group(1) is the comic id.

    Returns:
        ``newformat_prefix`` plus the comic's suffix, ASCII-encoded.

    The suffix is taken from ``cache`` when present. Otherwise it is looked
    up via the archived page, stored in ``cache`` and appended to the cache
    file so later runs do not need to fetch it again.
    """
    org_uri = m.group(0)
    comic_id = m.group(1)
    try:
        new_suffix = cache[comic_id]
    except KeyError:
        new_suffix = _fetch_new_suffix(org_uri, comic_id)
        cache[comic_id] = new_suffix
        cachef.write(comic_id + ' ' + new_suffix + '\n')
        # Flush right away so an entry survives if a later fetch fails.
        cachef.flush()
    # NOTE(review): the encode presumably turns the unicode text coming from
    # the parsed page into a plain byte string for Python 2's re.sub --
    # confirm before porting to Python 3.
    return (newformat_prefix + new_suffix).encode('ascii')
def update_format2(m):
    """Return the new-format URL for an old date-based link (re.sub callback).

    Args:
        m: regex match whose groups 2-4 are year, month and day taken from
            the path, and whose groups 5-7 are month, day and year taken
            from the trailing eight-digit component.

    Returns:
        ``newformat_prefix`` followed by ``MM-DD-YYYY``.

    Raises:
        ValueError: if the two dates embedded in the URL disagree.
    """
    year, month, day = m.group(2), m.group(3), m.group(4)
    month2, day2, year2 = m.group(5), m.group(6), m.group(7)
    # Explicit check instead of `assert`: assertions are removed under -O,
    # which would let inconsistent URLs be rewritten silently.
    if (year, month, day) != (year2, month2, day2):
        raise ValueError('Inconsistent dates in URL: ' + m.group(0))
    return '{0}{1}-{2}-{3}'.format(newformat_prefix, month, day, year)
# Script body: read the file named by the first command-line argument,
# rewrite both kinds of old comic links, and print the result to stdout.
try:
    if len(sys.argv) < 2:
        sys.exit('usage: ' + sys.argv[0] + ' FILE')
    with open(sys.argv[1], 'r') as f:
        cnt = f.read()
    # Old id-based links (index.php?id=N) are resolved by update_url.
    newcnt = re.sub(
        r'http://www\.spinnyverse\.com/index\.php\?id=(\d+)',
        update_url, cnt)
    # Old date-based links (/YYYY/MM/DD/MMDDYYYY/) on either host are
    # rewritten by update_format2.
    newcnt = re.sub(
        r'http://www\.(spinnyverse\.com|krakowstudios\.com/spinnerette)'
        r'/(\d\d\d\d)/(\d\d)/(\d\d)/(\d\d)(\d\d)(\d\d\d\d)/?',
        update_format2, newcnt)
finally:
    # Close the cache file even when a substitution fails part-way.
    cachef.close()
# Same output as the former print statement: the text plus one newline.
sys.stdout.write(newcnt + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment