Skip to content

Instantly share code, notes, and snippets.

Created May 25, 2012 06:37
Show Gist options
  • Save dckc/2786188 to your computer and use it in GitHub Desktop.
Save dckc/2786188 to your computer and use it in GitHub Desktop. -- an example of balancing ocap style and python community norms
'''url_update -- assist in migrating URLs of Frontiers Research pages
>>> web_ua = _MostPagesOKButSome404(('20-million-clinical', 'out-big-5'))
>>> site = Frontiers(web_ua)
>>> src, dest = site.redirects[0]
>>> print site.verified_redirect(src, dest)
Redirect permanent /news.html
>>> print site.gone(site.retired[0])
Redirect gone /subscribe-for-updates-from-frontiers.html
.. note:: We use the Apache ``Redirect`` directive, which does
          prefix matching, so a src path must not be a prefix
          of any other path that should not be redirected.
>>> bad_pages = [src for src, dest in site.redirects
... if not _catch_io(site.verified_redirect, (src, dest))]
>>> len(bad_pages)
>>> bad_pages
'''
import logging
from urlparse import urljoin
from posixpath import splitext, split as split_path
log = logging.getLogger(__name__)
def powerbox_main():
    '''Script entry point: acquire the real capabilities and run.

    Builds a urllib2 opener, wraps it in a `Frontiers` migrator and
    passes that, together with the module logger, to `cap_main`.
    '''
    # urllib2 is imported here rather than at module level, so the
    # rest of the module only reaches the web through the opener it
    # is handed.
    import urllib2

    web_ua = urllib2.build_opener()
    site = Frontiers(web_ua)
    cap_main(log, site, logging)
def cap_main(log, site, logging):
    '''Print redirect directives for every page of `site`.

    :param log: logger used to report destinations that could not be
                verified.
    :param site: migrator providing `redirects`, `retired`,
                 `verified_redirect` and `gone`.
    :param logging: accepted for the caller's convenience; not used
                    by this function.

    A redirect whose destination cannot be verified (IOError) is
    logged and skipped; it does not stop the run.
    '''
    for src, dest in site.redirects:
        try:
            directive = site.verified_redirect(src, dest)
        except IOError:
            log.warning('unable to verify: %s', src)
        else:
            # Single-argument print(...) behaves the same on
            # Python 2 and Python 3.
            print(directive)

    for src in site.retired:
        print(site.gone(src))
class Migrator(object):
    '''Produce Apache ``Redirect`` directives for a site migration.

    Subclasses provide two attributes:

    ``base``
        URL against which destination paths are resolved.
    ``redirects``
        sequence of ``(src, dest)`` pairs.

    All web access goes through the user agent passed to the
    constructor, so a stand-in can be supplied for testing.
    '''

    def __init__(self, web_ua):
        '''
        :param web_ua: object with an ``open(address)`` method; it is
                       expected to raise IOError when the page cannot
                       be fetched.
        '''
        self.__ua = web_ua

    # `redirects` and `base` are read as plain attributes (see
    # `verified_redirect`), so the abstract placeholders are
    # properties: touching one that a subclass forgot to define
    # raises NotImplementedError instead of yielding a bound method.
    @property
    def redirects(self):
        '''``(src, dest)`` pairs to redirect; subclasses override.'''
        raise NotImplementedError()

    @property
    def base(self):
        '''Base URL for destinations; subclasses override.'''
        raise NotImplementedError()

    def verified_redirect(self, src, dest):
        '''Confirm `dest` can be fetched, then format its redirect.

        :param src: path being redirected.
        :param dest: new location, resolved against ``self.base``.
        :return: a ``Redirect permanent <src> <absolute dest>`` line.
        :raises IOError: whatever the user agent raises when the
                         destination cannot be opened.
        '''
        ua = self.__ua
        addr = urljoin(self.base, dest)
        # NOTE(review): the logging method name was lost from the
        # garbled source line; ``info`` is assumed -- confirm.
        log.info('verifying: %s', addr)
        # Only reachability matters, so the response is released
        # immediately rather than left open.
        page = ua.open(addr)
        page.close()
        return 'Redirect permanent %(src)s %(dest)s' % dict(
            src=src, dest=addr)

    def gone(self, src):
        '''Format a ``Redirect gone`` directive for a retired path.'''
        return 'Redirect gone %(src)s' % dict(src=src)
def _batch_fix(p):
>>> _batch_fix('/researcher-resources/clinical-and-translational-science-unit-(ctsu)/ctsu-resources.html')
no_special_chars = p.replace('(', '').replace(')', '')
last_segment = split_path(no_special_chars)[1]
return splitext(last_segment)[0]
def _catch_io(f, args):
return f(*args)
except IOError:
return None
# NOTE(review): the literal data of this class did not survive in this
# copy of the file: `base` is empty, both ''' strings are unterminated
# and the list after `+ [` has no elements.  The values must be
# restored from the original source before this class can be parsed.
# Presumably `batch_moves` and `retired` were lists of old paths and
# the trailing list held explicit (src, dest) pairs -- confirm.
class Frontiers(Migrator):
base = ''
batch_moves = '''
redirects = [(p, '/frontiers/' + _batch_fix(p))
for p in batch_moves] + [
'''Not in current site but may need redirect to main page'''
retired = '''
class _MostPagesOKButSome404(object):
'''Raise 404 for about 1 in 10 web pages.
def __init__(self, bad):
self.bad = bad
def open(self, address):
from StringIO import StringIO
if [txt for txt in self.bad if txt in address]:
raise IOError('404...')
return StringIO('page content...')
if __name__ == '__main__':
    # Run the migration only when executed as a script; importing
    # the module stays free of side effects.
    powerbox_main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment