Skip to content

Instantly share code, notes, and snippets.

@dckc
Created May 25, 2012 06:37
Show Gist options
  • Save dckc/2786188 to your computer and use it in GitHub Desktop.
Save dckc/2786188 to your computer and use it in GitHub Desktop.
url_update.py -- an example of balancing ocap style and python community norms
'''url_update -- assist in migrating URLs of Frontiers Research pages
>>> web_ua = _MostPagesOKButSome404(('20-million-clinical', 'out-big-5'))
>>> site = Frontiers(web_ua)
>>> src, dest = site.redirects[0]
>>> print site.verified_redirect(src, dest)
Redirect permanent /news.html http://frontiersresearch.org/frontiers/news
>>> print site.gone(site.retired[0])
Redirect gone /subscribe-for-updates-from-frontiers.html
ref http://httpd.apache.org/docs/2.0/mod/mod_alias.html
.. note:: We use the Apache Redirect directive, which does
prefix matching, so src paths must not be a prefix
of any path that you don't want redirected.
>>> bad_pages = [src for src, dest in site.redirects
... if not _catch_io(site.verified_redirect, (src, dest))]
>>> len(bad_pages)
2
>>> bad_pages
... #doctest: +NORMALIZE_WHITESPACE
['/news/kumc-receives-20-million-clinical-and-translational-science-award.html',
'/news/greater-kc-chamber-roles-out-big-5.html']
'''
import logging
from urlparse import urljoin
from posixpath import splitext, split as split_path
log = logging.getLogger(__name__)
def powerbox_main():
    '''Script entry point: build a web opener and run the migration.

    `urllib2` is imported here, inside the function, rather than at
    module level; the opener it builds is handed to `Frontiers`, and the
    module-level logger and the `logging` module are handed to
    `cap_main`.
    '''
    import urllib2

    opener = urllib2.build_opener()
    cap_main(log, Frontiers(opener), logging)
def cap_main(log, site, logging):
    '''Print Apache Redirect directives for every page known to `site`.

    Each (src, dest) pair in `site.redirects` is checked with
    `site.verified_redirect`; the resulting directive is printed, or a
    warning is logged when the check raises IOError.  Afterwards a
    `Redirect gone` directive is printed for each path in `site.retired`.

    :param log: logger used for warnings; must provide `warning`.
    :param site: object providing `redirects`, `retired`,
                 `verified_redirect(src, dest)` and `gone(src)`.
    :param logging: the logging module, or a stand-in providing
                    `basicConfig` and `WARN`.
    '''
    logging.basicConfig(level=logging.WARN)

    for src, dest in site.redirects:
        try:
            directive = site.verified_redirect(src, dest)
        except IOError:
            # `warning` is the documented method; `warn` is a deprecated
            # alias that newer Python versions no longer provide.
            log.warning('unable to verify: %s', src)
        else:
            # Kept out of the `try` so that an IOError while writing the
            # output is not misreported as a verification failure.
            # The single-argument parenthesised form prints the same text
            # whether `print` is a statement or a function.
            print(directive)

    for src in site.retired:
        print(site.gone(src))
class Migrator(object):
    '''Base class that turns page moves into Apache Redirect directives.

    Subclasses provide `base` and `redirects`.  The web user agent given
    to the constructor is the only thing used to reach the network; it
    must offer `open(address)` returning an object with `close()`.
    '''

    def __init__(self, web_ua):
        # Double underscore: the attribute is name-mangled, so it is not
        # reachable as plain `__ua` from outside this class.
        self.__ua = web_ua

    @property
    def redirects(self):
        '''(src, dest) pairs to redirect; must be supplied by subclasses.'''
        raise NotImplementedError()

    @property
    def base(self):
        '''URL that destinations are joined onto; must be supplied by subclasses.'''
        raise NotImplementedError()

    def verified_redirect(self, src, dest):
        '''Return a `Redirect permanent` directive after checking `dest`.

        `dest` is joined onto `self.base` and opened with the user agent;
        the response is closed straight away.  Any exception raised by
        the user agent's `open` propagates to the caller.
        '''
        target = urljoin(self.base, dest)
        log.info('verifying: %s', target)
        # Opening the address is the whole check; the body is not read.
        self.__ua.open(target).close()
        fields = {'src': src, 'dest': target}
        return 'Redirect permanent %(src)s %(dest)s' % fields

    def gone(self, src):
        '''Return a `Redirect gone` directive for `src`.'''
        fields = {'src': src}
        return 'Redirect gone %(src)s' % fields
def _batch_fix(p):
'''
>>> _batch_fix('/researcher-resources/clinical-and-translational-science-unit-(ctsu)/ctsu-resources.html')
'ctsu-resources'
'''
no_special_chars = p.replace('(', '').replace(')', '')
last_segment = split_path(no_special_chars)[1]
return splitext(last_segment)[0]
def _catch_io(f, args):
try:
return f(*args)
except IOError:
return None
class Frontiers(Migrator):
    '''Redirect data for the Frontiers Research site.

    `redirects` pairs each old path with its new location; `retired`
    lists old paths that get a `Redirect gone` directive.
    '''

    base = 'http://frontiersresearch.org'

    # Old paths whose new location is '/frontiers/' plus the name that
    # _batch_fix derives from the path.
    batch_moves = '''
/news.html
/news/kumc-receives-20-million-clinical-and-translational-science-award.html
/news/greater-kc-chamber-roles-out-big-5.html
/researcher-resources.html
/researcher-resources/regulatory-knowledge-and-support-program.html
/researcher-resources/pilot-and-collaborative-studies-funding-program.html
/researcher-resources/community-partnership-for-health.html
/researcher-resources/biostatistics.html
/researcher-resources/biomedical-informatics.html
/researcher-resources/ethics-program.html
/researcher-resources/evaluation-program.html
/research-education.html
/research-education/tl1.html
/research-education/training-grants.html
/our-partners.html
/advisory-structure.html
/advisory-structure/leadership-team.html
/advisory-structure/deans-council.html
/advisory-structure/health-systems-leadership-council.html
/advisory-structure/community-council.html
/advisory-structure/external-scientific-advisory-committee.html
/citing-frontiers-support.html
/researcher-resources/clinical-and-translational-science-unit-(ctsu).html
/researcher-resources/clinical-and-translational-science-unit-(ctsu)/ctsu-resources.html
/researcher-resources/translational-technologies-resource-center-(ttrc).html
/researcher-resources/personalized-medicine-and-outcomes-center-(pmoc).html
/researcher-resources/institute-for-advancing-medical-innovation-(iami).html
'''.split()

    redirects = [(p, '/frontiers/' + _batch_fix(p)) for p in batch_moves]
    # One move whose destination is given explicitly rather than derived.
    redirects.append((
        '/researcher-resources/pilot-and-collaborative-studies-funding-program/pilot-studies-grant-awards.html',
        '/frontiers/training-grants'))

    # Not in current site but may need redirect to main page
    retired = '''
/subscribe-for-updates-from-frontiers.html
/news/the-ethics-of-translational-research.html
/researcher-resources/clinical-and-translational-science-unit-(ctsu)/ctsu-committee-members.html
/researcher-resources/clinical-and-translational-science-unit-(ctsu)/ctsu-application-and-submission-process.html
'''.split()
class _MostPagesOKButSome404(object):
'''Raise 404 for about 1 in 10 web pages.
'''
def __init__(self, bad):
self.bad = bad
def open(self, address):
from StringIO import StringIO
if [txt for txt in self.bad if txt in address]:
raise IOError('404...')
return StringIO('page content...')
# When executed as a script (not imported), run the migration through
# powerbox_main, which builds the urllib2 opener.
if __name__ == '__main__':
    powerbox_main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment