Created
September 22, 2014 06:02
-
-
Save fperez/c591cab03c66780b18d5 to your computer and use it in GitHub Desktop.
Check links in an IPython notebook. For images, try to copy missing ones from a globally-configured store.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""Check all figure links in an IPython notebook and try to copy missing ones.

Note: this quick script is adapted to a fairly specific pattern of
mine.  You should simply use it as a template and starting point for
your own functionality.

I keep a single large directory full of figures I reuse on all my
talks, and I normally just symlink it from a new talk directory.  But
this makes it harder to publish the talk sources in a self-contained
way (without including my monster figure storage dir).  This script
will find all missing image links and, for each one, look in my figure
storage directory for a file with the same name; if a match is found,
it is copied to the path named by the image tag's 'src' attribute.

Hyperlinks (<a> tags) are checked as well: http links are fetched and
their status reported, and other non-anchor links are tested as local
paths.

Many thanks to Brian Granger for the original NbConvertApp customization code
and tips on BeautifulSoup.
"""
from __future__ import print_function

# Storage area for missing figures.  When an <img> 'src' path does not exist
# on disk, a file with the same base name is looked up in this directory and,
# if present, linked/copied to the 'src' location.  This is a personal path:
# edit it to point at your own figure directory.
FIGSTORE = '/home/fperez/talks/slides/fig'
# Module imports | |
import logging | |
import os | |
import shutil | |
import sys | |
from bs4 import BeautifulSoup | |
import requests | |
from IPython.config.loader import Config | |
from IPython.config.application import catch_config_error | |
from IPython.utils.traitlets import Unicode | |
from IPython.nbconvert.nbconvertapp import NbConvertApp | |
from IPython.nbconvert.nbconvertapp import nbconvert_aliases, nbconvert_flags | |
from IPython.nbconvert.writers import WriterBase | |
# Main code | |
class LinkCheckWriter(WriterBase):
    """Writer that inspects exported HTML instead of writing it anywhere.

    The HTML produced for each notebook is parsed with BeautifulSoup; every
    ``<a>`` and ``<img>`` tag is checked and the findings are reported
    through ``self.log``.  Images that are missing locally but present in
    ``FIGSTORE`` (matched by base filename) are linked or copied into place.
    """

    # Seconds to wait on a remote server before a link check gives up.
    # Without a timeout, ``requests.get`` can block indefinitely.
    http_timeout = 10

    def copy_image(self, src, dst):
        """Copy src to dst, attempting to hard-link first.

        Parameters
        ----------
        src : path of the existing file to copy from.
        dst : path to create.

        Failures are logged and never raised; success is logged whether the
        hard link or the fallback copy produced the file.
        """
        log = self.log
        try:
            os.link(src, dst)
        except OSError:
            # Hard-linking can fail (e.g. src and dst on different
            # filesystems); fall back to a regular copy.
            try:
                shutil.copy(src, dst)
            except Exception:
                log.error("Image copy failed: %s" % sys.exc_info()[0])
                return
        log.warning("Successfully created missing image.")

    def verify_images(self, soup):
        """Verify all image references in a BeautifulSoup HTML object.

        Images with embedded ``data:image`` sources are only noted.  For
        every other image, the 'src' is tested as a local path; if it does
        not exist, a file with the same base name is looked up in
        ``FIGSTORE`` and copied to the 'src' location when found.

        Parameters
        ----------
        soup : BeautifulSoup object built from an HTML source.
        """
        log = self.log
        for i, img in enumerate(soup.find_all('img')):
            src = img.get('src')
            if not src:
                # An <img> without a usable src cannot be checked; report it
                # and keep going instead of failing on None.
                log.warning("Image #%s has no src attribute: %s", i, img)
                continue
            if src.startswith('data:image'):
                log.info('Image %s has embedded data.', i)
                # Keep checking the remaining images; one embedded image
                # says nothing about the others.
                continue
            if os.path.exists(src):
                log.info("Image #%s OK: %s", i, src)
                continue
            log.warning("Image #%s missing: %s", i, src)
            target_source = os.path.join(FIGSTORE, os.path.basename(src))
            if os.path.exists(target_source):
                log.warning('Available at: %s', FIGSTORE)
                self.copy_image(target_source, src)

    def verify_http_link(self, i, href):
        """Fetch an http(s) link and log whether it is reachable.

        Parameters
        ----------
        i : index of the link within the document, used only in log messages.
        href : URL to request.

        Any error raised while fetching is logged, not propagated.
        """
        log = self.log
        try:
            r = requests.get(href, timeout=self.http_timeout)
        except requests.ConnectionError:
            log.warning("Link #%s Conection Error: %s", i, href)
        except Exception:
            # Timeouts, invalid URLs, etc.  KeyboardInterrupt/SystemExit are
            # deliberately not caught so the run can still be aborted.
            log.error("Link #%s error: %s, %s", i, href, sys.exc_info()[0])
        else:
            stat = r.status_code
            if stat == requests.codes.ok:
                log.info("Link #%s OK (%s): %s ", i, stat, href)
            else:
                log.warning("Link #%s problem (%s): %s ", i, stat, href)

    def verify_links(self, soup):
        """Verify all links in a BeautifulSoup HTML object.

        Parameters
        ----------
        soup : BeautifulSoup object built from an HTML source.
        """
        log = self.log
        for i, lnk in enumerate(soup.find_all('a')):
            href = lnk.get('href')
            if href is None:
                log.warning("Malformed link: %s", lnk)
            elif href.startswith('http'):
                self.verify_http_link(i, href)
            elif href.startswith('#'):
                # In-page anchors are only reported, not resolved.
                log.info("Internal anchor link: %s", href)
            elif os.path.exists(href):
                log.info("Local valid link: %s", href)
            else:
                log.warning("Unkown link: %s", href)

    def write(self, output, resources, **kw):
        """Check the links and images of one exported notebook.

        Parameters
        ----------
        output : HTML source of the exported notebook.
        resources : mapping that must contain 'unique_key', used here to
            identify the notebook in the log.
        **kw : accepted and ignored.

        Nothing is written; results are reported via ``self.log``.
        """
        notebook_uri = resources['unique_key']
        self.log.warning('-'*40)
        self.log.warning('Checking notebook: %s', notebook_uri)
        soup = BeautifulSoup(output, "html.parser")
        self.verify_links(soup)
        self.verify_images(soup)
class LinkCheckApp(NbConvertApp):
    """NbConvertApp variant that checks notebook links instead of exporting.

    The application is initialized like a regular NbConvertApp, after which
    its writer is replaced by a ``LinkCheckWriter``.
    """

    name = Unicode(u'nblinkcheck')
    description = Unicode(u'Check image links in a notebook.')
    examples = (
        "\n"
        "To check all image links in all notebooks in the current directory:\n"
        "./nblinkcheck *ipynb\n"
    )

    def _export_format_default(self):
        """Return 'html'.

        Presumably picked up as the default value of the ``export_format``
        trait via the ``_<trait>_default`` naming convention -- confirm
        against the NbConvertApp version in use.
        """
        return 'html'

    def build_extra_config(self):
        """Merge an extra Config with an empty preprocessor list into ours."""
        self.extra_config = Config()
        self.extra_config.Exporter.preprocessors = []
        self.config.merge(self.extra_config)

    @catch_config_error
    def initialize(self, argv=None):
        """Initialize the app and install the link-checking writer.

        Parameters
        ----------
        argv : command-line arguments, passed through to the parent class.
        """
        # Meant to be used as a command-line app, so only log at a higher
        # level.  setLevel() is the public API; assigning to ``.level``
        # directly skips the logger's internal bookkeeping.
        self.log.setLevel(logging.WARNING)
        super(LinkCheckApp, self).initialize(argv)
        self.build_extra_config()
        # Replace whatever writer the parent set up with the checker.
        self.writer = LinkCheckWriter(parent=self)
if __name__ == "__main__":
    # Script entry point: run the link checker as a command-line application.
    LinkCheckApp.launch_instance()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment