Skip to content

Instantly share code, notes, and snippets.

@fperez
Created September 22, 2014 06:02
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fperez/c591cab03c66780b18d5 to your computer and use it in GitHub Desktop.
Save fperez/c591cab03c66780b18d5 to your computer and use it in GitHub Desktop.
Check links in an IPython notebook. For images, try to copy missing ones from a globally-configured store.
#!/usr/bin/env python
"""Check all figure links in an IPython notebook and try to copy missing ones.
Note: this quick script is adapted to a fairly specific pattern of
mine. You should simply use it as a template and starting point for
your own functionality.
I keep a single large directory full of figures I reuse on all my
talks, and I normally just symlink it from a new talk directory. But
this makes it harder to publish the talk sources in a self-contained
way (without including my monster figure storage dir). This script
will find all missing image links and, when a file with the same name
exists in my figure storage directory, try to copy that figure to the
path given by the image tag's 'src' attribute.
Many thanks to Brian Granger for the original NbConvertApp customization code
and tips on BeautifulSoup.
"""
from __future__ import print_function
# Storage area for missing figures: when an image referenced by a notebook
# does not exist locally, a file with the same basename is looked up in this
# directory and copied to the referenced path.  The value is specific to the
# original author's machine; change it before using the script elsewhere.
FIGSTORE = '/home/fperez/talks/slides/fig'
# Module imports
import logging
import os
import shutil
import sys
from bs4 import BeautifulSoup
import requests
from IPython.config.loader import Config
from IPython.config.application import catch_config_error
from IPython.utils.traitlets import Unicode
from IPython.nbconvert.nbconvertapp import NbConvertApp
from IPython.nbconvert.nbconvertapp import nbconvert_aliases, nbconvert_flags
from IPython.nbconvert.writers import WriterBase
# Main code
class LinkCheckWriter(WriterBase):
    """Writer that checks the links and images of converted HTML.

    Instead of writing the converted notebook anywhere, ``write`` parses the
    HTML with BeautifulSoup, checks every ``<a>`` and ``<img>`` tag and
    reports the outcome through ``self.log``.  A missing local image is
    re-created from ``FIGSTORE`` when a file with the same basename exists
    there.
    """

    # Seconds to wait on a remote server before giving up on a link.  Without
    # a timeout, a single unresponsive host would block the run indefinitely.
    http_timeout = 30

    def copy_image(self, src, dst):
        """Copy src to dst, attempting to hard-link first.

        Parameters
        ----------
        src : path of the existing image file.
        dst : path at which the image should be created.

        Failures are logged rather than raised, so one bad image does not
        abort the check of the remaining ones.
        """
        log = self.log
        try:
            os.link(src, dst)
        except OSError:
            # Hard-linking is not always possible (e.g. when dst lives on a
            # different filesystem), so fall back to a regular copy.
            try:
                shutil.copy(src, dst)
            except Exception:
                log.error("Image copy failed: %s", sys.exc_info()[0])
                return
        # Reached when either the link or the fallback copy succeeded.
        log.warning("Successfully created missing image.")

    def verify_images(self, soup):
        """Verify all image references in a BeautifulSoup HTML object.

        Images with embedded data are only logged.  For every other image
        the 'src' attribute is treated as a local path; if it does not exist
        and a file with the same basename is found in FIGSTORE, that file is
        copied to the 'src' path.

        Parameters
        ----------
        soup : BeautifulSoup object built from an HTML source.
        """
        log = self.log
        for i, img in enumerate(soup.find_all('img')):
            src = img.get('src')
            if not src:
                # No usable path to check; skip instead of failing on None.
                log.warning("Image #%s has no 'src' attribute: %s", i, img)
                continue
            if src.startswith('data:image'):
                log.info('Image %s has embedded data.', i)
                # Keep going: later images may still be file references.
                continue
            if os.path.exists(src):
                log.info("Image #%s OK: %s", i, src)
                continue
            log.warning("Image #%s missing: %s", i, src)
            target_source = os.path.join(FIGSTORE, os.path.basename(src))
            if os.path.exists(target_source):
                log.warning('Available at: %s', FIGSTORE)
                self.copy_image(target_source, src)

    def verify_http_link(self, i, href):
        """Fetch an http(s) link and log whether it could be retrieved.

        Parameters
        ----------
        i : index of the link in the document, used only in log messages.
        href : URL to request.
        """
        log = self.log
        try:
            r = requests.get(href, timeout=self.http_timeout)
        except requests.ConnectionError:
            log.warning("Link #%s Conection Error: %s", i, href)
        except Exception:
            # Any other failure (timeout, invalid URL, ...) is reported with
            # its exception type; KeyboardInterrupt is deliberately not caught
            # so the user can still abort the run.
            log.error("Link #%s error: %s, %s", i, href, sys.exc_info()[0])
        else:
            stat = r.status_code
            if stat == requests.codes.ok:
                log.info("Link #%s OK (%s): %s ", i, stat, href)
            else:
                log.warning("Link #%s problem (%s): %s ", i, stat, href)

    def verify_links(self, soup):
        """Verify all links in a BeautifulSoup HTML object.

        Links starting with 'http' are requested over the network, links
        starting with '#' are only logged as internal anchors, and anything
        else is treated as a local path whose existence is checked.

        Parameters
        ----------
        soup : BeautifulSoup object built from an HTML source.
        """
        log = self.log
        for i, lnk in enumerate(soup.find_all('a')):
            href = lnk.get('href')
            if href is None:
                log.warning("Malformed link: %s", lnk)
            elif href.startswith('http'):
                self.verify_http_link(i, href)
            elif href.startswith('#'):
                log.info("Internal anchor link: %s", href)
            elif os.path.exists(href):
                log.info("Local valid link: %s", href)
            else:
                log.warning("Unkown link: %s", href)

    def write(self, output, resources, **kw):
        """Check the links and images of one converted notebook.

        Parameters
        ----------
        output : HTML text produced by the exporter.
        resources : dict of conversion resources; its 'unique_key' entry is
            used to identify the notebook in the log.
        **kw : accepted for interface compatibility and ignored.
        """
        notebook_uri = resources['unique_key']
        self.log.warning('-'*40)
        self.log.warning('Checking notebook: %s', notebook_uri)
        soup = BeautifulSoup(output, "html.parser")
        self.verify_links(soup)
        self.verify_images(soup)
class LinkCheckApp(NbConvertApp):
    # NbConvertApp subclass whose writer is a LinkCheckWriter and whose
    # default export format is HTML.

    name = Unicode(u'nblinkcheck')
    description = Unicode(u'Check image links in a notebook.')
    examples = """
To check all image links in all notebooks in the current directory:
./nblinkcheck *ipynb
"""

    def _export_format_default(self):
        # Default value for the export format.
        return 'html'

    def build_extra_config(self):
        # Build the extra configuration (an empty preprocessor list for the
        # exporter) and merge it into the application's config.
        extra = self.extra_config = Config()
        extra.Exporter.preprocessors = []
        self.config.merge(extra)

    @catch_config_error
    def initialize(self, argv=None):
        # This is a command-line tool: only warnings and above are shown.
        self.log.level = logging.WARNING
        super(LinkCheckApp, self).initialize(argv)
        self.build_extra_config()
        # Swap in the writer that checks links.
        self.writer = LinkCheckWriter(parent=self)
# Script entry point: launch the link checker when run from the command line.
if __name__ == "__main__":
    LinkCheckApp.launch_instance()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment