Skip to content

Instantly share code, notes, and snippets.

@pplonski
Forked from brazilbean/preprocessors.py
Created November 18, 2022 08:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pplonski/23f57fab987e2382dc53669fc1900e64 to your computer and use it in GitHub Desktop.
Save pplonski/23f57fab987e2382dc53669fc1900e64 to your computer and use it in GitHub Desktop.
Custom Jupyter Notebook Pre-processors
'''Specialized Preprocessors'''
import nbconvert, nbformat, re, sys
from nbconvert.preprocessors import ExecutePreprocessor, Preprocessor
from traitlets import Dict, Unicode
from textwrap import dedent
from warnings import warn
def warn_deprecated(msg):
    '''Issue a DeprecationWarning carrying ``msg``.

    ``stacklevel=2`` is passed so the warning is attributed to the code
    that called this helper rather than to the helper itself.
    '''
    warn(message=msg, category=DeprecationWarning, stacklevel=2)
class RemoveCodePreprocessor(Preprocessor):
    '''Drop every code cell and every "special" markdown cell from a notebook.

    A markdown cell is special when its first whitespace-separated token
    consists solely of ``#`` characters (a header marker) and its second
    token is exactly ``!``.
    '''

    def _should_remove(self, cell):
        '''Return True when ``cell`` must be removed from the notebook.

        Code cells are always removed, markdown cells only when they carry
        the ``<#'s> !`` marker, and every other cell type is kept.
        '''
        if cell.cell_type == 'code':
            return True
        if cell.cell_type != 'markdown':
            return False
        # For cells starting with headers
        # The first token will be a set of #'s
        # Check for a '!' as the second token in the source
        tokens = cell.source.split()
        if len(tokens) < 2:
            # An empty cell, or one with a single token, cannot carry the
            # marker; without this guard the indexing below raises IndexError.
            return False
        return all(t == '#' for t in tokens[0]) and tokens[1] == '!'

    def preprocess(self, notebook, resources):
        '''Skip code cells and special markdown cells.

        Returns the ``(notebook, resources)`` pair with ``notebook.cells``
        replaced by the list of cells that were kept.
        '''
        notebook.cells = [cell for cell in notebook.cells if not self._should_remove(cell)]
        return notebook, resources
class FilterCodePreprocessor(Preprocessor):
    '''Remove code cells that contain a ``##hidecell`` marker.'''

    def _should_filter(self, cell):
        '''Tell whether ``cell`` is a code cell tagged for hiding.

        Only code cells are candidates; the ``##hidecell`` marker is matched
        case-insensitively anywhere in the cell source.
        '''
        is_code = cell.cell_type == 'code'
        return is_code and re.search('##hidecell', cell.source, re.I) is not None

    def preprocess(self, notebook, resources):
        '''Rebuild ``notebook.cells`` without the cells tagged for hiding.'''
        kept = []
        for candidate in notebook.cells:
            if self._should_filter(candidate):
                continue
            kept.append(candidate)
        notebook.cells = kept
        return notebook, resources
class ClearEmptyRawCellsPreprocessor(Preprocessor):
    '''Strip raw cells whose source is the empty string.'''

    def preprocess(self, notebook, resources):
        '''Keep every cell except raw cells with an empty source.'''
        survivors = []
        for cell in notebook.cells:
            # A cell stays unless it is both raw and empty
            if cell.cell_type != 'raw' or cell.source != '':
                survivors.append(cell)
        notebook.cells = survivors
        return notebook, resources
class ArgumentSubstitutionPreprocessor(Preprocessor):
    '''Substitute ``<<name>>`` placeholders in code cells with configured values.'''

    # Mapping of placeholder name -> replacement text (configurable trait)
    args = Dict(Unicode(),
                default_value={}).tag(config=True)

    def replace_variables(self, source, variables):
        """
        Replace <<variablename>> with stored value

        Parameters
        ----------
        source : str
            Cell source to scan for placeholders.
        variables : dict
            Maps placeholder names to their replacement text.

        Returns
        -------
        The source with every ``<<name>>`` replaced by ``variables[name]``.
        A placeholder whose name is not in ``variables`` is replaced by the
        bare name (the angle brackets are dropped). If the substitution
        raises TypeError, a warning is written to stderr and the source is
        returned unchanged.
        """
        # NOTE(review): the pattern also matches ordinary code such as
        # "a << b >> c" - confirm that placeholders cannot collide with
        # shift operators in the notebooks this is used on.
        try:
            replaced = re.sub("<<(.*?)>>", lambda m: variables.get(m.group(1), m.group(1)), source)
        except TypeError:
            # The stream must be passed as file=...; as a positional argument
            # it would merely be printed to stdout after the message.
            print("WARNING: unable to perform replacement in cell: {}".format(source), file=sys.stderr)
            replaced = source
        return replaced

    def preprocess_cell(self, cell, resources, index):
        """
        Preprocess cell

        Placeholders are substituted in code cells only, and only when at
        least one argument is configured; other cells pass through untouched.

        Parameters
        ----------
        cell : NotebookNode cell
            Notebook cell being processed
        resources : dictionary
            Additional resources used in the conversion process.  Allows
            preprocessors to pass variables into the Jinja engine.
        index : int
            Index of the cell being processed (see base.py)
        """
        if cell.cell_type == "code":
            if len(self.args) > 0:
                cell.source = self.replace_variables(cell.source, self.args)
        return cell, resources
class ExecuteCodeMarkdownPreprocessor(ExecutePreprocessor):
    '''Execute code cells and ``{{...}}`` snippets embedded in markdown cells.

    Every cell is first passed through a MetaCodePreprocessor; cells it
    rejects are replaced by an empty raw cell. Surviving code cells are
    executed, and in markdown cells each ``{{expression}}`` is executed and
    its rendered output stored in ``cell.metadata['variables']``.
    '''

    def __init__(self, **kw):
        self.sections = {'default': True}  # maps section ID to true or false
        # Placeholder returned for every cell the metacode processor drops.
        # NOTE(review): the same object is returned each time, so several
        # notebook positions may alias one cell - confirm this is acceptable.
        self.EmptyCell = nbformat.v4.nbbase.new_raw_cell("")
        self.MetaCodeProcessor = MetaCodePreprocessor(self)
        super().__init__(**kw)

    def preprocess_cell(self, cell, resources, cell_index):
        """
        Process a single cell. See base.py for details.
        To process all cells see :meth:`preprocess`.

        Returns a ``(cell, resources)`` pair. A TimeoutError raised while
        processing is reported on stderr and then re-raised.
        """
        try:
            cell, resources = self.MetaCodeProcessor.process_cell(cell, resources)
            if cell is None:
                # Metacode tag, or a cell inside an excluded block
                return self.EmptyCell, resources
            if cell.cell_type == 'code':
                return self.preprocess_code_cell(cell, resources, cell_index)
            if cell.cell_type == 'markdown':
                return self.preprocess_markdown_cell(cell, resources, cell_index)
            # Any other cell type passes through untouched
            return cell, resources
        except TimeoutError:
            print("Timeout on execution of cell: {}".format(cell.source), file=sys.stderr, flush=True)
            raise

    def preprocess_code_cell(self, cell, resources, cell_index):
        '''Run ``cell`` and store the outputs on it.

        Raises CellExecutionError for the first error output, unless
        ``self.allow_errors`` is set.
        '''
        # NOTE(review): assumes run_cell returns the list of outputs -
        # confirm against the installed nbconvert version.
        outputs = self.run_cell(cell)
        cell.outputs = outputs
        if not self.allow_errors:
            for out in outputs:
                if out.output_type == 'error':
                    # dedent() strips the source-level indentation, so the
                    # message text starts at column 0
                    pattern = u"""\
                        An error occurred while executing the following cell:
                        ------------------
                        {cell.source}
                        ------------------
                        {out.ename}: {out.evalue}
                        """
                    msg = dedent(pattern).format(out=out, cell=cell)
                    raise nbconvert.preprocessors.execute.CellExecutionError(msg)
        return cell, resources

    def preprocess_markdown_cell(self, cell, resources, cell_index):
        '''Execute each ``{{...}}`` snippet found in a markdown cell.

        The rendered result of every snippet is stored in
        ``cell['metadata']['variables']`` under the snippet source; the cell
        source itself is left unchanged.
        '''
        cell['metadata']['variables'] = {}
        for m in re.finditer("{{(.*?)}}", cell.source):
            # Run the snippet as if it were a code cell of its own
            fakecell = nbformat.v4.nbbase.new_code_cell(m.group(1))
            fakecell, resources = self.preprocess_code_cell(fakecell, resources, cell_index)
            # Keep the first output that can be rendered
            for output in fakecell.outputs:
                html = self.convert_output_to_html(output)
                if html is not None:
                    cell['metadata']['variables'][fakecell.source] = html
                    break
        return cell, resources

    def convert_output_to_html(self, output):
        '''Convert IOpub output to HTML

        Returns the rendered text, or None when the output type or every
        MIME type it offers is unsupported.

        See https://github.com/ipython-contrib/IPython-notebook-extensions/blob/master/nbextensions/usability/python-markdown/main.js
        '''
        if output['output_type'] == 'error':
            return '**' + output.ename + '**: ' + output.evalue
        if output.output_type == 'execute_result' or output.output_type == 'display_data':
            data = output.data
            # MIME types are checked in priority order
            if 'text/latex' in data:
                return data['text/latex']
            elif 'image/svg+xml' in data:
                # Not supported
                return None
            elif 'image/jpeg' in data:
                return '<img src="data:image/jpeg;base64,' + data['image/jpeg'] + '"/>'
            elif 'image/png' in data:
                return '<img src="data:image/png;base64,' + data['image/png'] + '"/>'
            elif 'text/markdown' in data:
                return data['text/markdown']
            elif 'text/html' in data:
                return data['text/html']
            elif 'text/plain' in data:
                text = data['text/plain']
                # Strip <p> and </p> tags
                text = re.sub(r'<p>([\s\S]*?)<\/p>', r'\1', text)
                # Strip quotes
                text = re.sub(r"'([\s\S]*?)'", r'\1', text)
                return text
            else:
                # Some tag we don't support
                return None
        return None
class MetaCodePreprocessor:
    '''Interpret ``#$`` metacode tags that include or exclude runs of cells.

    The object tracks a single "exclude mode" flag: while it is set, every
    ordinary cell is dropped. Tag cells themselves are always dropped.
    '''

    def __init__(self, cell_runner):
        '''Store ``cell_runner``, whose ``run_cell(cell)`` evaluates conditions.'''
        self._in_exclude_mode = False
        self.cell_runner = cell_runner

    def _output_to_bool(self, output):
        '''Convert cell execution output to a boolean'''
        if not output:
            # Empty string is false
            return False
        try:
            # Return the boolean version of evaluating the string
            # Should handle cases like "True", "False", "0", "1", etc.
            # NOTE(review): eval() runs text produced by the notebook kernel;
            # confirm that only trusted notebooks are processed.
            return bool(eval(output))
        except (NameError, SyntaxError):
            # The output did not comprise a valid python expression
            # (unknown name, or text such as an object repr that does not
            # parse); treat it as true
            return True

    def _evaluate_cell(self, cell):
        '''Evaluate the cell and return True or False

        The first output that carries ``text/plain`` data decides the
        result; when there is none the result defaults to True.
        '''
        outputs = self.cell_runner.run_cell(cell)
        for output in outputs:
            if "data" in output and "text/plain" in output.data:
                return self._output_to_bool(output.data["text/plain"])
        return True

    def process_cell(self, cell, resources):
        '''Identify and process metacode tags

        Metacode tags are found at the start of the cell source and begin
        with "#$", followed by a command and optional arguments.

        Returns ``(None, resources)`` for tag cells and for cells inside an
        excluded block, otherwise ``(cell, resources)``.
        '''
        # The arguments are optional so that bare tags such as "#$ else" or
        # "#$ endif" are recognised as well
        m = re.match(r'\#\$\s+(\w+)(?:\s+(.+))?', cell.source)
        if not m:
            if self._in_exclude_mode:
                return None, resources
            return cell, resources

        command = m.group(1)
        args = m.group(2) or ''
        if command == 'if':
            # Start if block
            self._in_exclude_mode = not self._evaluate_cell(cell)
        elif command == 'else':
            # Start else block
            self._in_exclude_mode = not self._in_exclude_mode
        elif command == 'end':
            # End block; the slice keeps a bare "#$ end" from raising
            if args.split()[:1] == ['if']:
                # End if/else block
                self._in_exclude_mode = False
            else:
                print("Unrecognized metacode end tag: " + m.group(0))
        elif command == 'endif':
            # Deprecated: end if/else block
            warn_deprecated("'endif' is deprecated. Use 'end if'")
            self._in_exclude_mode = False
        elif command == 'section':
            # Deprecated functionality kept for backwards compatibility
            warn_deprecated("'section' tag is deprecated. Use 'if' and 'end if'")
            if 'start' in args.lower():
                # Start section block
                self._in_exclude_mode = not self._evaluate_cell(cell)
            elif 'end' in args.lower():
                # End section block
                self._in_exclude_mode = False
        else:
            # Unrecognized tag
            print("Unrecognized metacode tag: " + m.group(0))
        # All cells with metacode tags are removed
        return None, resources
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment