pplonski/preprocessors.py

## preprocessors.py
'''Specialized Preprocessors'''

import nbconvert, nbformat, re, sys
from nbconvert.preprocessors import ExecutePreprocessor, Preprocessor
from traitlets import Dict, Unicode
from textwrap import dedent

from warnings import warn
def warn_deprecated(msg):
    '''Emit ``msg`` as a DeprecationWarning.'''
    # stacklevel=2 attributes the warning to the code that called this
    # helper rather than to the helper itself.
    warn(message=msg, category=DeprecationWarning, stacklevel=2)


class RemoveCodePreprocessor(Preprocessor):
    '''Drop every code cell and every specially marked markdown cell.

    A markdown cell is marked when its first whitespace-separated token is
    made up only of ``#`` characters and its second token is exactly ``!``
    (for example ``## ! internal notes``).
    '''

    def _should_remove(self, cell):
        '''Return True when ``cell`` must be left out of the notebook.'''
        if cell.cell_type == 'code':
            return True
        if cell.cell_type != 'markdown':
            return False

        tokens = cell.source.split()
        # An empty cell, or one holding a single token, cannot carry the
        # "<hashes> !" marker. Checking first avoids the IndexError that
        # indexing tokens[0] / tokens[1] would raise on such cells.
        if len(tokens) < 2:
            return False
        header, marker = tokens[0], tokens[1]
        return all(ch == '#' for ch in header) and marker == '!'

    def preprocess(self, notebook, resources):
        '''Skip code cells and special markdown cells

        Replaces ``notebook.cells`` with the filtered list and returns
        ``(notebook, resources)``.
        '''
        notebook.cells = [cell for cell in notebook.cells
                          if not self._should_remove(cell)]
        return notebook, resources

class FilterCodePreprocessor(Preprocessor):
    '''Remove code cells that contain a ``##hidecell`` marker.'''

    def _should_filter(self, cell):
        '''Tell whether ``cell`` is a code cell tagged with ``##hidecell``.

        The marker is searched for anywhere in the cell source, ignoring case.
        '''
        if cell.cell_type == 'code':
            found = re.search('##hidecell', cell.source, re.I)
            return found is not None
        return False

    def preprocess(self, notebook, resources):
        '''Keep only the cells that are not filtered; return ``(notebook, resources)``.'''
        kept = []
        for cell in notebook.cells:
            if self._should_filter(cell):
                continue
            kept.append(cell)
        notebook.cells = kept
        return notebook, resources

class ClearEmptyRawCellsPreprocessor(Preprocessor):
    '''Strip raw cells whose source is the empty string.'''

    def preprocess(self, notebook, resources):
        '''Drop empty raw cells from ``notebook`` and return ``(notebook, resources)``.'''
        remaining = []
        for cell in notebook.cells:
            # A cell survives unless it is both raw and empty.
            if cell.cell_type != 'raw' or cell.source != '':
                remaining.append(cell)
        notebook.cells = remaining
        return notebook, resources

class ArgumentSubstitutionPreprocessor(Preprocessor):
    '''Substitute ``<<name>>`` placeholders in code cells with configured values.'''

    # Placeholder name -> replacement text; tagged as a configurable trait.
    args = Dict(Unicode(),
                default_value={}).tag(config=True)

    def replace_variables(self, source, variables):
        """
        Replace <<variablename>> with stored value

        Parameters
        ----------
        source : str
            Cell source to scan for ``<<name>>`` placeholders.
        variables : mapping
            Placeholder name -> replacement string.

        Returns
        -------
        str
            ``source`` with every placeholder replaced. A placeholder whose
            name is not in ``variables`` is replaced by the bare name (the
            angle brackets are dropped). If a replacement is not a string,
            a warning is written to stderr and ``source`` is returned
            unchanged.
        """
        try:
            replaced = re.sub(r"<<(.*?)>>",
                              lambda m: variables.get(m.group(1), m.group(1)),
                              source)
        except TypeError:
            # file= is required here: passing sys.stderr positionally would
            # print its repr to stdout instead of writing to stderr.
            print("WARNING: unable to perform replacement in cell: {}".format(source),
                  file=sys.stderr)
            replaced = source
        return replaced

    def preprocess_cell(self, cell, resources, index):
        """
        Preprocess cell

        Parameters
        ----------
        cell : NotebookNode cell
            Notebook cell being processed
        resources : dictionary
            Additional resources used in the conversion process.  Allows
            preprocessors to pass variables into the Jinja engine.
        index : int
            Index of the cell being processed (see base.py)

        Returns
        -------
        tuple
            ``(cell, resources)``; only code cells are modified, and only
            when ``args`` is non-empty.
        """
        if cell.cell_type == "code" and self.args:
            cell.source = self.replace_variables(cell.source, self.args)
        return cell, resources

class ExecuteCodeMarkdownPreprocessor(ExecutePreprocessor):
    '''Execute code cells and ``{{...}}`` snippets embedded in markdown cells.

    Each cell is first handed to a :class:`MetaCodePreprocessor`; when that
    returns ``None`` for the cell, an empty raw cell is emitted in its place.
    '''

    def __init__(self, **kw):
        '''Set up helper state, then defer to the parent initializer.'''
        self.sections = {'default': True} # maps section ID to true or false
        # Retained for backward compatibility. preprocess_cell builds a fresh
        # placeholder per removed cell instead of reusing this one object.
        self.EmptyCell = nbformat.v4.nbbase.new_raw_cell("")
        self.MetaCodeProcessor = MetaCodePreprocessor(self)

        super().__init__(**kw)

    def preprocess_cell(self, cell, resources, cell_index):
        """
        Process a single cell and return ``(cell, resources)``.

        Code cells are executed, markdown cells have their ``{{...}}``
        snippets executed, and any other cell type is returned untouched.
        Cells rejected by the metacode processor are replaced by an empty
        raw cell. To process all cells see :meth:`preprocess`.

        A ``TimeoutError`` is reported on stderr and re-raised.
        """
        try:
            cell, resources = self.MetaCodeProcessor.process_cell(cell, resources)
            if cell is None:
                # A new placeholder each time: sharing one cell object across
                # several notebook positions would alias their contents.
                return nbformat.v4.nbbase.new_raw_cell(""), resources

            if cell.cell_type == 'code':
                return self.preprocess_code_cell(cell, resources, cell_index)
            if cell.cell_type == 'markdown':
                return self.preprocess_markdown_cell(cell, resources, cell_index)
            # Any other cell type passes through unchanged
            return cell, resources
        except TimeoutError:
            print("Timeout on execution of cell: {}".format(cell.source), file=sys.stderr, flush=True)
            raise

    def preprocess_code_cell(self, cell, resources, cell_index):
        '''Run ``cell`` and store the produced outputs on it.

        Raises ``CellExecutionError`` on the first error output unless
        ``self.allow_errors`` is set. Returns ``(cell, resources)``.
        '''
        outputs = self.run_cell(cell)
        cell.outputs = outputs

        if not self.allow_errors:
            for out in outputs:
                if out.output_type == 'error':
                    msg = (
                        "An error occurred while executing the following cell:\n"
                        "------------------\n"
                        "{cell.source}\n"
                        "------------------\n"
                        "{out.ename}: {out.evalue}\n"
                    ).format(out=out, cell=cell)
                    raise nbconvert.preprocessors.execute.CellExecutionError(msg)

        return cell, resources

    def preprocess_markdown_cell(self, cell, resources, cell_index):
        '''Execute every ``{{code}}`` snippet found in a markdown cell.

        The first convertible output of each snippet is stored in
        ``cell['metadata']['variables']`` keyed by the snippet source.
        Returns ``(cell, resources)``.
        '''
        cell['metadata']['variables'] = {}
        for m in re.finditer(r"{{(.*?)}}", cell.source):
            # Run the snippet as if it were a code cell of its own
            fakecell = nbformat.v4.nbbase.new_code_cell(m.group(1))
            fakecell, resources = self.preprocess_code_cell(fakecell, resources, cell_index)

            # Keep the first output that can be rendered
            for output in fakecell.outputs:
                html = self.convert_output_to_html(output)
                if html is not None:
                    cell['metadata']['variables'][fakecell.source] = html
                    break
        return cell, resources

    def convert_output_to_html(self, output):
        '''Convert IOpub output to HTML

        Returns a string, or ``None`` when the output type or every MIME
        type it offers is unsupported.

        See https://github.com/ipython-contrib/IPython-notebook-extensions/blob/master/nbextensions/usability/python-markdown/main.js
        '''
        if output.output_type == 'error':
            return '**' + output.ename + '**: ' + output.evalue
        if output.output_type not in ('execute_result', 'display_data'):
            return None

        # MIME types are checked in a fixed order of preference
        data = output.data
        if 'text/latex' in data:
            return data['text/latex']
        if 'image/svg+xml' in data:
            # SVG is not supported
            return None
        for mime in ('image/jpeg', 'image/png'):
            if mime in data:
                return '<img src="data:' + mime + ';base64,' + data[mime] + '"/>'
        for mime in ('text/markdown', 'text/html'):
            if mime in data:
                return data[mime]
        if 'text/plain' in data:
            text = data['text/plain']
            # Strip <p> and </p> tags, then strip single quotes
            text = re.sub(r'<p>([\s\S]*?)<\/p>', r'\1', text)
            text = re.sub(r"'([\s\S]*?)'", r'\1', text)
            return text
        # Some MIME type we don't support
        return None

class MetaCodePreprocessor:
    '''Interpret ``#$`` metacode tags and decide which cells are kept.

    A single "exclude mode" flag is tracked: while it is set, every cell
    handed to :meth:`process_cell` is dropped. Cells holding a tag are
    always dropped.
    '''

    def __init__(self, cell_runner):
        '''Store ``cell_runner``; its ``run_cell(cell)`` evaluates conditions.'''
        self._in_exclude_mode = False
        self.cell_runner = cell_runner

    def _output_to_bool(self, output):
        '''Convert cell execution output to a boolean'''
        if not output:
            # Empty string is false
            return False
        try:
            # Return the boolean version of evaluating the string
            # Should handle cases like "True", "False", "0", "1", etc.
            # NOTE(review): eval() executes the output text as Python in this
            # process; acceptable only for trusted notebooks.
            return bool(eval(output))
        except (NameError, SyntaxError):
            # The output did not comprise a valid python expression, e.g. an
            # unknown name or a repr like "<Foo object at 0x...>".
            # Return True
            return True

    def _evaluate_cell(self, cell):
        '''Evaluate the cell and return True or False'''
        # Execute the cell source to see whether the section should be kept
        # Default to True when no text/plain output is produced
        outputs = self.cell_runner.run_cell(cell)
        for output in outputs:
            if "data" in output and "text/plain" in output.data:
                return self._output_to_bool(output.data["text/plain"])
        return True

    def process_cell(self, cell, resources):
        '''Identify and process metacode tags

        Metacode tags are found on the first line of the cell and start with "#$"

        Returns ``(None, resources)`` for tag cells and for cells seen while
        exclude mode is active, ``(cell, resources)`` otherwise.
        '''
        # The argument part is optional so that bare tags such as "#$ else"
        # are recognised too.
        m = re.match(r'\#\$\s+(\w+)(?:\s+(.+)|\s*$)', cell.source)
        if m is None:
            if self._in_exclude_mode:
                return None, resources
            return cell, resources

        command = m.group(1)
        args = m.group(2) or ''
        if command == 'if':
            # Start if block
            self._in_exclude_mode = not self._evaluate_cell(cell)

        elif command == 'else':
            # Start else block
            self._in_exclude_mode = not self._in_exclude_mode

        elif command == 'end':
            # End block; the slice keeps an argument-less "end" from raising
            if args.split()[:1] == ['if']:
                # End if/else block
                self._in_exclude_mode = False
            else:
                print("Unrecognized metacode end tag: " + m.group(0))

        elif command == 'endif':
            # Deprecated: end if/else block
            warn_deprecated("'endif' is deprecated. Use 'end if'")
            self._in_exclude_mode = False

        elif command == 'section':
            # Deprecated functionality kept for backwards compatibility
            warn_deprecated("'section' tag is deprecated. Use 'if' and 'end if'")
            if 'start' in args.lower():
                # Start section block
                self._in_exclude_mode = not self._evaluate_cell(cell)
            elif 'end' in args.lower():
                # End section block
                self._in_exclude_mode = False
        else:
            # Unrecognized section tag
            print("Unrecognized metacode tag: " + m.group(0))

        # All cells with metacode tags are removed
        return None, resources
	'''Specialized Preprocessors'''

	import nbconvert, nbformat, re, sys
	from nbconvert.preprocessors import ExecutePreprocessor, Preprocessor
	from traitlets import Dict, Unicode
	from textwrap import dedent

	from warnings import warn
	def warn_deprecated(msg):
	    '''Emit ``msg`` as a DeprecationWarning.'''
	    # stacklevel=2 attributes the warning to the code that called this
	    # helper rather than to the helper itself.
	    warn(msg, DeprecationWarning, stacklevel=2)


	class RemoveCodePreprocessor(Preprocessor):
	    '''Drop every code cell and every specially marked markdown cell.

	    A markdown cell is marked when its first whitespace-separated token is
	    made up only of ``#`` characters and its second token is exactly ``!``
	    (for example ``## ! internal notes``).
	    '''

	    def _should_remove(self, cell):
	        '''Return True when ``cell`` must be left out of the notebook.'''
	        if cell.cell_type == 'code':
	            return True
	        if cell.cell_type != 'markdown':
	            return False

	        tokens = cell.source.split()
	        # An empty cell, or one holding a single token, cannot carry the
	        # "<hashes> !" marker. Checking first avoids the IndexError that
	        # indexing tokens[0] / tokens[1] would raise on such cells.
	        if len(tokens) < 2:
	            return False
	        header, marker = tokens[0], tokens[1]
	        return all(ch == '#' for ch in header) and marker == '!'

	    def preprocess(self, notebook, resources):
	        '''Skip code cells and special markdown cells

	        Replaces ``notebook.cells`` with the filtered list and returns
	        ``(notebook, resources)``.
	        '''
	        notebook.cells = [cell for cell in notebook.cells
	                          if not self._should_remove(cell)]
	        return notebook, resources

	class FilterCodePreprocessor(Preprocessor):
	    '''Remove code cells that contain a ``##hidecell`` marker.'''

	    def _should_filter(self, cell):
	        '''Determine whether a cell should be filtered

	        Only code cells qualify; the ``##hidecell`` marker is searched for
	        anywhere in the cell source, ignoring case.
	        '''
	        if cell.cell_type != 'code':
	            return False
	        # Check for a ##Hidecell comment
	        return re.search('##hidecell', cell.source, re.I) is not None

	    def preprocess(self, notebook, resources):
	        '''Filter code cells and return ``(notebook, resources)``.'''
	        notebook.cells = [cell for cell in notebook.cells
	                          if not self._should_filter(cell)]
	        return notebook, resources

	class ClearEmptyRawCellsPreprocessor(Preprocessor):
	    '''Remove empty raw cells from the notebook'''

	    def preprocess(self, notebook, resources):
	        '''Drop raw cells whose source is ``''``; return ``(notebook, resources)``.'''
	        notebook.cells = [cell for cell in notebook.cells
	                          if not (cell.cell_type == 'raw' and cell.source == '')]
	        return notebook, resources

	class ArgumentSubstitutionPreprocessor(Preprocessor):
	    '''Substitute ``<<name>>`` placeholders in code cells with configured values.'''

	    # Placeholder name -> replacement text; tagged as a configurable trait.
	    args = Dict(Unicode(),
	                default_value={}).tag(config=True)

	    def replace_variables(self, source, variables):
	        """
	        Replace <<variablename>> with stored value

	        Parameters
	        ----------
	        source : str
	            Cell source to scan for ``<<name>>`` placeholders.
	        variables : mapping
	            Placeholder name -> replacement string.

	        Returns
	        -------
	        str
	            ``source`` with every placeholder replaced. A placeholder whose
	            name is not in ``variables`` is replaced by the bare name (the
	            angle brackets are dropped). If a replacement is not a string,
	            a warning is written to stderr and ``source`` is returned
	            unchanged.
	        """
	        try:
	            replaced = re.sub(r"<<(.*?)>>",
	                              lambda m: variables.get(m.group(1), m.group(1)),
	                              source)
	        except TypeError:
	            # file= is required here: passing sys.stderr positionally would
	            # print its repr to stdout instead of writing to stderr.
	            print("WARNING: unable to perform replacement in cell: {}".format(source),
	                  file=sys.stderr)
	            replaced = source
	        return replaced

	    def preprocess_cell(self, cell, resources, index):
	        """
	        Preprocess cell

	        Parameters
	        ----------
	        cell : NotebookNode cell
	            Notebook cell being processed
	        resources : dictionary
	            Additional resources used in the conversion process.  Allows
	            preprocessors to pass variables into the Jinja engine.
	        index : int
	            Index of the cell being processed (see base.py)

	        Returns
	        -------
	        tuple
	            ``(cell, resources)``; only code cells are modified, and only
	            when ``args`` is non-empty.
	        """
	        if cell.cell_type == "code" and self.args:
	            cell.source = self.replace_variables(cell.source, self.args)
	        return cell, resources

	class ExecuteCodeMarkdownPreprocessor(ExecutePreprocessor):
	    '''Execute code cells and ``{{...}}`` snippets embedded in markdown cells.

	    Each cell is first handed to a :class:`MetaCodePreprocessor`; when that
	    returns ``None`` for the cell, an empty raw cell is emitted in its place.
	    '''

	    def __init__(self, **kw):
	        '''Set up helper state, then defer to the parent initializer.'''
	        self.sections = {'default': True} # maps section ID to true or false
	        # Retained for backward compatibility. preprocess_cell builds a fresh
	        # placeholder per removed cell instead of reusing this one object.
	        self.EmptyCell = nbformat.v4.nbbase.new_raw_cell("")
	        self.MetaCodeProcessor = MetaCodePreprocessor(self)

	        super().__init__(**kw)

	    def preprocess_cell(self, cell, resources, cell_index):
	        """
	        Process a single cell and return ``(cell, resources)``.

	        Code cells are executed, markdown cells have their ``{{...}}``
	        snippets executed, and any other cell type is returned untouched.
	        Cells rejected by the metacode processor are replaced by an empty
	        raw cell. To process all cells see :meth:`preprocess`.

	        A ``TimeoutError`` is reported on stderr and re-raised.
	        """
	        try:
	            cell, resources = self.MetaCodeProcessor.process_cell(cell, resources)
	            if cell is None:
	                # A new placeholder each time: sharing one cell object across
	                # several notebook positions would alias their contents.
	                return nbformat.v4.nbbase.new_raw_cell(""), resources

	            if cell.cell_type == 'code':
	                return self.preprocess_code_cell(cell, resources, cell_index)
	            if cell.cell_type == 'markdown':
	                return self.preprocess_markdown_cell(cell, resources, cell_index)
	            # Any other cell type passes through unchanged
	            return cell, resources
	        except TimeoutError:
	            print("Timeout on execution of cell: {}".format(cell.source), file=sys.stderr, flush=True)
	            raise

	    def preprocess_code_cell(self, cell, resources, cell_index):
	        '''Run ``cell`` and store the produced outputs on it.

	        Raises ``CellExecutionError`` on the first error output unless
	        ``self.allow_errors`` is set. Returns ``(cell, resources)``.
	        '''
	        outputs = self.run_cell(cell)
	        cell.outputs = outputs

	        if not self.allow_errors:
	            for out in outputs:
	                if out.output_type == 'error':
	                    msg = (
	                        "An error occurred while executing the following cell:\n"
	                        "------------------\n"
	                        "{cell.source}\n"
	                        "------------------\n"
	                        "{out.ename}: {out.evalue}\n"
	                    ).format(out=out, cell=cell)
	                    raise nbconvert.preprocessors.execute.CellExecutionError(msg)

	        return cell, resources

	    def preprocess_markdown_cell(self, cell, resources, cell_index):
	        '''Execute every ``{{code}}`` snippet found in a markdown cell.

	        The first convertible output of each snippet is stored in
	        ``cell['metadata']['variables']`` keyed by the snippet source.
	        Returns ``(cell, resources)``.
	        '''
	        cell['metadata']['variables'] = {}
	        for m in re.finditer(r"{{(.*?)}}", cell.source):
	            # Run the snippet as if it were a code cell of its own
	            fakecell = nbformat.v4.nbbase.new_code_cell(m.group(1))
	            fakecell, resources = self.preprocess_code_cell(fakecell, resources, cell_index)

	            # Keep the first output that can be rendered
	            for output in fakecell.outputs:
	                html = self.convert_output_to_html(output)
	                if html is not None:
	                    cell['metadata']['variables'][fakecell.source] = html
	                    break
	        return cell, resources

	    def convert_output_to_html(self, output):
	        '''Convert IOpub output to HTML

	        Returns a string, or ``None`` when the output type or every MIME
	        type it offers is unsupported.

	        See https://github.com/ipython-contrib/IPython-notebook-extensions/blob/master/nbextensions/usability/python-markdown/main.js
	        '''
	        if output.output_type == 'error':
	            # Error name is wrapped in markdown bold markers
	            return '**' + output.ename + '**: ' + output.evalue
	        if output.output_type not in ('execute_result', 'display_data'):
	            return None

	        # MIME types are checked in a fixed order of preference
	        data = output.data
	        if 'text/latex' in data:
	            return data['text/latex']
	        if 'image/svg+xml' in data:
	            # SVG is not supported
	            return None
	        for mime in ('image/jpeg', 'image/png'):
	            if mime in data:
	                return '<img src="data:' + mime + ';base64,' + data[mime] + '"/>'
	        for mime in ('text/markdown', 'text/html'):
	            if mime in data:
	                return data[mime]
	        if 'text/plain' in data:
	            text = data['text/plain']
	            # Strip <p> and </p> tags, then strip single quotes
	            text = re.sub(r'<p>([\s\S]*?)<\/p>', r'\1', text)
	            text = re.sub(r"'([\s\S]*?)'", r'\1', text)
	            return text
	        # Some MIME type we don't support
	        return None

	class MetaCodePreprocessor:
	    '''Interpret ``#$`` metacode tags and decide which cells are kept.

	    A single "exclude mode" flag is tracked: while it is set, every cell
	    handed to :meth:`process_cell` is dropped. Cells holding a tag are
	    always dropped.
	    '''

	    def __init__(self, cell_runner):
	        '''Store ``cell_runner``; its ``run_cell(cell)`` evaluates conditions.'''
	        self._in_exclude_mode = False
	        self.cell_runner = cell_runner

	    def _output_to_bool(self, output):
	        '''Convert cell execution output to a boolean'''
	        if not output:
	            # Empty string is false
	            return False
	        try:
	            # Return the boolean version of evaluating the string
	            # Should handle cases like "True", "False", "0", "1", etc.
	            # NOTE(review): eval() executes the output text as Python in this
	            # process; acceptable only for trusted notebooks.
	            return bool(eval(output))
	        except (NameError, SyntaxError):
	            # The output did not comprise a valid python expression, e.g. an
	            # unknown name or a repr like "<Foo object at 0x...>".
	            # Return True
	            return True

	    def _evaluate_cell(self, cell):
	        '''Evaluate the cell and return True or False'''
	        # Execute the cell source to see whether the section should be kept
	        # Default to True when no text/plain output is produced
	        outputs = self.cell_runner.run_cell(cell)
	        for output in outputs:
	            if "data" in output and "text/plain" in output.data:
	                return self._output_to_bool(output.data["text/plain"])
	        return True

	    def process_cell(self, cell, resources):
	        '''Identify and process metacode tags

	        Metacode tags are found on the first line of the cell and start with "#$"

	        Returns ``(None, resources)`` for tag cells and for cells seen while
	        exclude mode is active, ``(cell, resources)`` otherwise.
	        '''
	        # The argument part is optional so that bare tags such as "#$ else"
	        # are recognised too.
	        m = re.match(r'\#\$\s+(\w+)(?:\s+(.+)|\s*$)', cell.source)
	        if m is None:
	            if self._in_exclude_mode:
	                return None, resources
	            return cell, resources

	        command = m.group(1)
	        args = m.group(2) or ''
	        if command == 'if':
	            # Start if block
	            self._in_exclude_mode = not self._evaluate_cell(cell)

	        elif command == 'else':
	            # Start else block
	            self._in_exclude_mode = not self._in_exclude_mode

	        elif command == 'end':
	            # End block; the slice keeps an argument-less "end" from raising
	            if args.split()[:1] == ['if']:
	                # End if/else block
	                self._in_exclude_mode = False
	            else:
	                print("Unrecognized metacode end tag: " + m.group(0))

	        elif command == 'endif':
	            # Deprecated: end if/else block
	            warn_deprecated("'endif' is deprecated. Use 'end if'")
	            self._in_exclude_mode = False

	        elif command == 'section':
	            # Deprecated functionality kept for backwards compatibility
	            warn_deprecated("'section' tag is deprecated. Use 'if' and 'end if'")
	            if 'start' in args.lower():
	                # Start section block
	                self._in_exclude_mode = not self._evaluate_cell(cell)
	            elif 'end' in args.lower():
	                # End section block
	                self._in_exclude_mode = False
	        else:
	            # Unrecognized section tag
	            print("Unrecognized metacode tag: " + m.group(0))

	        # All cells with metacode tags are removed
	        return None, resources