jessstringham/ipynb_to_jekyll.py

## ipynb_to_jekyll.py
'''This is hacky code I use to convert Jupyter notebooks into Jekyll posts.

Notebooks' first line should be
    # Title

and `Title` will be used as the post's title.

I convert LaTeX into the form MathJax needs.

To make plots work, see `get_maybe_image_code` comment.
'''
import re
import os
import sys

import nbformat


# Notebook format version handed to `nbformat.read` when loading the notebook.
NB_VERSION = 4

# Jekyll front matter for every generated post; `{}` is filled with the title.
HEADER = '''---
title: '{}'
tags: [jupyter]
layout: post
mathjax: true
---

'''

# Base URL that the "also a Jupyter notebook" link at the top of a post points to.
GITHUB_PATH = 'https://github.com/jessstringham/blog/tree/master/notebooks'
# Folder that is checked for image files and that `images/` links are rewritten to.
ASSETS_PATH = 'assets'

# Liquid tags wrapped around the source of each code cell.
SOURCE_CODE_PREFIX = '\n\n{% highlight python %}\n'
SOURCE_CODE_SUFFIX = '\n{% endhighlight %}\n\n'


def header_from_path(path, title):
    '''Build the top of the post: front matter plus a link back to the notebook.

    Args:
        path: '/'-separated path of the notebook; only the last component is
            used, appended to GITHUB_PATH.
        title: post title, substituted into HEADER.

    Returns:
        The filled-in HEADER followed by a one-line markdown link.
    '''
    filename = path.split('/')[-1]

    # HEADER wraps the title in single quotes. Inside a single-quoted YAML
    # scalar a literal quote has to be doubled, otherwise a title such as
    # "Bayes' rule" ends the scalar early and the front matter is invalid.
    yaml_title = title.replace("'", "''")

    link = '[This post is also a Jupyter notebook!]({}/{})\n'.format(
        GITHUB_PATH,
        filename
    )

    return HEADER.format(yaml_title) + link

def replace_single_dollar_signs(text):
    r'''Rewrite inline `$...$` math so that it is emitted as `\\( ... \\)`.

    My Jekyll setup renders $$\LaTeX$$ okay, but needs $\LaTeX$ to be replaced
    with \\( \LaTeX \\). Display math delimited by `$$` is copied through
    unchanged. Hacky: a small character-by-character state machine instead of
    a regex.

    Args:
        text: markdown source to convert.

    Returns:
        The converted text. A lone `$` at the very end of the input is kept
        as-is instead of being dropped.
    '''
    pieces = []
    state = 'text'
    # Iterate over the argument itself (not a same-named global).
    for letter in text:
        if state == 'text':
            if letter == '$':
                # Hold the `$` back until the next character shows whether
                # this is inline (`$`) or display (`$$`) math.
                state = 'one_dollar'
            else:
                pieces.append(letter)

        elif state == 'one_dollar':
            if letter == '$':
                state = 'two_dollar'
                # The first `$` was suppressed, so emit both here.
                pieces.append('$$')
            else:
                state = 'in_state'
                pieces.append('\\\\( ')
                pieces.append(letter)

        elif state == 'in_state':
            if letter == '$':
                state = 'text'
                pieces.append(' \\\\)')
            else:
                pieces.append(letter)

        elif state == 'two_dollar':
            if letter == '$':
                state = 'exit_two_dollar'
            pieces.append(letter)

        elif state == 'exit_two_dollar':
            # This should be the second `$` of the closing `$$`.
            state = 'text'
            pieces.append(letter)

    if state == 'one_dollar':
        # The input ended right after a single `$`; put the held-back
        # character back so it is not lost.
        pieces.append('$')

    return ''.join(pieces)


def check_image_link(image_filename):
    '''Print a reminder when `image_filename` is not present under ASSETS_PATH.'''
    expected_location = os.path.join(ASSETS_PATH, image_filename)
    if os.path.exists(expected_location):
        return

    print('remember to move {} to {}'.format(image_filename, ASSETS_PATH))


def process_md(source):
    '''Convert the source of one markdown cell into markdown for the post.

    Steps, in order:
      1. report every `images/...` file that is missing from ASSETS_PATH,
      2. rewrite inline `$...$` LaTeX,
      3. point `images/` links at `/ASSETS_PATH/`,
      4. turn `(name.ipynb)` link targets into `({% post_url name %})`.

    Args:
        source: markdown text of the cell.

    Returns:
        The converted markdown text.
    '''
    # Check that image files shown in the notebook exist in my blog images
    # folder. The pattern is a raw string, so it carries no invalid escapes.
    for image in re.findall(r'''images/(.*?)[)"']''', source):
        check_image_link(image)

    # handle LaTeX
    source = replace_single_dollar_signs(source)

    # then update links to the blog images links
    source = source.replace('images/', '/{}/'.format(ASSETS_PATH))

    # update links to other notebooks in the folder to other posts
    return re.sub(
        r'\((.*?)\.ipynb\)',
        r'({% post_url \1 %})',
        source
    )


def get_maybe_image_code(source):
    '''Get images for the notebook output.

    This is a little hacky: In the notebook, I add a function `maybe_save_plot`:

        SAVE = True
        def maybe_save_plot(filename):
            if SAVE:
                plt.tight_layout()
                plt.savefig('images/' + filename, bbox_inches="tight")

    I call this before the `plt.show()`s. I run the notebook and it saves all the plots.
    When I generate the blog post, I append the image after codeblocks that contain
    `maybe_save_plot`.
    atm, my blog reads images from a separate assets folder, so I need to remember to manually
    move it over. That's what the print statement is for.

    Args:
        source: source text of one code cell.

    Returns:
        Markdown for the saved plot, or '' when no line of the cell starts
        with a `maybe_save_plot('...')` call. When several lines match, only
        the image of the last one is returned.
    '''
    maybe_image = ''

    for line in source.split('\n'):
        # Raw string, so `\(` and `\)` reach the regex engine without relying
        # on invalid string escapes. `re.match` anchors at the start of the
        # line, so only unindented calls are picked up.
        save_plot_match = re.match(r"maybe_save_plot\('(.*)'\)", line)
        if save_plot_match:
            filename = save_plot_match.group(1) + '.png'
            check_image_link(filename)

            # Same folder that check_image_link looks in.
            maybe_image = '![](/{}/{})'.format(ASSETS_PATH, filename)

    return maybe_image


def process_code(source):
    '''Wrap a code cell in highlight tags, followed by its saved plot (if any).'''
    pieces = [
        SOURCE_CODE_PREFIX,
        source,
        SOURCE_CODE_SUFFIX,
        get_maybe_image_code(source),
        '\n',
    ]
    return ''.join(pieces)


def extract_title(source):
    '''Split the first cell into the post title and the remaining text.

    The first line is expected to look like "# something"; its first two
    characters are dropped to get "something". Returns the title and
    everything after the first line break.
    '''
    first_line, _, remainder = source.partition('\n')
    title = first_line[len('# '):]
    return title, remainder


if __name__ == '__main__':
    # Usage: <script> NOTEBOOK_PATH OUTPUT_PATH
    path = sys.argv[1]
    output_path = sys.argv[2]

    # Read as UTF-8 explicitly so non-ASCII text does not depend on the
    # platform's default encoding.
    with open(path, encoding='utf-8') as f:
        nb = nbformat.read(f, NB_VERSION)

    cells = nb['cells']

    # Extract the title from the first cell and update the local representation of
    # the cell's source
    title, first_cell_source = extract_title(cells[0]['source'])
    cells[0]['source'] = first_cell_source

    # The header goes first, then one entry per non-empty cell.
    result_lines = [header_from_path(path, title)]
    for cell in cells:
        source = cell['source']
        if not source:
            continue

        cell_type = cell['cell_type']
        if cell_type == 'markdown':
            result_lines.append(process_md(source))
        elif cell_type == 'code':
            result_lines.append(process_code(source))
        else:
            print('I don\'t know how to process cells of type {}'.format(cell_type))

    # write the post!
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(result_lines))
	'''This is hacky code I use to convert Jupyter notebooks into Jekyll posts.

	Notebooks' first line should be
	# Title

	and `Title` will be used as the post's title.

	I convert LaTeX into the form MathJax needs.

	To make plots work, see `get_maybe_image_code` comment.
	'''
	import re
	import os
	import sys

	import nbformat


	# Notebook format version handed to `nbformat.read` when loading the notebook.
	NB_VERSION = 4

	# Jekyll front matter for every generated post; `{}` is filled with the title.
	# Written with explicit newlines so the indentation of this code can never
	# leak into the front matter (the `---` fences must start their lines).
	HEADER = (
	    "---\n"
	    "title: '{}'\n"
	    "tags: [jupyter]\n"
	    "layout: post\n"
	    "mathjax: true\n"
	    "---\n"
	    "\n"
	)

	# Base URL that the "also a Jupyter notebook" link at the top of a post points to.
	GITHUB_PATH = 'https://github.com/jessstringham/blog/tree/master/notebooks'
	# Folder that is checked for image files and that `images/` links are rewritten to.
	ASSETS_PATH = 'assets'

	# Liquid tags wrapped around the source of each code cell.
	SOURCE_CODE_PREFIX = '\n\n{% highlight python %}\n'
	SOURCE_CODE_SUFFIX = '\n{% endhighlight %}\n\n'


	def header_from_path(path, title):
	    '''Build the top of the post: front matter plus a link back to the notebook.

	    Args:
	        path: '/'-separated path of the notebook; only the last component is
	            used, appended to GITHUB_PATH.
	        title: post title, substituted into HEADER.

	    Returns:
	        The filled-in HEADER followed by a one-line markdown link.
	    '''
	    filename = path.split('/')[-1]

	    # HEADER wraps the title in single quotes. Inside a single-quoted YAML
	    # scalar a literal quote has to be doubled, otherwise a title such as
	    # "Bayes' rule" ends the scalar early and the front matter is invalid.
	    yaml_title = title.replace("'", "''")

	    link = '[This post is also a Jupyter notebook!]({}/{})\n'.format(
	        GITHUB_PATH,
	        filename
	    )

	    return HEADER.format(yaml_title) + link

	def replace_single_dollar_signs(text):
	    r'''Rewrite inline `$...$` math so that it is emitted as `\\( ... \\)`.

	    My Jekyll setup renders $$\LaTeX$$ okay, but needs $\LaTeX$ to be replaced
	    with \\( \LaTeX \\). Display math delimited by `$$` is copied through
	    unchanged. Hacky: a small character-by-character state machine instead of
	    a regex.

	    Args:
	        text: markdown source to convert.

	    Returns:
	        The converted text. A lone `$` at the very end of the input is kept
	        as-is instead of being dropped.
	    '''
	    pieces = []
	    state = 'text'
	    # Iterate over the argument itself (not a same-named global).
	    for letter in text:
	        if state == 'text':
	            if letter == '$':
	                # Hold the `$` back until the next character shows whether
	                # this is inline (`$`) or display (`$$`) math.
	                state = 'one_dollar'
	            else:
	                pieces.append(letter)

	        elif state == 'one_dollar':
	            if letter == '$':
	                state = 'two_dollar'
	                # The first `$` was suppressed, so emit both here.
	                pieces.append('$$')
	            else:
	                state = 'in_state'
	                pieces.append('\\\\( ')
	                pieces.append(letter)

	        elif state == 'in_state':
	            if letter == '$':
	                state = 'text'
	                pieces.append(' \\\\)')
	            else:
	                pieces.append(letter)

	        elif state == 'two_dollar':
	            if letter == '$':
	                state = 'exit_two_dollar'
	            pieces.append(letter)

	        elif state == 'exit_two_dollar':
	            # This should be the second `$` of the closing `$$`.
	            state = 'text'
	            pieces.append(letter)

	    if state == 'one_dollar':
	        # The input ended right after a single `$`; put the held-back
	        # character back so it is not lost.
	        pieces.append('$')

	    return ''.join(pieces)


	def check_image_link(image_filename):
	    '''Print a reminder when `image_filename` is not present under ASSETS_PATH.'''
	    if not os.path.exists(os.path.join(ASSETS_PATH, image_filename)):
	        print('remember to move {} to {}'.format(image_filename, ASSETS_PATH))


	def process_md(source):
	    '''Convert the source of one markdown cell into markdown for the post.

	    Steps, in order:
	      1. report every `images/...` file that is missing from ASSETS_PATH,
	      2. rewrite inline `$...$` LaTeX,
	      3. point `images/` links at `/ASSETS_PATH/`,
	      4. turn `(name.ipynb)` link targets into `({% post_url name %})`.

	    Args:
	        source: markdown text of the cell.

	    Returns:
	        The converted markdown text.
	    '''
	    # Check that image files shown in the notebook exist in my blog images
	    # folder. The pattern is a raw string, so it carries no invalid escapes.
	    for image in re.findall(r'''images/(.*?)[)"']''', source):
	        check_image_link(image)

	    # handle LaTeX
	    source = replace_single_dollar_signs(source)

	    # then update links to the blog images links
	    source = source.replace('images/', '/{}/'.format(ASSETS_PATH))

	    # update links to other notebooks in the folder to other posts
	    return re.sub(
	        r'\((.*?)\.ipynb\)',
	        r'({% post_url \1 %})',
	        source
	    )


	def get_maybe_image_code(source):
	    '''Get images for the notebook output.

	    This is a little hacky: In the notebook, I add a function `maybe_save_plot`:

	        SAVE = True
	        def maybe_save_plot(filename):
	            if SAVE:
	                plt.tight_layout()
	                plt.savefig('images/' + filename, bbox_inches="tight")

	    I call this before the `plt.show()`s. I run the notebook and it saves all the plots.
	    When I generate the blog post, I append the image after codeblocks that contain
	    `maybe_save_plot`.
	    atm, my blog reads images from a separate assets folder, so I need to remember to manually
	    move it over. That's what the print statement is for.

	    Args:
	        source: source text of one code cell.

	    Returns:
	        Markdown for the saved plot, or '' when no line of the cell starts
	        with a `maybe_save_plot('...')` call. When several lines match, only
	        the image of the last one is returned.
	    '''
	    maybe_image = ''

	    for line in source.split('\n'):
	        # Raw string, so `\(` and `\)` reach the regex engine without relying
	        # on invalid string escapes. `re.match` anchors at the start of the
	        # line, so only unindented calls are picked up.
	        save_plot_match = re.match(r"maybe_save_plot\('(.*)'\)", line)
	        if save_plot_match:
	            filename = save_plot_match.group(1) + '.png'
	            check_image_link(filename)

	            maybe_image = '![](/assets/{})'.format(filename)

	    return maybe_image


	def process_code(source):
	    '''Wrap a code cell in highlight tags, followed by its saved plot (if any).'''
	    maybe_image = get_maybe_image_code(source)

	    return SOURCE_CODE_PREFIX + source + SOURCE_CODE_SUFFIX + maybe_image + '\n'


	def extract_title(source):
	    '''Grab the title out of the first line, like "something" from "# something".

	    The first two characters of the first line are dropped to get the title.
	    Returns the title and the new first cell (everything after the first
	    line break).
	    '''
	    lines = source.split('\n')
	    return lines[0][len('# '):], '\n'.join(lines[1:])


	if __name__ == '__main__':
	    # Usage: <script> NOTEBOOK_PATH OUTPUT_PATH
	    path = sys.argv[1]
	    output_path = sys.argv[2]

	    # Read as UTF-8 explicitly so non-ASCII text does not depend on the
	    # platform's default encoding.
	    with open(path, encoding='utf-8') as f:
	        nb = nbformat.read(f, NB_VERSION)

	    cells = nb['cells']

	    # Extract the title from the first cell and update the local representation of
	    # the cell's source
	    title, first_cell_source = extract_title(cells[0]['source'])
	    cells[0]['source'] = first_cell_source

	    result_lines = []
	    for cell in cells:
	        source = cell['source']
	        if source:
	            if cell['cell_type'] == 'markdown':
	                result_lines.append(process_md(source))
	            elif cell['cell_type'] == 'code':
	                result_lines.append(process_code(source))
	            else:
	                print('I don\'t know how to process cells of type {}'.format(cell['cell_type']))

	    result_lines = [header_from_path(path, title)] + result_lines

	    # write the post!
	    with open(output_path, 'w', encoding='utf-8') as f:
	        f.write('\n'.join(result_lines))