saj1th/z2ipynb.py

## z2ipynb.py
# Based on https://github.com/rdblue/jupyter-zeppelin
import os, sys
import re
import csv
import json
import html
import nbformat
import codecs
from io import StringIO

# Lift the csv module's per-field size limit as far as the platform allows
# (presumably so that very large TABLE cells still parse).  sys.maxsize does
# not fit in a C long on every platform (e.g. 64-bit Windows) and raises
# OverflowError there, so fall back to the largest 32-bit value.
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    csv.field_size_limit(2 ** 31 - 1)

# Matchers for the interpreter "magic" that may open a Zeppelin paragraph.
# Each pattern requires one whitespace character after the magic name, and
# they are applied with .match(), i.e. only at the very start of the text.
MD = re.compile(r'%md\s')
SQL = re.compile(r'%sql\s')
# Any magic at all; only meaningful after the specific patterns were tried.
UNKNOWN_MAGIC = re.compile(r'%\w+\s')
HTML = re.compile(r'%html\s')
PYSPARK = re.compile(r'%pyspark\s')
SPARK = re.compile(r'%spark\s')

def read_io(path):
    """Reads the file at path into an in-memory text buffer.

    The file is decoded as UTF-8 instead of the locale's default encoding, so
    that notes with non-ASCII text load the same way on every platform (the
    notebook writer in this file also uses UTF-8).

    Returns a StringIO holding the whole file, positioned at the start.
    """
    with open(path, encoding='utf-8') as local:
        # A freshly constructed StringIO starts at offset 0, so no seek is
        # needed before handing it to a reader.
        return StringIO(local.read())

def table_cell_to_html(cell):
    """Formats a cell from a Zeppelin TABLE as HTML.

    A cell that starts with the %html magic is treated as markup already: the
    magic prefix is removed and the remainder is returned unescaped.  Any
    other cell is returned HTML-escaped.
    """
    magic = HTML.match(cell)
    if magic:
        # The magic is a display directive, not part of the markup; returning
        # it would put a literal "%html " into the generated table cell.
        return cell[magic.end():]
    return html.escape(cell)

def table_to_html(tsv):
    """Formats the tab-separated content of a Zeppelin TABLE as HTML.

    The first line supplies the column headers and every following line
    becomes one table row.  Header names are HTML-escaped; cell values are
    formatted by table_cell_to_html.  Empty input produces a table with an
    empty header row instead of raising.

    Returns the markup as one newline-joined string.
    """
    reader = csv.reader(StringIO(tsv), delimiter="\t")
    # An empty result has no header line; default to "no columns" rather than
    # letting StopIteration escape to the caller.
    fields = next(reader, [])
    # Headers come from the data just like cells do, so they are escaped too.
    column_headers = "".join("<th>" + html.escape(name) + "</th>" for name in fields)
    lines = ["<table>", "<tr>" + column_headers + "</tr>"]
    for row in reader:
        cells = "".join("<td>" + table_cell_to_html(cell) + "</td>" for cell in row)
        lines.append("<tr>" + cells + "</tr>")
    lines.append("</table>")
    return "\n".join(lines)


def convert_json(zeppelin_json):
    """Decodes a Zeppelin note from an open JSON stream and converts it.

    zeppelin_json is a readable file-like object; the decoded note is handed
    to convert_parsed and that function's result is returned unchanged.
    """
    parsed_note = json.load(zeppelin_json)
    return convert_parsed(parsed_note)

def _paragraph_cell(code, index):
    """Builds the Jupyter cell dict for one paragraph's left-stripped text."""
    if PYSPARK.match(code):
        # Remove the magic by position.  str.lstrip('%pyspark') strips a set
        # of characters rather than a prefix, and it left a leading space on
        # code that followed the magic on the same line.
        body = code[len('%pyspark'):].lstrip(' \t').lstrip("\n")
        return {
            'cell_type': 'code',
            'execution_count': index,
            'metadata': {'autoscroll': 'auto'},
            'outputs': [],
            'source': body,
        }
    if SPARK.match(code):
        # Rewrite only the leading magic; any later '%spark' in the paragraph
        # is code or data and must stay untouched.
        return {
            'cell_type': 'code',
            'execution_count': index,
            'metadata': {'autoscroll': 'auto'},
            'outputs': [],
            'source': '%scala' + code[len('%spark'):],
        }
    if MD.match(code):
        return {'cell_type': 'markdown', 'metadata': {}, 'source': code}
    if SQL.match(code) or HTML.match(code):
        return {
            'cell_type': 'code',
            'execution_count': index,
            'metadata': {},
            'outputs': [],
            'source': code,
        }
    if UNKNOWN_MAGIC.match(code):
        # use raw cells for unknown magic
        return {
            'cell_type': 'raw',
            'metadata': {'format': 'text/plain'},
            'source': "%md \n" + code,
        }
    # No magic at all: emitted as markdown behind a "%md" line.
    # NOTE(review): presumably the target front end renders %md-prefixed text
    # as markdown -- confirm.  The nbformat 4 schema does not allow
    # execution_count/outputs on markdown cells, so they are not set here.
    return {
        'cell_type': 'markdown',
        'metadata': {'autoscroll': 'auto'},
        'source': "%md \n" + code,
    }


def _paragraph_outputs(result, index):
    """Builds the outputs list of a code cell from a paragraph's result."""
    if not result or result.get('code') != 'SUCCESS':
        return []
    message = result.get('msg')
    if not message:
        return []

    result_type = result.get('type')
    if result_type == 'TEXT':
        data = {'text/plain': message}
    elif result_type == 'HTML':
        data = {'text/html': message}
    elif result_type == 'TABLE':
        data = {'text/html': table_to_html(message)}
    else:
        # Nothing displayable: skip the output instead of attaching an
        # execute_result with an empty data bundle.
        return []

    return [{
        'output_type': 'execute_result',
        'metadata': {},
        'execution_count': index,
        'data': data,
    }]


def convert_parsed(zeppelin_note):
    """Converts a Zeppelin note from parsed JSON to a Jupyter NotebookNode.

    zeppelin_note must provide 'name' and 'paragraphs'.  Paragraphs without
    text are skipped.  Each remaining paragraph becomes one cell whose type
    depends on its leading magic:

    * %pyspark       -> code cell with the magic removed
    * %spark         -> code cell with the magic rewritten to %scala
    * %md            -> markdown cell
    * %sql, %html    -> code cell, text unchanged
    * other magic    -> raw cell, text prefixed with a "%md" line
    * no magic       -> markdown cell, text prefixed with a "%md" line

    For code cells, a paragraph 'result' whose code is SUCCESS and whose type
    is TEXT, HTML or TABLE is attached as a single execute_result output.

    Returns a (notebook_name, notebook) tuple.  The notebook metadata always
    declares the hard-coded Spark/Scala kernel below.

    Raises KeyError if 'name' or 'paragraphs' is missing.
    """
    notebook_name = zeppelin_note['name']

    cells = []
    # Counts emitted cells only; doubles as the execution count.
    index = 0
    for paragraph in zeppelin_note['paragraphs']:
        code = paragraph.get('text')
        if not code:
            continue

        cell = _paragraph_cell(code.lstrip(), index)
        if cell['cell_type'] == 'code':
            cell['outputs'] = _paragraph_outputs(paragraph.get('result'), index)
        cells.append(cell)

        index += 1

    notebook = nbformat.from_dict({
        "metadata": {
            "kernelspec": {
                "display_name": "Spark 2.0.0 - Scala 2.11",
                "language": "scala",
                "name": "spark2-scala"
            },
            "language_info": {
                "codemirror_mode": "text/x-scala",
                "file_extension": ".scala",
                "mimetype": "text/x-scala",
                "name": "scala",
                "pygments_lexer": "scala",
                "version": "2.11.8"
            }
        },
        "nbformat": 4,
        "nbformat_minor": 2,
        "cells": cells,
    })

    return (notebook_name, notebook)

def write_notebook(notebook_name, notebook, path=None):
    """Writes a NotebookNode to a file created from the notebook name.

    If path is None, the output path is derived from the notebook name in the
    current directory: "<name>.ipynb", or, when that exists, the first unused
    "<name> (<i>).ipynb" for i from 1 to 1000.

    Returns the path that was written.

    Raises RuntimeError if path is None and every candidate name is taken.
    """
    filename = path
    if not filename:
        filename = notebook_name + '.ipynb'
        if os.path.exists(filename):
            for i in range(1, 1001):
                filename = notebook_name + ' (' + str(i) + ').ipynb'
                if not os.path.exists(filename):
                    break
            else:
                # Loop ran out of candidates: refuse instead of silently
                # overwriting the last numbered file.
                raise RuntimeError('Cannot write %s: versions 1-1000 already exist.' % (notebook_name,))

    with codecs.open(filename, 'w', encoding='UTF-8') as out:
        nbformat.write(notebook, out)

    return filename

if __name__ == '__main__':
    # Usage: z2ipynb.py <zeppelin-note.json> [<output.ipynb>]
    args = sys.argv[1:]

    zeppelin_note_path = args[0] if args else None
    # Arguments after the second one are ignored.
    target_path = args[1] if len(args) >= 2 else None

    if not zeppelin_note_path:
        # sys.exit is always available (the bare exit() helper is installed
        # by the site module); a string argument is printed to stderr and
        # gives a non-zero exit status.
        sys.exit('usage: %s <zeppelin-note.json> [<output.ipynb>]' % (sys.argv[0],))

    name, content = convert_json(read_io(zeppelin_note_path))
    write_notebook(name, content, target_path)
	# Based on https://github.com/rdblue/jupyter-zeppelin
	import os, sys
	import re
	import csv
	import json
	import html
	import nbformat
	import codecs
	from io import StringIO

	# Lift the csv module's per-field size limit as far as the platform allows
	# (presumably so that very large TABLE cells still parse).  sys.maxsize does
	# not fit in a C long on every platform (e.g. 64-bit Windows) and raises
	# OverflowError there, so fall back to the largest 32-bit value.
	try:
	    csv.field_size_limit(sys.maxsize)
	except OverflowError:
	    csv.field_size_limit(2 ** 31 - 1)

	# Matchers for the interpreter "magic" that may open a Zeppelin paragraph.
	# Each pattern requires one whitespace character after the magic name, and
	# they are applied with .match(), i.e. only at the very start of the text.
	MD = re.compile(r'%md\s')
	SQL = re.compile(r'%sql\s')
	# Any magic at all; only meaningful after the specific patterns were tried.
	UNKNOWN_MAGIC = re.compile(r'%\w+\s')
	HTML = re.compile(r'%html\s')
	PYSPARK = re.compile(r'%pyspark\s')
	SPARK = re.compile(r'%spark\s')

	def read_io(path):
	    """Reads the file at path into an in-memory text buffer.

	    The file is decoded as UTF-8 instead of the locale's default encoding, so
	    that notes with non-ASCII text load the same way on every platform (the
	    notebook writer in this file also uses UTF-8).

	    Returns a StringIO holding the whole file, positioned at the start.
	    """
	    with open(path, encoding='utf-8') as local:
	        # A freshly constructed StringIO starts at offset 0, so no seek is
	        # needed before handing it to a reader.
	        return StringIO(local.read())

	def table_cell_to_html(cell):
	    """Formats a cell from a Zeppelin TABLE as HTML.

	    A cell that starts with the %html magic is treated as markup already: the
	    magic prefix is removed and the remainder is returned unescaped.  Any
	    other cell is returned HTML-escaped.
	    """
	    magic = HTML.match(cell)
	    if magic:
	        # The magic is a display directive, not part of the markup; returning
	        # it would put a literal "%html " into the generated table cell.
	        return cell[magic.end():]
	    return html.escape(cell)

	def table_to_html(tsv):
	    """Formats the tab-separated content of a Zeppelin TABLE as HTML.

	    The first line supplies the column headers and every following line
	    becomes one table row.  Header names are HTML-escaped; cell values are
	    formatted by table_cell_to_html.  Empty input produces a table with an
	    empty header row instead of raising.

	    Returns the markup as one newline-joined string.
	    """
	    reader = csv.reader(StringIO(tsv), delimiter="\t")
	    # An empty result has no header line; default to "no columns" rather than
	    # letting StopIteration escape to the caller.
	    fields = next(reader, [])
	    # Headers come from the data just like cells do, so they are escaped too.
	    column_headers = "".join("<th>" + html.escape(name) + "</th>" for name in fields)
	    lines = ["<table>", "<tr>" + column_headers + "</tr>"]
	    for row in reader:
	        cells = "".join("<td>" + table_cell_to_html(cell) + "</td>" for cell in row)
	        lines.append("<tr>" + cells + "</tr>")
	    lines.append("</table>")
	    return "\n".join(lines)


	def convert_json(zeppelin_json):
	    """Converts a Zeppelin note from JSON to a Jupyter NotebookNode.

	    zeppelin_json is a readable file-like object; the decoded note is handed
	    to convert_parsed and that function's result is returned unchanged.
	    """
	    return convert_parsed(json.load(zeppelin_json))

	def _paragraph_cell(code, index):
	    """Builds the Jupyter cell dict for one paragraph's left-stripped text."""
	    if PYSPARK.match(code):
	        # Remove the magic by position.  str.lstrip('%pyspark') strips a set
	        # of characters rather than a prefix, and it left a leading space on
	        # code that followed the magic on the same line.
	        body = code[len('%pyspark'):].lstrip(' \t').lstrip("\n")
	        return {
	            'cell_type': 'code',
	            'execution_count': index,
	            'metadata': {'autoscroll': 'auto'},
	            'outputs': [],
	            'source': body,
	        }
	    if SPARK.match(code):
	        # Rewrite only the leading magic; any later '%spark' in the paragraph
	        # is code or data and must stay untouched.
	        return {
	            'cell_type': 'code',
	            'execution_count': index,
	            'metadata': {'autoscroll': 'auto'},
	            'outputs': [],
	            'source': '%scala' + code[len('%spark'):],
	        }
	    if MD.match(code):
	        return {'cell_type': 'markdown', 'metadata': {}, 'source': code}
	    if SQL.match(code) or HTML.match(code):
	        return {
	            'cell_type': 'code',
	            'execution_count': index,
	            'metadata': {},
	            'outputs': [],
	            'source': code,
	        }
	    if UNKNOWN_MAGIC.match(code):
	        # use raw cells for unknown magic
	        return {
	            'cell_type': 'raw',
	            'metadata': {'format': 'text/plain'},
	            'source': "%md \n" + code,
	        }
	    # No magic at all: emitted as markdown behind a "%md" line.
	    # NOTE(review): presumably the target front end renders %md-prefixed text
	    # as markdown -- confirm.  The nbformat 4 schema does not allow
	    # execution_count/outputs on markdown cells, so they are not set here.
	    return {
	        'cell_type': 'markdown',
	        'metadata': {'autoscroll': 'auto'},
	        'source': "%md \n" + code,
	    }


	def _paragraph_outputs(result, index):
	    """Builds the outputs list of a code cell from a paragraph's result."""
	    if not result or result.get('code') != 'SUCCESS':
	        return []
	    message = result.get('msg')
	    if not message:
	        return []

	    result_type = result.get('type')
	    if result_type == 'TEXT':
	        data = {'text/plain': message}
	    elif result_type == 'HTML':
	        data = {'text/html': message}
	    elif result_type == 'TABLE':
	        data = {'text/html': table_to_html(message)}
	    else:
	        # Nothing displayable: skip the output instead of attaching an
	        # execute_result with an empty data bundle.
	        return []

	    return [{
	        'output_type': 'execute_result',
	        'metadata': {},
	        'execution_count': index,
	        'data': data,
	    }]


	def convert_parsed(zeppelin_note):
	    """Converts a Zeppelin note from parsed JSON to a Jupyter NotebookNode.

	    zeppelin_note must provide 'name' and 'paragraphs'.  Paragraphs without
	    text are skipped.  Each remaining paragraph becomes one cell whose type
	    depends on its leading magic:

	    * %pyspark       -> code cell with the magic removed
	    * %spark         -> code cell with the magic rewritten to %scala
	    * %md            -> markdown cell
	    * %sql, %html    -> code cell, text unchanged
	    * other magic    -> raw cell, text prefixed with a "%md" line
	    * no magic       -> markdown cell, text prefixed with a "%md" line

	    For code cells, a paragraph 'result' whose code is SUCCESS and whose type
	    is TEXT, HTML or TABLE is attached as a single execute_result output.

	    Returns a (notebook_name, notebook) tuple.  The notebook metadata always
	    declares the hard-coded Spark/Scala kernel below.

	    Raises KeyError if 'name' or 'paragraphs' is missing.
	    """
	    notebook_name = zeppelin_note['name']

	    cells = []
	    # Counts emitted cells only; doubles as the execution count.
	    index = 0
	    for paragraph in zeppelin_note['paragraphs']:
	        code = paragraph.get('text')
	        if not code:
	            continue

	        cell = _paragraph_cell(code.lstrip(), index)
	        if cell['cell_type'] == 'code':
	            cell['outputs'] = _paragraph_outputs(paragraph.get('result'), index)
	        cells.append(cell)

	        index += 1

	    notebook = nbformat.from_dict({
	        "metadata": {
	            "kernelspec": {
	                "display_name": "Spark 2.0.0 - Scala 2.11",
	                "language": "scala",
	                "name": "spark2-scala"
	            },
	            "language_info": {
	                "codemirror_mode": "text/x-scala",
	                "file_extension": ".scala",
	                "mimetype": "text/x-scala",
	                "name": "scala",
	                "pygments_lexer": "scala",
	                "version": "2.11.8"
	            }
	        },
	        "nbformat": 4,
	        "nbformat_minor": 2,
	        "cells": cells,
	    })

	    return (notebook_name, notebook)

	def write_notebook(notebook_name, notebook, path=None):
	    """Writes a NotebookNode to a file created from the notebook name.

	    If path is None, the output path is derived from the notebook name in the
	    current directory: "<name>.ipynb", or, when that exists, the first unused
	    "<name> (<i>).ipynb" for i from 1 to 1000.

	    Returns the path that was written.

	    Raises RuntimeError if path is None and every candidate name is taken.
	    """
	    filename = path
	    if not filename:
	        filename = notebook_name + '.ipynb'
	        if os.path.exists(filename):
	            for i in range(1, 1001):
	                filename = notebook_name + ' (' + str(i) + ').ipynb'
	                if not os.path.exists(filename):
	                    break
	            else:
	                # Loop ran out of candidates: refuse instead of silently
	                # overwriting the last numbered file.
	                raise RuntimeError('Cannot write %s: versions 1-1000 already exist.' % (notebook_name,))

	    with codecs.open(filename, 'w', encoding='UTF-8') as out:
	        nbformat.write(notebook, out)

	    return filename

	if __name__ == '__main__':
	    # Usage: z2ipynb.py <zeppelin-note.json> [<output.ipynb>]
	    args = sys.argv[1:]

	    zeppelin_note_path = args[0] if args else None
	    # Arguments after the second one are ignored.
	    target_path = args[1] if len(args) >= 2 else None

	    if not zeppelin_note_path:
	        # sys.exit is always available (the bare exit() helper is installed
	        # by the site module); a string argument is printed to stderr and
	        # gives a non-zero exit status.
	        sys.exit('usage: %s <zeppelin-note.json> [<output.ipynb>]' % (sys.argv[0],))

	    name, content = convert_json(read_io(zeppelin_note_path))
	    write_notebook(name, content, target_path)