sgomezvillamor/html2ipynb.py

## html2ipynb.py
# requires python3

from bs4 import BeautifulSoup
import json
import urllib.request

def html2ipynb(sourceHtml, targetIpynb):
    """Convert an HTML-rendered notebook into a minimal ``.ipynb`` file.

    Every ``<div>`` whose class list contains ``input_area`` becomes a code
    cell built from the div's text; every ``<div>`` whose class list contains
    ``text_cell_render`` becomes a markdown cell holding the markup returned
    by ``decode_contents()``. Cells are appended in the order the divs are
    found, and the resulting notebook dictionary is written as JSON.

    Args:
        sourceHtml: Path of the HTML file to read (decoded as UTF-8).
        targetIpynb: Path of the notebook file to write; an existing file
            is overwritten.
    """
    notebook = {'nbformat': 4, 'nbformat_minor': 1, 'cells': [], 'metadata': {}}

    print("html2ipynb")
    print("Source (html)  : '%s'" % sourceHtml)
    print("Target (ipynb) : '%s'" % targetIpynb)

    # The context manager closes the source file even if parsing raises.
    with open(sourceHtml, encoding='utf-8') as source_file:
        soup = BeautifulSoup(source_file.read(), 'lxml')

    for div in soup.find_all("div"):
        for css_class in div.get("class", []):
            if css_class == "input_area":
                lines = div.get_text().splitlines()
                # Drop at most one blank line at each end. The emptiness
                # checks keep a div with no text (or a single blank line)
                # from raising IndexError.
                if lines and not lines[0].strip():
                    lines = lines[1:]
                if lines and not lines[-1].strip():
                    lines = lines[:-1]
                notebook['cells'].append({
                    'metadata': {},
                    'outputs': [],
                    'source': "\n".join(lines),
                    'execution_count': None,
                    'cell_type': 'code',
                })
            elif css_class == "text_cell_render":
                notebook['cells'].append({
                    'metadata': {},
                    'source': [div.decode_contents()],
                    'cell_type': 'markdown',
                })

    # Close the output explicitly so the JSON is flushed before reporting
    # success, instead of relying on garbage collection of the handle.
    with open(targetIpynb, 'w', encoding='utf-8') as target_file:
        target_file.write(json.dumps(notebook))
    print("'%s' successfully written" % targetIpynb)

# Convert every *.html file in the source directory to a notebook in the target directory

import os

sourceDir = os.path.join("__Referencias", "02 python", "Teoría")
targetDir = os.path.join("ML con Python")

# Each HTML file in the source directory is converted to a notebook in the
# target directory with the same base name and an ".ipynb" extension.
for entry in os.listdir(sourceDir):
    if not entry.endswith(".html"):
        continue
    stem = entry[:-len(".html")]
    html2ipynb(
        os.path.join(sourceDir, entry),
        os.path.join(targetDir, stem + ".ipynb"),
    )
	# requires python3

	from bs4 import BeautifulSoup
	import json
	import urllib.request

	def html2ipynb(sourceHtml, targetIpynb):
	    """Convert an HTML-rendered notebook into a minimal ``.ipynb`` file.

	    Every ``<div>`` whose class list contains ``input_area`` becomes a code
	    cell built from the div's text; every ``<div>`` whose class list
	    contains ``text_cell_render`` becomes a markdown cell holding the
	    markup returned by ``decode_contents()``. Cells are appended in the
	    order the divs are found, and the notebook is written as JSON.

	    Args:
	        sourceHtml: Path of the HTML file to read (decoded as UTF-8).
	        targetIpynb: Path of the notebook file to write; an existing file
	            is overwritten.
	    """
	    notebook = {'nbformat': 4, 'nbformat_minor': 1, 'cells': [], 'metadata': {}}

	    print("html2ipynb")
	    print("Source (html) : '%s'" % sourceHtml)
	    print("Target (ipynb) : '%s'" % targetIpynb)

	    # The context manager closes the source file even if parsing raises.
	    with open(sourceHtml, encoding='utf-8') as source_file:
	        soup = BeautifulSoup(source_file.read(), 'lxml')

	    for div in soup.find_all("div"):
	        for css_class in div.get("class", []):
	            if css_class == "input_area":
	                lines = div.get_text().splitlines()
	                # Drop at most one blank line at each end. The emptiness
	                # checks keep a div with no text (or a single blank
	                # line) from raising IndexError.
	                if lines and not lines[0].strip():
	                    lines = lines[1:]
	                if lines and not lines[-1].strip():
	                    lines = lines[:-1]
	                notebook['cells'].append({
	                    'metadata': {},
	                    'outputs': [],
	                    'source': "\n".join(lines),
	                    'execution_count': None,
	                    'cell_type': 'code',
	                })
	            elif css_class == "text_cell_render":
	                notebook['cells'].append({
	                    'metadata': {},
	                    'source': [div.decode_contents()],
	                    'cell_type': 'markdown',
	                })

	    # Close the output explicitly so the JSON is flushed before reporting
	    # success, instead of relying on garbage collection of the handle.
	    with open(targetIpynb, 'w', encoding='utf-8') as target_file:
	        target_file.write(json.dumps(notebook))
	    print("'%s' successfully written" % targetIpynb)

	# Convert every *.html file in the source directory to a notebook in the target directory

	import os

	sourceDir = os.path.join("__Referencias", "02 python", "Teoría")
	targetDir = os.path.join("ML con Python")

	# Each HTML file in the source directory is converted to a notebook in the
	# target directory with the same base name and an ".ipynb" extension.
	for f in os.listdir(sourceDir):
	    if f.endswith(".html"):
	        fname = f[0:-5]  # strip the 5-character ".html" suffix
	        sourceHtml = os.path.join(sourceDir, f)
	        targetIpynb = os.path.join(targetDir, fname + ".ipynb")
	        html2ipynb(sourceHtml, targetIpynb)