Skip to content

Instantly share code, notes, and snippets.

@sgomezvillamor
Last active January 21, 2018 09:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sgomezvillamor/54b64f5c504d6e2fd4318eb30c05a6df to your computer and use it in GitHub Desktop.
Save sgomezvillamor/54b64f5c504d6e2fd4318eb30c05a6df to your computer and use it in GitHub Desktop.
Convert an HTML-exported notebook back into its original .ipynb file
# requires python3
from bs4 import BeautifulSoup
import json
import urllib.request
def _clean_code_source(raw_text):
    """Return *raw_text* with one blank leading and one blank trailing line dropped.

    Interior blank lines are kept. An empty or whitespace-only input yields
    an empty string instead of raising ``IndexError``.
    """
    lines = raw_text.splitlines()
    # Guard every index access: an empty input area produces no lines at all,
    # and removing the first line may leave the list empty.
    if lines and not lines[0].strip():
        lines = lines[1:]
    if lines and not lines[-1].strip():
        lines = lines[:-1]
    return "\n".join(lines)


def html2ipynb(sourceHtml, targetIpynb):
    """Rebuild a notebook JSON file from an HTML page and write it to disk.

    Every ``<div>`` in the page is inspected. For each of its CSS classes:

    * ``input_area`` produces a code cell whose source is the div's text,
      minus one blank leading/trailing line;
    * ``text_cell_render`` produces a markdown cell whose source is the
      div's inner HTML.

    Cell outputs are not recovered; code cells are written with an empty
    ``outputs`` list and no execution count.

    Args:
        sourceHtml: Path of the UTF-8 encoded HTML file to read.
        targetIpynb: Path of the ``.ipynb`` file to create or overwrite.
    """
    dictionary = {'nbformat': 4, 'nbformat_minor': 1, 'cells': [], 'metadata': {}}
    print("html2ipynb")
    print("Source (html) : '%s'" % sourceHtml)
    print("Target (ipynb) : '%s'" % targetIpynb)

    # The context manager closes the input even if reading or decoding fails.
    with open(sourceHtml, encoding='utf-8') as response:
        text = response.read()

    soup = BeautifulSoup(text, 'lxml')
    for d in soup.find_all("div"):
        # Divs without a class attribute simply contribute no iterations.
        for clas in d.attrs.get("class", []):
            if clas == "input_area":
                # Key order is kept stable so the serialized JSON is unchanged.
                cell = {
                    'metadata': {},
                    'outputs': [],
                    'source': _clean_code_source(d.get_text()),
                    'execution_count': None,
                    'cell_type': 'code',
                }
            elif clas == "text_cell_render":
                cell = {
                    'metadata': {},
                    'source': [d.decode_contents()],
                    'cell_type': 'markdown',
                }
            else:
                continue
            dictionary['cells'].append(cell)

    # Explicitly close (and therefore flush) the output before reporting success.
    with open(targetIpynb, 'w', encoding='utf-8') as target:
        target.write(json.dumps(dictionary))
    print("'%s' successfully written" % targetIpynb)
# html2ipynb for all *.html in the source dir
import os
# Batch driver: convert each exported notebook page found in sourceDir into
# an .ipynb file of the same base name inside targetDir.
sourceDir = os.path.join("__Referencias", "02 python", "Teoría")
targetDir = os.path.join("ML con Python")
for f in os.listdir(sourceDir):
    # Anything that is not an HTML export is ignored.
    if not f.endswith(".html"):
        continue
    # Strip the extension by length so the base name is kept verbatim.
    fname = f[:-len(".html")]
    sourceHtml = os.path.join(sourceDir, f)
    targetIpynb = os.path.join(targetDir, fname + ".ipynb")
    html2ipynb(sourceHtml, targetIpynb)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment