# dogboydog/convert.py

## convert.py
# -------------------------------------------------------------------------
#  Nimbus note HTML export to markdown converter
#  Extract all zip files containing 'note.html' and convert to markdown
#
# Setup:
#  1) install python 3 for your OS
#  2) install pandoc https://github.com/jgm/pandoc/releases/tag/2.11.4
#     on Windows, the .msi will automatically add pandoc to your $PATH
#     otherwise add it to your $PATH.
#  3) save this script in the directory where your HTML exports were
#     exported.  Open a terminal / command prompt and cd to the directory
#     where you saved convert.py.
#  4) Issue the command "python convert.py"
#     (add the word "debug" afterward for extra output: python convert.py debug)
#  5) To import into Joplin, use File -> Import -> Markdown (Directory)
#     and select the 'converted' directory that is created by this script
#
#  Happy note-taking.   -dogboydog
# -------------------------------------------------------------------------

import os
import pathlib
import re
import shutil
import subprocess
import sys
from os.path import abspath
from zipfile import ZipFile

# Running totals for the end-of-run summary; write_note() increments them.
notes_written, notes_failed = 0, 0

sep = os.path.sep

# ANSI colors stay on unless the 'no-color' argument is passed or the
# NO_COLOR environment variable is present (with any value).
color = 'no-color' not in sys.argv[1:] and 'NO_COLOR' not in os.environ

# Debug output is enabled by a DEBUG environment variable other than "0",
# or by passing the word 'debug' on the command line.
debug_on = os.environ.get('DEBUG', "0") != "0" or 'debug' in sys.argv[1:]

clean = True  # set to False to keep html files from conversion


class _c:
    """ANSI escape sequences for terminal styling.

    Every attribute is the empty string when the module-level ``color``
    flag is false, so callers can interpolate them unconditionally.
    """
    if color:
        HEADER = '\033[95m'
        BLUE = '\u001b[34m'
        CYAN = '\033[96m'
        GREEN = '\033[92m'
        YELLOW = '\u001b[33m'
        RED = '\033[91m'
        ENDC = '\033[0m'
        BOLD = '\033[1m'
        UNDERLINE = '\033[4m'
    else:
        HEADER = BLUE = CYAN = GREEN = YELLOW = ''
        RED = ENDC = BOLD = UNDERLINE = ''


# File suffixes the script looks for: extracted notes and zipped exports.
html_extension, zip_extension = ".html", ".zip"


def log_debug(message):
    """Print *message* in blue, but only when debug output is enabled."""
    if not debug_on:
        return
    print(f"{_c.BLUE}{message}{_c.ENDC}")


def remove_empty_dir(empty_dir):
    """Best-effort removal of *empty_dir* via ``os.removedirs``.

    Any ``OSError`` (for example the directory is not empty or does not
    exist) is deliberately ignored; a debug line is logged on success.
    """
    try:
        os.removedirs(empty_dir)
        deleted_message = f"Deleted empty directory '{empty_dir}'"
        log_debug(deleted_message)
    except OSError:
        # Not removable: leave the directory in place and carry on.
        return None

# recursively delete empty directories


def remove_empty_dirs(path):
    """Try to remove every directory found beneath *path*, deepest first.

    Each subdirectory is handed to ``remove_empty_dir`` as a real
    (symlink-resolved) path.
    """
    # topdown=False yields the most deeply nested directories first.
    for parent, child_dirs, _files in os.walk(path, topdown=False):
        for child in child_dirs:
            candidate = os.path.join(parent, child)
            remove_empty_dir(os.path.realpath(candidate))

def clean_up():
    """Delete leftover web assets from 'converted' and prune empty directories.

    Does nothing when the module-level ``clean`` flag is false. Otherwise
    every file under 'converted' whose name ends with one of the font/CSS
    suffixes below is unlinked, and ``remove_empty_dirs`` is then run on
    'converted'.
    """
    if not clean:
        return
    # str.endswith accepts a tuple, so a single call tests every suffix.
    clean_suffixes = (".woff2", ".css", ".woff",
                      ".ttf", "icomoon.svg", "icomoon.eot")
    for directory, _subdirs, filelist in os.walk('converted'):
        for f in filelist:
            if f.endswith(clean_suffixes):
                os.unlink(os.path.join(directory, f))
    remove_empty_dirs("converted")


def write_note(html_file, markdown_destination):
    """Convert one HTML note to Markdown with pandoc and save the result.

    Runs ``pandoc`` on *html_file* (HTML in, ``markdown_strict-raw_html``
    out). On success the output is written to *markdown_destination* as
    UTF-8 and the module-level ``notes_written`` counter is incremented.
    When pandoc exits with a non-zero status, its stderr and a failure line
    are printed, ``notes_failed`` is incremented and no file is written.

    Raises:
        OSError: if the pandoc executable cannot be started (for example it
            is not on the PATH) or the destination cannot be written.
    """
    global notes_written, notes_failed
    print(f"Writing markdown to {markdown_destination}")

    # check=False: a failing pandoc run must reach the returncode test below
    # and be counted, rather than raise CalledProcessError and abort the
    # whole batch.
    pandoc_run = subprocess.run(
        ["pandoc", html_file,
         "--from", "html", "--to", "markdown_strict-raw_html"],
        capture_output=True,
        check=False)

    if pandoc_run.returncode != 0:
        # errors="replace": an undecodable byte in the error text must not
        # mask the actual conversion failure.
        print(pandoc_run.stderr.decode(errors="replace"))
        print(f"Failed to convert {html_file}")
        notes_failed += 1
        return

    markdown_content = pandoc_run.stdout.decode()
    log_debug(markdown_content)
    with open(markdown_destination, "w", encoding="utf-8") as markdown_fp:
        markdown_fp.write(markdown_content)
    notes_written += 1


# Step 1: extract every zip archive found under the current directory into
# 'converted/<directory of the zip>' and rename each extracted HTML file
# after the archive it came from.
print(
    f"Searching for zip files containing HTML to convert...")
for directory, subdirlist, filelist in os.walk('.'):
    if directory == '.' and 'converted' in subdirlist:
        # Never descend into our own output directory: on a re-run any
        # archive found there would be extracted again into a nested
        # 'converted/converted' tree.
        subdirlist.remove('converted')
    for f in filelist:
        # Case-insensitive, matching how HTML members are detected below.
        if not f.lower().endswith(zip_extension):
            continue
        print(f"Found zipped note: {f}")
        with ZipFile(f"{directory}{sep}{f}", 'r') as archive:
            converted_dir = f"converted{sep}{directory}"
            pathlib.Path(converted_dir).resolve().mkdir(
                parents=True, exist_ok=True)
            archive.extractall(converted_dir)

            # archive.filename is the path the ZipFile was opened with, so
            # the note lands at converted/<same relative path>.html.
            note_new_filename = (
                archive.filename[:-len(zip_extension)] + ".html")
            for file_in_zip in archive.infolist():
                if file_in_zip.is_dir():
                    continue
                if not file_in_zip.filename.lower().endswith(html_extension):
                    continue

                old_path = pathlib.Path(
                    f"{converted_dir}{sep}{file_in_zip.filename}").resolve()
                new_path = pathlib.Path(
                    f"converted{sep}{note_new_filename}")
                log_debug(f"Renaming {old_path} to {new_path}")
                shutil.move(old_path, new_path)

# Step 2: convert every HTML file under 'converted' to a Markdown file of
# the same base name, placed next to it.
print(
    f"Will try to convert all HTML notes in the current directory to Markdown")
# Module-level name; assigned before the loop so it is defined even when
# 'converted' contains no files.
converted_dir = "converted"
for directory, subdirlist, filelist in os.walk('converted'):
    for f in filelist:
        if not f.endswith(html_extension):
            continue

        # Strip only the trailing extension: str.replace would also remove
        # any ".html" occurring in the middle of the file name.
        note_name = f[:-len(html_extension)].strip()
        html_note = f"{directory}{sep}{f}"
        print(f"Found HTML note: {html_note}")
        parent_dir = f"{directory}"
        parent_dir_pathlib = pathlib.Path(parent_dir).resolve()
        log_debug(f"mkdir {parent_dir_pathlib}")
        parent_dir_pathlib.mkdir(parents=True, exist_ok=True)
        markdown_destination = f"{parent_dir_pathlib}{sep}{note_name}.md"

        failed_before = notes_failed
        write_note(html_note, markdown_destination)

        # Delete the source HTML only if this note did not add to the
        # failure count, so a failed note can still be inspected or retried.
        if clean and notes_failed == failed_before:
            os.unlink(html_note)

# Step 3: remove leftover export assets and empty directories, then report.
clean_up()


print(f"\n{_c.GREEN}Wrote {notes_written} notes.{_c.ENDC}")
if notes_failed > 0:
    print(f"{_c.RED}{notes_failed} notes failed to convert :( {_c.ENDC}")