Skip to content

Instantly share code, notes, and snippets.

@aolle
Forked from dogboydog/convert.py
Created April 18, 2021 17:33
Show Gist options
  • Star 7 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save aolle/6e595650391deef79ffb1c9bb38fb6e9 to your computer and use it in GitHub Desktop.
Save aolle/6e595650391deef79ffb1c9bb38fb6e9 to your computer and use it in GitHub Desktop.
Convert Nimbus Notes HTML to Markdown for Joplin
# -------------------------------------------------------------------------
# Nimbus note HTML export to markdown converter
# Extract all zip files containing 'note.html' and convert to markdown
#
# Setup:
# 1) install python 3 for your OS
# 2) install pandoc https://github.com/jgm/pandoc/releases/tag/2.11.4
# on Windows, the .msi will automatically add pandoc to your $PATH
# otherwise add it to your $PATH.
# 3) save this script in the directory that contains your exported HTML
# zip files. Open a terminal / command prompt and cd to the directory
# where you saved convert.py.
# 4) Issue the command "python convert.py"
# (add the word "debug" afterward for extra output: python convert.py debug)
# 5) To import into Joplin, use File -> Import -> Markdown (Directory)
# and select the 'converted' directory that is created by this script
#
# Happy note-taking. -dogboydog
# -------------------------------------------------------------------------
import os
import pathlib
import re
import shutil
import subprocess
import sys
from os.path import abspath
from zipfile import ZipFile
# Conversion counters: updated by write_note() and printed in the final summary.
notes_written, notes_failed = 0, 0

# Platform path separator used when paths are assembled by hand.
sep = os.path.sep

# ANSI colors are switched off by a 'no-color' argument or a NO_COLOR
# environment variable.
color = 'no-color' not in sys.argv[1:] and 'NO_COLOR' not in os.environ

# Debug output is switched on by a 'debug' argument, or by a DEBUG
# environment variable set to anything other than "0".
debug_on = 'debug' in sys.argv[1:] or os.environ.get('DEBUG', "0") != "0"

# Set to False to keep html files from conversion.
clean = True
class _c:
    """ANSI escape sequences used to style terminal output.

    Every attribute is the empty string when ``color`` is false, so the
    codes can always be interpolated into messages.
    """
    if color:
        HEADER = '\033[95m'
        BLUE = '\u001b[34m'
        CYAN = '\033[96m'
        GREEN = '\033[92m'
        YELLOW = '\u001b[33m'
        RED = '\033[91m'
        ENDC = '\033[0m'
        BOLD = '\033[1m'
        UNDERLINE = '\033[4m'
    else:
        HEADER = BLUE = CYAN = GREEN = YELLOW = ''
        RED = ENDC = BOLD = UNDERLINE = ''
# File suffixes of the notes to convert and of the archives that contain them.
html_extension, zip_extension = ".html", ".zip"
def log_debug(message):
    """Print ``message`` in blue, but only when debug output is enabled."""
    if not debug_on:
        return
    print(f"{_c.BLUE}{message}{_c.ENDC}")
def remove_empty_dir(empty_dir):
    """Try to delete ``empty_dir``; silently do nothing if that fails.

    Uses ``os.removedirs``, which after removing the leaf directory also
    removes parent directories for as long as they are empty.
    """
    try:
        os.removedirs(empty_dir)
        log_debug(f"Deleted empty directory '{empty_dir}'")
    except OSError:
        # Best effort: a directory that is not empty (or is already gone)
        # is simply left alone.
        return
def remove_empty_dirs(path):
    """Recursively delete the empty directories found below ``path``."""
    # Walk bottom-up so the most deeply nested directories are tried first.
    for parent, child_names, _files in os.walk(path, topdown=False):
        for child_name in child_names:
            child_path = os.path.join(parent, child_name)
            remove_empty_dir(os.path.realpath(child_path))
def clean_up():
    """Delete leftover export assets under 'converted', then prune empty dirs.

    Does nothing when the module-level ``clean`` flag is false. Otherwise
    every file below 'converted' whose name ends with one of the font,
    stylesheet or icon suffixes listed below is deleted, and
    ``remove_empty_dirs`` is then run on 'converted'.
    """
    if not clean:
        return
    # A tuple lets a single str.endswith call test every suffix.
    clean_suffixes = (".woff2", ".css", ".woff",
                      ".ttf", "icomoon.svg", "icomoon.eot")
    for directory, _subdirs, filelist in os.walk('converted'):
        for f in filelist:
            if f.endswith(clean_suffixes):
                os.unlink(os.path.join(directory, f))
    remove_empty_dirs("converted")
def write_note(html_file, markdown_destination):
    """Convert one HTML note to Markdown with pandoc and save the result.

    Runs ``pandoc <html_file> --from html --to markdown_strict-raw_html``
    and writes pandoc's standard output to ``markdown_destination`` as
    UTF-8. Increments the module-level ``notes_written`` counter on success
    and ``notes_failed`` on failure.

    Args:
        html_file: Path of the HTML file to convert.
        markdown_destination: Path of the Markdown file to create.
    """
    global notes_written, notes_failed
    print(f"Writing markdown to {markdown_destination}")
    try:
        # The command is an argument list, so it must NOT be combined with
        # shell=True: on POSIX that would execute just "pandoc" and pass the
        # remaining items to the shell, leaving pandoc waiting on stdin
        # forever instead of converting the file.
        pandoc_run = subprocess.run(
            ["pandoc", html_file,
             "--from", "html", "--to", "markdown_strict-raw_html"],
            capture_output=True)
    except OSError as error:
        # Raised when the pandoc executable cannot be started at all,
        # e.g. it is not installed or not on PATH.
        print(f"Could not run pandoc: {error}")
        print(f"Failed to convert {html_file}")
        notes_failed += 1
        return

    if pandoc_run.returncode != 0:
        print(pandoc_run.stderr.decode(errors="replace"))
        print(f"Failed to convert {html_file}")
        notes_failed += 1
        return

    markdown_content = pandoc_run.stdout.decode()
    log_debug(markdown_content)
    with open(markdown_destination, "w", encoding="utf-8") as markdown_fp:
        markdown_fp.write(markdown_content)
    notes_written += 1
# Step 1: find every zip archive below the current directory, extract it
# under 'converted/<same relative directory>' and give each extracted HTML
# file the archive's name ('x.zip' -> 'x.html').
print("Searching for zip files containing HTML to convert...")
for directory, subdirlist, filelist in os.walk('.'):
    if directory == '.' and 'converted' in subdirlist:
        # Do not descend into this script's own output directory, so that a
        # re-run does not treat zip files extracted by an earlier run as
        # notes. Editing the list in place prunes the top-down walk.
        subdirlist.remove('converted')
    for f in filelist:
        if not f.endswith(zip_extension):
            continue
        print(f"Found zipped note: {f}")
        # Named 'archive' rather than 'zip' to avoid shadowing the builtin.
        with ZipFile(f"{directory}{sep}{f}", 'r') as archive:
            converted_dir = f"converted{sep}{directory}"
            pathlib.Path(converted_dir).resolve().mkdir(
                parents=True, exist_ok=True)
            archive.extractall(converted_dir)
            # archive.filename is the path the archive was opened with;
            # swap its '.zip' suffix for '.html'.
            note_new_filename = (
                archive.filename[:-len(zip_extension)] + ".html")
            for file_in_zip in archive.infolist():
                if file_in_zip.is_dir():
                    continue
                # Case-insensitive check for an '.html' suffix.
                if not file_in_zip.filename.lower().endswith(html_extension):
                    continue
                old_path = pathlib.Path(
                    f"{converted_dir}{sep}{file_in_zip.filename}").resolve()
                new_path = pathlib.Path(
                    f"converted{sep}{note_new_filename}")
                log_debug(f"Renaming {old_path} to {new_path}")
                # Plain strings: shutil.move only documents path-like
                # arguments from Python 3.9 onwards.
                shutil.move(str(old_path), str(new_path))
# Step 2: convert every HTML file below 'converted' to a Markdown file that
# sits next to it, then clean up and print a summary.
print("Will try to convert all HTML notes in the current directory to Markdown")
# Root of the output tree (module-level name).
converted_dir = "converted"
for directory, subdirlist, filelist in os.walk('converted'):
    for f in filelist:
        if not f.endswith(html_extension):
            continue
        # Strip only the trailing extension; str.replace would also remove
        # an '.html' that appears elsewhere in the file name.
        note_name = f[:-len(html_extension)].strip()
        html_note = f"{directory}{sep}{f}"
        print(f"Found HTML note: {html_note}")
        parent_dir = f"{directory}"
        parent_dir_pathlib = pathlib.Path(parent_dir).resolve()
        log_debug(f"mkdir {parent_dir_pathlib}")
        parent_dir_pathlib.mkdir(parents=True, exist_ok=True)
        markdown_destination = f"{parent_dir_pathlib}{sep}{ note_name }.md"
        # write_note reports failure only through the notes_failed counter,
        # so compare it before and after the call.
        failed_before = notes_failed
        write_note(html_note, markdown_destination)
        # Keep the HTML when conversion failed so the note is not lost.
        if clean and notes_failed == failed_before:
            os.unlink(html_note)
clean_up()
print(f"\n{_c.GREEN}Wrote {notes_written} notes.{_c.ENDC}")
if notes_failed > 0:
    print(f"{_c.RED}{notes_failed} notes failed to convert :( {_c.ENDC}")
@onmahadev
Copy link

After running the script and searching for ZIP archives, when trying to convert HTML to Markdown, it just stands there and does nothing. I don't understand what to do, alas... The debug does not display any errors or warnings.

Stopping on this stage:

Will try to convert all HTML notes in the current directory to Markdown
Found HTML note: converted/All Notes/r00t/Шри Гаудия Гити Гуччха.html
mkdir /Users/cc/Documents/export-2023-08-22_9-31-54/converted/All Notes/r00t
Writing markdown to /Users/cc/Documents/export-2023-08-22_9-31-54/converted/All Notes/r00t/Шри Гаудия Гити Гуччха.md

@wzbfyb
Copy link

wzbfyb commented Nov 30, 2023

same here. but the sole thing that it also renames the HTML files before it tries to convert them made it so much easier to work with the files afterwards, so thanks a lot! :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment