Skip to content

Instantly share code, notes, and snippets.

@sebastien
Created June 22, 2015 15:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sebastien/ba5e57ac1e9b77e30145 to your computer and use it in GitHub Desktop.
Save sebastien/ba5e57ac1e9b77e30145 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# encoding=utf8 ---------------------------------------------------------------
# Project : Enextractor
# -----------------------------------------------------------------------------
# Author : Sébastien Pierre
# License : BSD License
# -----------------------------------------------------------------------------
# Creation date : 2015-06-22
# Last modification : 2015-06-22
# -----------------------------------------------------------------------------
# Version string of this script.
VERSION = "0.0.0"
# URL of the license text (the file header declares a BSD license).
LICENSE = "http://ffctn.com/doc/licenses/bsd"
import os, sys, re, json, base64, logging, datetime, dateutil.parser, mimetypes
# Module description. It is assigned to `__doc__` explicitly because this
# string is not the first statement of the module, so it would not be picked
# up as the module docstring automatically.
__doc__ = """
A converter that parses Evernote's .enex data dump and outputs its content
as HTML, JSON, binaries and Markdown (if Pandoc is installed).
"""
# Matches runs of characters that are neither letters nor digits (underscores
# are matched too). Raw strings keep the regex escapes from being read as
# (invalid) string escapes.
RE_KEY = re.compile(r"([^\w\d]|[_])+")
# Matches runs of whitespace.
RE_SPACES = re.compile(r"\s+")
# The rest of the module relies on names brought in by this wildcard import
# (presumably the `HTML` parser used to read the .enex file -- it is not
# defined anywhere else in this file).
try:
    from wwwclient import *
except ImportError:
    # Nothing can be parsed without wwwclient: tell the user how to install it
    # and stop with a non-zero exit status.
    print ("Enextractor requires wwwclient: type `pip install -U wwwclient` or `easy_install wwwclient`")
    sys.exit(-1)
def save( data, path ):
    """Saves the given data to the given path, if the data is not null.

    Missing parent directories of `path` are created (any number of levels).
    Text data is encoded as UTF-8 before being written, since the file is
    opened in binary mode; `bytes`/`bytearray` data is written as-is.

    Returns `path` when there was data to write (a failure while writing is
    logged, not raised), or `None` when `data` is empty or `None`.
    """
    if not data:
        return None
    parent = os.path.dirname(os.path.abspath(path))
    if not os.path.isdir(parent):
        # makedirs (rather than mkdir) so that nested output paths work.
        os.makedirs(parent)
    if not isinstance(data, (bytes, bytearray)):
        # The file is opened in binary mode, text has to be encoded first.
        data = data.encode("utf-8")
    with open(path, "wb") as f:
        logging.info("Writing: {0}".format(path))
        try:
            f.write(data)
        except (IOError, OSError) as e:
            logging.error("Cannot write: {0}".format(e))
    return path
def process_note( xml ):
    """Extracts the data of a single note node into a plain dictionary.

    `xml` is a note node of the parsed .enex tree; it is only accessed through
    `first()`, `query()`, `text()`, `html()`, `get()`, `name` and `children`.

    Returns a dict with the following keys:

    - `title`: the text of the `title` element
    - `created` / `updated`: time tuples; `created` falls back to the current
      time when missing, `updated` falls back to `created`
    - `attributes`: a list of one-entry `{name: text}` dicts, one per child of
      the `note-attributes` element
    - `content`: the note's HTML wrapped in `<html><body>…</body></html>`
    - `resources`: a list of `{data, mime, ext}` dicts, where `data` is the
      decoded binary payload and `ext` the file extension (with leading dot)
      guessed from the mime type

    Raises an `AssertionError` when a resource uses an encoding other than
    base64.
    """
    title = xml.first("title").text()
    created = xml.first("created").text()
    updated = xml.first("updated").text()
    # NOTE(review): assumes first() returns None when nothing matches -- confirm
    # against wwwclient.
    attr_node = xml.first("note-attributes")
    if attr_node is not None:
        attributes = [{e.name:e.text()} for e in attr_node.children]
    else:
        attributes = []
    if created:
        created = tuple(dateutil.parser.parse(created).timetuple())
    else:
        # No creation date in the note: use the current time. It must not go
        # through the date parser, which only accepts strings.
        created = tuple(datetime.datetime.now().timetuple())
    updated = tuple(dateutil.parser.parse(updated).timetuple()) if updated else created
    # The content is stored as HTML CDATA
    content = xml.first("content").html().split("<![CDATA[",1)[-1].rsplit("]]>",1)[0]
    content = "<html><body>" + content + "</body></html>"
    rsrc = []
    for r in xml.query("resource"):
        data_node = r.first("data")
        mime_node = r.first("mime")
        if data_node is None or not data_node.get("encoding"):
            # Without an encoded payload there is nothing that can be written.
            logging.warning("Skipping resource without encoded data in note: {0}".format(title))
            continue
        enc = data_node.get("encoding")
        assert enc == "base64", "Unsupported encoding: {0}".format(enc)
        # Decode the text of the element (not the element itself).
        data = base64.b64decode(data_node.text() or "")
        mime = None
        if mime_node is not None:
            mime = (mime_node.text() or "").strip() or None
        # We take the last extension (it's the most complete), and fall back to
        # a generic one when the mime type is missing or unknown.
        candidates = mimetypes.guess_all_extensions(mime) if mime else []
        ext = candidates[-1] if candidates else ".bin"
        rsrc.append(dict(
            data = data,
            mime = mime,
            ext  = ext,
        ))
    return dict(
        title      = title,
        created    = created,
        updated    = updated,
        attributes = attributes,
        content    = content,
        resources  = rsrc,
    )
def write_note( note, path="notes", use_pandoc=True ):
    """Writes the note in the given directory.

    `note` is a dictionary with the `title`, `created`, `updated`, `content`
    and `resources` keys. All files share the prefix
    `<path>/<YYYY>-<MM>-<DD>-<name>`, where the date is the update date (or
    the creation date) and the name is derived from the lower-cased title:

    - `<prefix>.json`: the note as JSON, without the binary resource payloads
    - `<prefix>.html`: the note content
    - `<prefix>.md`: the content converted by `pandoc`, when `use_pandoc` is
      true and the HTML file was written
    - `<prefix>-<index>.<ext>`: one file per resource
    """
    import subprocess
    name   = RE_SPACES.sub("_", RE_KEY.sub("-", note["title"] or "untitled").lower()).strip()
    logging.info("Writing note {0} in {1}".format(name, path))
    date   = note["updated"] or note["created"]
    prefix = os.path.join(path, "{0:04d}-{1:02d}-{2:02d}-{3}".format(date[0], date[1], date[2], name))
    # The binary payloads cannot be serialized as JSON (and are written to
    # their own files below), so they are left out of the metadata.
    meta = dict(note)
    meta["resources"] = [
        dict((k, v) for k, v in r.items() if k != "data") for r in note["resources"]
    ]
    save(json.dumps(meta), "{0}.json".format(prefix))
    html_path = save(note["content"], "{0}.html".format(prefix))
    if use_pandoc and html_path:
        # An argument list (no shell) keeps paths with spaces or shell
        # metacharacters from being misinterpreted.
        try:
            subprocess.call(["pandoc", "-f", "html", "-t", "markdown", "-o", "{0}.md".format(prefix), html_path])
        except OSError as e:
            logging.error("Cannot run pandoc: {0}".format(e))
    for i,r in enumerate(note["resources"]):
        # Extensions guessed by `mimetypes` start with a dot, which the format
        # string already provides.
        ext = (r["ext"] or "bin").lstrip(".")
        save(r["data"], "{0}-{1}.{2}".format(prefix, i, ext))
def process_enex( path ):
    """Reads the .enex file at `path`, then processes and writes each of the
    notes it contains (using the default output options of `write_note`).

    Fails with an `AssertionError` when `path` does not exist."""
    assert os.path.exists(path), "Given path does not exist: {0}".format(path)
    logging.info("Processing note at {0}".format(path))
    with open(path) as source:
        text = source.read()
    tree = HTML.tree(text, asXML=True)
    for node in tree.query("note"):
        parsed = process_note(node)
        write_note(parsed)
# -----------------------------------------------------------------------------
#
# SECTION
#
# -----------------------------------------------------------------------------
def run( args ):
    """Processes every .enex file path listed in `args`, in order. When `args`
    is empty, an error is logged and nothing else happens."""
    if args:
        for enex_path in args:
            process_enex(enex_path)
        return
    logging.error("run(): At least one .enex file path required")
# -----------------------------------------------------------------------------
#
# MAIN
#
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    # Command-line entry point: log everything (DEBUG and above) to stderr,
    # then process the paths given as arguments.
    enex_paths = sys.argv[1:]
    logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
    run(enex_paths)
# EOF - vim: ts=4 sw=4 noet
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment