Created
June 22, 2015 15:31
-
-
Save sebastien/dc18ee5c5a73cac539bb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# encoding=utf8 --------------------------------------------------------------- | |
# Project : Enextractor
# ----------------------------------------------------------------------------- | |
# Author : Sébastien Pierre | |
# License : BSD License | |
# ----------------------------------------------------------------------------- | |
# Creation date : 2015-06-22 | |
# Last modification : 2015-06-22 | |
# ----------------------------------------------------------------------------- | |
VERSION = "0.0.0" | |
LICENSE = "http://ffctn.com/doc/licenses/bsd" | |
import os, sys, re, json, base64, logging, datetime, dateutil.parser, mimetypes | |
__doc__ = """ | |
A converter that parses Evernote's .enex data dump and outputs its content | |
as HTML, JSON, binaries and Markdown (if Pandoc is installed). | |
""" | |
# Matches runs of non-word characters and underscores; used to turn a note
# title into a file-name friendly key.
RE_KEY = re.compile(r"([^\w\d]|[_])+")
# Matches runs of whitespace.
RE_SPACES = re.compile(r"\s+")
# wwwclient is a third-party dependency; the `HTML` parser used to read the
# .enex files presumably comes from this star import. Without it nothing can
# be done, so we print an installation hint and exit.
try:
    from wwwclient import *
except ImportError:
    print("Enextractor requires wwwclient: type `pip install -U wwwclient` or `easy_install wwwclient`")
    sys.exit(-1)
def save(data, path):
    """Saves the given data to the given path, if the data is not null.

    Missing parent directories are created and text is encoded as UTF-8
    before being written (the file is opened in binary mode).

    Returns `path` when there was data to write (also when the write itself
    failed and was logged), and `None` when the data is empty.
    """
    if not data:
        return None
    parent = os.path.dirname(os.path.abspath(path))
    if not os.path.isdir(parent):
        # makedirs (rather than mkdir) so that nested output directories work
        os.makedirs(parent)
    with open(path, "wb") as f:
        logging.info("Writing: {0}".format(path))
        try:
            if not isinstance(data, (bytes, bytearray)):
                # Binary files only accept bytes, so text is encoded explicitly
                data = data.encode("utf-8")
            f.write(data)
        except Exception as e:
            # Best effort: a failed write is reported but does not stop the export
            logging.error("Cannot write: {0}".format(e))
    return path
def _element_text(element):
    """Returns the text of the given element, or `None` if there is no element."""
    # NOTE(review): assumes `first()` returns None when nothing matches -- confirm against wwwclient
    if element is None:
        return None
    return element.text()


def process_note(xml):
    """Extracts the data of one `note` element of an .enex file.

    Returns a dict with the following keys:

    - `title`: the text of the note's `title` element
    - `created`, `updated`: time tuples; `created` falls back to the current
      time and `updated` falls back to `created`
    - `attributes`: a list of `{name: text}` dicts, one per child of the
      `note-attributes` element (empty when there is no such element)
    - `content`: the note body wrapped in `<html><body>`
    - `resources`: a list of dicts with `data` (decoded bytes or `None`),
      `mime` (string or `None`) and `ext` (file extension with leading dot)

    Raises an `AssertionError` when a resource uses an encoding other than
    base64.
    """
    title = xml.first("title").text()
    created = _element_text(xml.first("created"))
    updated = _element_text(xml.first("updated"))
    note_attributes = xml.first("note-attributes")
    if note_attributes is not None:
        attributes = [{e.name: e.text()} for e in note_attributes.children]
    else:
        attributes = []
    if created:
        created = tuple(dateutil.parser.parse(created).timetuple())
    else:
        # No creation date in the note: the current time is used instead
        created = tuple(datetime.datetime.now().timetuple())
    updated = tuple(dateutil.parser.parse(updated).timetuple()) if updated else created
    # The content is stored as HTML CDATA
    content = xml.first("content").html().split("<![CDATA[", 1)[-1].rsplit("]]>", 1)[0]
    content = "<html><body>" + content + "</body></html>"
    rsrc = []
    for r in xml.query("resource"):
        data = r.first("data")
        mime = _element_text(r.first("mime"))
        mime = mime.strip() if mime else None
        payload = None
        if data is not None:
            # NOTE(review): a missing `encoding` attribute is treated as base64,
            # which is what the ENEX format is understood to use -- confirm
            enc = data.get("encoding") or "base64"
            assert enc == "base64", "Unsupported encoding: {0}".format(enc)
            payload = base64.b64decode(data.text())
        # We take the last guessed extension, and fall back to a generic one
        # when the MIME type is missing or unknown
        extensions = mimetypes.guess_all_extensions(mime) if mime else []
        ext = extensions[-1] if extensions else ".bin"
        rsrc.append(dict(
            data=payload,
            mime=mime,
            ext=ext,
        ))
    return dict(
        title=title,
        created=created,
        updated=updated,
        attributes=attributes,
        content=content,
        resources=rsrc,
    )
def write_note(note, path="notes", use_pandoc=True):
    """Writes the note in the given directory.

    The files share a `YYYY-MM-DD-<name>` prefix built from the note's
    updated (or created) date and its title:

    - `<prefix>.json`: the note, without the binary payload of its resources
    - `<prefix>.html`: the note content
    - `<prefix>.md`: Markdown conversion of the HTML, when `use_pandoc` is
      true and the `pandoc` executable can be run
    - `<prefix>-<index>.<ext>`: one file per resource that has data
    """
    # Imported locally as it is only needed here
    import subprocess
    name = RE_SPACES.sub("_", RE_KEY.sub("-", note["title"]).lower()).strip()
    logging.info("Writing note {0} in {1}".format(name, path))
    date = note["updated"] or note["created"]
    prefix = os.path.join(path, "{0:04d}-{1:02d}-{2:02d}-{3}".format(date[0], date[1], date[2], name))
    # Resource payloads are binary and saved to their own files below, so
    # they are kept out of the JSON document
    meta = dict(note)
    meta["resources"] = [
        dict((k, v) for k, v in r.items() if k != "data")
        for r in note["resources"]
    ]
    save(json.dumps(meta), "{0}.json".format(prefix))
    html_path = save(note["content"], "{0}.html".format(prefix))
    if use_pandoc and html_path:
        # Arguments are passed as a list so that paths with spaces or shell
        # metacharacters are handled safely
        command = ["pandoc", "-f", "html", "-t", "markdown", "-o", "{0}.md".format(prefix), html_path]
        try:
            status = subprocess.call(command)
        except OSError as e:
            logging.warning("Cannot run pandoc: {0}".format(e))
        else:
            if status != 0:
                logging.warning("pandoc exited with status {0} for {1}".format(status, html_path))
    for i, r in enumerate(note["resources"]):
        # Extensions guessed by mimetypes start with a dot, which the file
        # name pattern already provides
        ext = (r.get("ext") or "bin").lstrip(".")
        save(r["data"], "{0}-{1}.{2}".format(prefix, i, ext))
def process_enex(path):
    """Parses the .enex file at the given path and writes each of its notes.

    Raises an `AssertionError` when the path does not exist.
    """
    assert os.path.exists(path), "Given path does not exist: {0}".format(path)
    logging.info("Processing note at {0}".format(path))
    with open(path) as enex_file:
        source = enex_file.read()
    tree = HTML.tree(source, asXML=True)
    for note_node in tree.query("note"):
        parsed = process_note(note_node)
        write_note(parsed)
# ----------------------------------------------------------------------------- | |
# | |
# SECTION | |
# | |
# ----------------------------------------------------------------------------- | |
def run(args):
    """Processes each of the given .enex file paths, in order.

    Logs an error and does nothing else when no path is given.
    """
    if args:
        for enex_path in args:
            process_enex(enex_path)
        return
    logging.error("run(): At least one .enex file path required")
# ----------------------------------------------------------------------------- | |
# | |
# MAIN | |
# | |
# ----------------------------------------------------------------------------- | |
# Script entry point: logging goes to stderr at DEBUG level, and the
# command-line arguments are the .enex files to process.
if __name__ == "__main__":
    logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
    run(sys.argv[1:])
# EOF - vim: ts=4 sw=4 noet |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment