-
-
Save foxmask/7b29c43a161e001ff04afdb2f181e31c to your computer and use it in GitHub Desktop.
Parsing Evernote export file (.enex) using Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*.enex |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?> | |
<!DOCTYPE en-export SYSTEM "http://xml.evernote.com/pub/evernote-export2.dtd"> | |
<en-export export-date="20120727T073610Z" application="Evernote" version="Evernote Mac 3.0.5 (209942)"> | |
<note> | |
<title>Vim Tips</title> | |
<content> | |
<![CDATA[ | |
<?xml version="1.0" encoding="UTF-8" standalone="no"?> | |
<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"> | |
<en-note style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;"> | |
yank for copy, delete for cut, put for parse | |
<div><br/></div> | |
<div>Move in context, not position</div> | |
<div>/ search forward</div> | |
<div>? search backward</div> | |
<div>n repeat last search</div> | |
<div>N repeat last search but in the opposite direction</div> | |
<div>tx move to 'x'</div> | |
<div>fx find 'x'</div> | |
</en-note> | |
]]> | |
</content> | |
<created>20101229T161500Z</created> | |
<updated>20101231T161039Z</updated> | |
<note-attributes/> | |
</note> | |
</en-export> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from base64 import b64decode | |
import hashlib | |
from lxml import etree | |
from io import BytesIO | |
import os | |
from time import strptime | |
from pypandoc import convert_text | |
#http://www.hanxiaogang.com/writing/parsing-evernote-export-file-enex-using-python/ | |
# Stand-alone lxml parser configuration: whitespace-only text nodes are
# dropped and entity references are left unresolved.
# NOTE(review): nothing in the visible code references `p` -- confirm whether
# it is still needed before removing it.
p = etree.XMLParser(
    remove_blank_text=True,
    resolve_entities=False,
)
def parse_content(content):
    """Convert a note's HTML body to Org-mode markup.

    Args:
        content: The markup string taken from a ``<content>`` element, or
            ``None``/empty when the element carried no text.

    Returns:
        The Org-mode text produced by ``convert_text``, or ``''`` when there
        is nothing to convert.
    """
    # An element without text reports ``None``; that is not a string the
    # converter can work with, so short-circuit to an empty result.
    if not content:
        return ''
    return convert_text(content, 'org', format='html')
def parse_resource(resource):
    """Collect the children of a ``<resource>`` element into a dict.

    Every child is stored under its tag name.  The ``data`` child is
    base64-decoded to bytes (empty bytes when it has no text) and the MD5
    hex digest of those bytes is stored under ``'hash'``.  All other
    children are stored as their raw ``text`` value.

    Args:
        resource: An iterable of child elements exposing ``tag`` and ``text``.

    Returns:
        dict mapping tag names (plus ``'hash'`` when a ``data`` child exists)
        to the extracted values.
    """
    parsed = {}
    for child in resource:
        tag = child.tag
        if tag != 'data':
            parsed[tag] = child.text
            continue
        # The element text is sometimes None; treat that as an empty payload.
        payload = b64decode(child.text) if child.text else b''
        parsed[tag] = payload
        parsed['hash'] = hashlib.md5(payload).hexdigest()
    return parsed
def parse_note(note):
    """Turn a ``<note>`` element into a plain dict.

    Children are handled by tag:

    * ``content``  -- converted via ``parse_content`` and stored under
      ``'content'``; the untouched text is kept under ``'content-raw'``.
    * ``resource`` -- parsed via ``parse_resource`` and appended to the list
      stored under ``'resource'``.
    * ``created`` / ``updated`` -- parsed with the ``%Y%m%dT%H%M%SZ`` format
      into a ``time.struct_time``.
    * anything else -- stored as the element's raw ``text``.

    Args:
        note: An iterable of child elements exposing ``tag`` and ``text``.

    Returns:
        dict of the extracted values; ``'resource'`` is always present and
        is a (possibly empty) list.
    """
    parsed = {}
    attachments = []
    for child in note:
        tag = child.tag
        if tag == 'resource':
            attachments.append(parse_resource(child))
        elif tag in ('created', 'updated'):
            parsed[tag] = strptime(child.text, '%Y%m%dT%H%M%SZ')
        elif tag == 'content':
            parsed[tag] = parse_content(child.text)
            # Keep a copy of the original, unconverted content.
            parsed['content-raw'] = child.text
        else:
            parsed[tag] = child.text
    parsed['resource'] = attachments
    return parsed
def parseNoteXML(xmlFile):
    """Lazily yield one parsed note dict per ``<note>`` element of an export.

    Args:
        xmlFile: The export to read; handed unchanged to ``etree.iterparse``.

    Yields:
        The dict built by ``parse_note`` for each ``<note>`` element, in
        document order.
    """
    # huge_tree=True: without it the parser may complain about huge text nodes.
    # recover=True: the export may contain "&nbsp;", which would otherwise
    # cause "XMLSyntaxError: Entity 'nbsp' not defined".
    context = etree.iterparse(
        xmlFile,
        tag='note',
        encoding='utf-8',
        strip_cdata=False,
        huge_tree=True,
        recover=True,
    )
    for _event, elem in context:
        yield parse_note(elem)
        # parse_note has copied everything it needs into plain Python values,
        # so release the element and the already-processed siblings before
        # it; otherwise the whole tree accumulates in memory while iterating.
        elem.clear()
        parent = elem.getparent()
        if parent is not None:
            while elem.getprevious() is not None:
                del parent[0]
# Save notes and attachments | |
# in directories named according to date of creation | |
def export_note(note):
    """Write one parsed note and its attachments below ``en-export/``.

    Files are placed in ``en-export/<year>/<month>/<day>/`` according to the
    note's creation date:

    * ``<name>.org``  -- a ``#+TITLE:`` line followed by the converted content
    * ``<name>.bak``  -- the original, unconverted content
    * ``<hash>.data`` -- one binary file per attachment

    ``<name>`` is the title with ``/`` replaced by a space, cut to 20
    characters; ``untitled`` is used when the note has no title.

    Args:
        note: dict with ``'created'`` (``time.struct_time``), ``'title'``,
            ``'content'``, ``'content-raw'`` and ``'resource'`` (a list of
            dicts carrying ``'hash'`` and ``'data'``).  Title and both
            content values may be ``None``.

    Raises:
        KeyError: if ``'created'`` or ``'resource'`` is missing, or an
            attachment lacks ``'hash'``/``'data'``.
        OSError: if a directory or file cannot be created.
    """
    created = note['created']
    note_dir = os.path.join(
        'en-export',
        str(created.tm_year),
        '%02d' % created.tm_mon,
        '%02d' % created.tm_mday,
    )
    os.makedirs(note_dir, exist_ok=True)

    # Elements without text are reported as None; fall back to empty strings
    # so a note with a blank title or body is still exported.
    full_title = note.get('title') or ''
    converted = note.get('content') or ''
    original = note.get('content-raw') or ''

    # "/" would be taken as a path separator, so remove it from the file name.
    base_name = full_title.replace('/', ' ')[:20] or 'untitled'

    # Write UTF-8 explicitly: the platform default encoding may be unable to
    # represent characters found in the notes.
    text_file = os.path.join(note_dir, base_name + '.org')
    with open(text_file, 'w', encoding='utf-8') as fd:
        # The header carries the original, unmodified title.
        fd.write('#+TITLE: ' + full_title + '\n')
        fd.write(converted)

    bak_file = os.path.join(note_dir, base_name + '.bak')
    with open(bak_file, 'w', encoding='utf-8') as fd:
        fd.write(original)

    for resource in note['resource']:
        rsc_file = os.path.join(note_dir, resource['hash'] + '.data')
        with open(rsc_file, 'wb') as fd:
            fd.write(resource['data'])
if __name__ == '__main__':
    import sys

    # Read the export named by the first command-line argument; without one,
    # fall back to the original default file name.
    enex_path = sys.argv[1] if len(sys.argv) > 1 else 'mynote.enex'
    for note in parseNoteXML(enex_path):
        export_note(note)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[{'content': ['\nyank for copy, delete for cut, put for parse\n', | |
None, | |
None, | |
'Move in context, not position', | |
'/ search forward', | |
'? search backward', | |
'n repeat last search', | |
'N repeat last search but in the opposite direction', | |
"tx move to 'x'", | |
"fx find 'x'"], | |
'created': '20101229T161500Z', | |
'note': None, | |
'note-attributes': None, | |
'title': 'Vim Tips', | |
'updated': '20101231T161039Z'}] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment