Last active
August 10, 2024 15:01
-
-
Save itsjfx/689ae620222240911a3efae33e313b1b to your computer and use it in GitHub Desktop.
A modified version of the popular keep-to-enex script which lets you point at a backup folder instead of passing each file (or `*`) as an argument
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# originally created and posted by user dgc on
# https://discussion.evernote.com/topic/97201-how-to-transfer-all-the-notes-from-google-keep-to-evernote/
# Modified by user charlescanato https://gitlab.com/charlescanato/google-keep-to-evernote-converter
# Modified by gokhan mete erturk to enable bulk operation of HTML files without any parameters and
#   to solve the character-set problems on Windows
# Modified by Leonard777 to add importing of image data.
# Modified by itsjfx to read a folder and import HTML files
# As of now, Google Takeout for Keep does NOT export:
# - correct order of list notes (non-checked first, checked last)
# - list item indentation
import argparse | |
import sys | |
import re | |
import parsedatetime as pdt | |
import time | |
import glob | |
import hashlib | |
import base64 | |
import os | |
cal = pdt.Calendar() | |
r1 = re.compile('<li class="listitem checked"><span class="bullet">☑</span>.*?<span class="text">(.*?)</span>.*?</li>') | |
r2 = re.compile('<li class="listitem"><span class="bullet">☐</span>.*?<span class="text">(.*?)</span>.*?</li>') | |
r3 = re.compile('<span class="chip label"><span class="label-name">([^<]*)</span>[^<]*</span>') | |
# Use non-greedy expressions to support multiple image tags for each note | |
r4 = re.compile('<img alt="" src="data:(.*?);(.*?)\,(.*?)" />') | |
r5 = re.compile('<div class="content">(.*)</div>') | |
def readlineUntil(file, str): | |
currLine = "" | |
while not str in currLine: | |
currLine = file.readline() | |
return currLine | |
def readTagsFromChips(line): | |
# line might still have chips | |
if line.startswith('<div class="chips">'): | |
return line + '\n' | |
def readImagesFromAttachment(line): | |
# Attachments need a name, so we will use the note title with a numeric suffix to make them unique. | |
# Suffix number for multiple attachments of the same name | |
attachmentNumber = 0 | |
result = () | |
m = r4.search(line) | |
while m: | |
h = hashlib.md5(base64.b64decode(m.group(3).encode("utf-8"))) | |
# Import all images at 1024px wide. Not sure if we can determine original size from binary data or not. | |
newContent = '\n<div><en-media type="' + m.group(1) + '" width="1024" hash="' + h.hexdigest() + '" /></div>' | |
imageFormat = m.group(1).split('/')[1] | |
newResource = '<resource><data encoding="' + m.group(2) + '">' + m.group(3) + '</data>\n<mime>' + m.group(1) + '</mime><resource-attributes><file-name>IMAGE_FILE_NAME_' + str(attachmentNumber) + '.' + imageFormat + '</file-name></resource-attributes></resource>\n' | |
result += (newContent, newResource) | |
attachmentNumber += 1 | |
line = line[m.end():] | |
m = r4.search(line) | |
return result | |
def mungefile(fn): | |
fp = open(fn, 'r', encoding="utf8") | |
title = readlineUntil( fp, "<title>" ).strip() | |
title = title.replace('<title>', '').replace('</title>', '') | |
readlineUntil( fp, "<body>" ) | |
t = fp.readline() | |
tags = '' | |
resources = '' | |
if '"archived"' in t: | |
tags = '<tag>archived</tag>' | |
fp.readline() #</div> alone | |
date = fp.readline().strip().replace('</div>', '') | |
dt, flat = cal.parse(date) | |
iso = time.strftime('%Y%m%dT%H%M%SZ', time.gmtime(time.mktime(dt))) | |
fp.readline() # extra title | |
content = fp.readline() | |
m = r5.search(content) | |
if m: | |
content = m.group(1) | |
content = content.replace( '<ul class="list">', '' ) | |
for line in fp: | |
line = line.strip() | |
if line == '</div></body></html>': | |
break | |
# Chips contain the tags as well as dynamic content previews.. but we care mostly about the tags | |
elif line.startswith('<div class="chips">'): | |
content += readTagsFromChips(line) | |
# Attachments contains the image data | |
elif line.startswith('<div class="attachments">'): | |
result = readImagesFromAttachment(line) | |
i = 0 | |
while i < len(result): | |
if i+1 < len(result): | |
content += result[i] | |
# Use the note title without spaces as the image file name | |
currentResource = result[i+1].replace("IMAGE_FILE_NAME", title.replace(' ', '')) | |
resources += currentResource | |
i += 2 | |
else: | |
content += line + '\n' | |
content = content.replace('<br>', '<br/>') | |
content = content.replace('\n', '\0') | |
while True: | |
m = r1.search(content) | |
if not m: | |
break | |
content = content[:m.start()] + '<en-todo checked="true"/>' + m.group(1) + '<br/>' + content[m.end():] | |
while True: | |
m = r2.search(content) | |
if not m: | |
break | |
content = content[:m.start()] + '<en-todo checked="false"/>' + m.group(1) + '<br/>' + content[m.end():] | |
content = content.replace('\0', '\n') | |
# remove list close (if it was a list) | |
lastUl = content.rfind('</ul>') | |
if lastUl != -1: | |
content = content[:lastUl] + content[lastUl+5:] | |
m = r3.search(content) | |
if m: | |
content = content[:m.start()] + content[m.end():] | |
tags = '<tag>' + m.group(1) + '</tag>' | |
content = re.sub(r'class="[^"]*"', '', content) | |
fp.close() | |
print (''' | |
<note> | |
<title>{title}</title> | |
<content><![CDATA[<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"><en-note style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;">{content}</en-note>]]></content> | |
<created>{iso}</created> | |
<updated>{iso}</updated> | |
{tags} | |
<note-attributes> | |
<latitude>0</latitude> | |
<longitude>0</longitude> | |
<source>google-keep</source> | |
<reminder-order>0</reminder-order> | |
</note-attributes> | |
{resources} | |
</note> | |
'''.format(**locals()), file=fxt) | |
parser = argparse.ArgumentParser(description="Convert Google Keep notes from .html to .enex for Evernote") | |
parser.add_argument('-o', '--output', help="The output file to write into. If not specified output goes to stdout.", default="sys.stdout") | |
parser.add_argument('-f', '--folder', help="The path to the folder which contains the HTML files", default="./Keep") | |
args = parser.parse_args() | |
if args.output == "sys.stdout": | |
fxt = sys.stdout | |
else: | |
fxt = open(args.output, "w", encoding="utf8") | |
print ('''<?xml version="1.0" encoding="UTF-8"?> | |
<!DOCTYPE en-export SYSTEM "http://xml.evernote.com/pub/evernote-export3.dtd"> | |
<en-export export-date="20180502T065115Z" application="Evernote/Windows" version="6.x">''', file=fxt) | |
for f in os.listdir(args.folder): | |
if f.endswith(".html"): | |
mungefile(os.path.join(args.folder, f)) | |
print ('''</en-export>''', file=fxt) | |
fxt.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Made an account just to say thank you for your effort and it helped me a lot. I could not get the original version to work for me.