Skip to content

Instantly share code, notes, and snippets.

Last active September 5, 2023 20:24
Show Gist options
  • Save itsjfx/689ae620222240911a3efae33e313b1b to your computer and use it in GitHub Desktop.
Save itsjfx/689ae620222240911a3efae33e313b1b to your computer and use it in GitHub Desktop.
A modified version of the popular keep-to-enex script which allows you to choose a backup folder instead of passing each file (or *) as an argument
# originally created and posted by user dgc on
# Modified by user charlescanato
# Modified by gokhan mete erturk to enable bulk operation of html files without any parameters and
# solves the character set problems on Windows
# Modified by Leonard777 to add importing of image data.
# Modified by itsjfx to read a folder and import HTML files
# As of now, Google Takeout for Keep does NOT export:
# - correct order of lists notes (non-checked first, checked last)
# - list items indentation
import argparse
import sys
import re
import parsedatetime as pdt
import time
import glob
import hashlib
import base64
import os
# parsedatetime calendar instance, used to parse the human-readable
# timestamps Keep writes into its HTML exports (see mungefile()).
cal = pdt.Calendar()
r1 = re.compile('<li class="listitem checked"><span class="bullet">&#9745;</span>.*?<span class="text">(.*?)</span>.*?</li>')
r2 = re.compile('<li class="listitem"><span class="bullet">&#9744;</span>.*?<span class="text">(.*?)</span>.*?</li>')
r3 = re.compile('<span class="chip label"><span class="label-name">([^<]*)</span>[^<]*</span>')
# Use non-greedy expressions to support multiple image tags for each note
r4 = re.compile('<img alt="" src="data:(.*?);(.*?)\,(.*?)" />')
r5 = re.compile('<div class="content">(.*)</div>')
def readlineUntil(file, str):
    """Read lines from *file* until one containing *str* appears.

    Returns the matching line (newline included), or '' if EOF is hit
    first.  The EOF guard is new: file.readline() returns '' forever at
    end-of-file, so without it a missing marker spun in an infinite loop.
    """
    currLine = ""
    while str not in currLine:
        currLine = file.readline()
        if not currLine:  # EOF — marker never found
            return ""
    return currLine
def readTagsFromChips(line):
    """Return a chips line with a trailing newline, for appending to content.

    The '<div class="chips">' div carries the note's labels (among dynamic
    previews); mungefile() appends the returned text so the label regex can
    pick the tags up later.  For a non-chips line this now returns '' —
    the original fell through and returned None, which would have raised
    TypeError on the caller's `content +=`.
    """
    if line.startswith('<div class="chips">'):
        return line + '\n'
    return ''
def readImagesFromAttachment(line):
# Attachments need a name, so we will use the note title with a numeric suffix to make them unique.
# Suffix number for multiple attachments of the same name
attachmentNumber = 0
result = ()
m =
while m:
h = hashlib.md5(base64.b64decode("utf-8")))
# Import all images at 1024px wide. Not sure if we can determine original size from binary data or not.
newContent = '\n<div><en-media type="' + + '" width="1024" hash="' + h.hexdigest() + '" /></div>'
imageFormat ='/')[1]
newResource = '<resource><data encoding="' + + '">' + + '</data>\n<mime>' + + '</mime><resource-attributes><file-name>IMAGE_FILE_NAME_' + str(attachmentNumber) + '.' + imageFormat + '</file-name></resource-attributes></resource>\n'
result += (newContent, newResource)
attachmentNumber += 1
line = line[m.end():]
m =
return result
def mungefile(fn):
fp = open(fn, 'r', encoding="utf8")
title = readlineUntil( fp, "<title>" ).strip()
title = title.replace('<title>', '').replace('</title>', '')
readlineUntil( fp, "<body>" )
t = fp.readline()
tags = ''
resources = ''
if '"archived"' in t:
tags = '<tag>archived</tag>'
fp.readline() #</div> alone
date = fp.readline().strip().replace('</div>', '')
dt, flat = cal.parse(date)
iso = time.strftime('%Y%m%dT%H%M%SZ', time.gmtime(time.mktime(dt)))
fp.readline() # extra title
content = fp.readline()
m =
if m:
content =
content = content.replace( '<ul class="list">', '' )
for line in fp:
line = line.strip()
if line == '</div></body></html>':
# Chips contain the tags as well as dynamic content previews.. but we care mostly about the tags
elif line.startswith('<div class="chips">'):
content += readTagsFromChips(line)
# Attachments contains the image data
elif line.startswith('<div class="attachments">'):
result = readImagesFromAttachment(line)
i = 0
while i < len(result):
if i+1 < len(result):
content += result[i]
# Use the note title without spaces as the image file name
currentResource = result[i+1].replace("IMAGE_FILE_NAME", title.replace(' ', ''))
resources += currentResource
i += 2
content += line + '\n'
content = content.replace('<br>', '<br/>')
content = content.replace('\n', '\0')
while True:
m =
if not m:
content = content[:m.start()] + '<en-todo checked="true"/>' + + '<br/>' + content[m.end():]
while True:
m =
if not m:
content = content[:m.start()] + '<en-todo checked="false"/>' + + '<br/>' + content[m.end():]
content = content.replace('\0', '\n')
# remove list close (if it was a list)
lastUl = content.rfind('</ul>')
if lastUl != -1:
content = content[:lastUl] + content[lastUl+5:]
m =
if m:
content = content[:m.start()] + content[m.end():]
tags = '<tag>' + + '</tag>'
content = re.sub(r'class="[^"]*"', '', content)
print ('''
<content><![CDATA[<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE en-note SYSTEM ""><en-note style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;">{content}</en-note>]]></content>
'''.format(**locals()), file=fxt)
# ---- Command-line entry point -------------------------------------------
parser = argparse.ArgumentParser(
    description="Convert Google Keep notes from .html to .enex for Evernote")
parser.add_argument('-o', '--output',
                    help="The output file to write into. If not specified output goes to stdout.",
                    default="sys.stdout")
parser.add_argument('-f', '--folder',
                    help="The path to the folder which contains the HTML files",
                    default="./Keep")
args = parser.parse_args()

# "sys.stdout" is the sentinel default meaning "write to stdout".  The
# original was missing the else:, so it opened a file literally named
# "sys.stdout" even when stdout was requested.
if args.output == "sys.stdout":
    fxt = sys.stdout
else:
    fxt = open(args.output, "w", encoding="utf8")

# ENEX envelope.  The export-date is a fixed placeholder from the
# original script; Evernote does not require it to be current.
print('''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE en-export SYSTEM "">
<en-export export-date="20180502T065115Z" application="Evernote/Windows" version="6.x">''', file=fxt)

# sorted() makes the note order deterministic across platforms.
for f in sorted(os.listdir(args.folder)):
    if f.endswith(".html"):
        mungefile(os.path.join(args.folder, f))

print('''</en-export>''', file=fxt)

if fxt is not sys.stdout:
    fxt.close()  # flush and release the output file
Copy link

Made an account just to say thank you for your effort and it helped me a lot. I could not get the original version to work for me.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment