Skip to content

Instantly share code, notes, and snippets.

@dempe
Last active October 20, 2023 21:10
Show Gist options
  • Save dempe/f26536e2d04c2e8e815e0abf7d8d2d69 to your computer and use it in GitHub Desktop.
Save dempe/f26536e2d04c2e8e815e0abf7d8d2d69 to your computer and use it in GitHub Desktop.
Convert Kindle HTML notes to Markdown
from bs4 import BeautifulSoup
import argparse
def clean_text(section) -> str:
    """Return the text of ``section`` flattened onto a single line.

    ``section`` is any object exposing ``get_text()`` (here, a BeautifulSoup
    element). Leading whitespace is trimmed, then every newline and tab
    character is deleted. Trailing spaces are left untouched.
    """
    # One pass that deletes both control characters at once.
    drop_breaks = str.maketrans('', '', '\n\t')
    raw = section.get_text()
    return raw.lstrip().translate(drop_breaks)
def parse_chapter(text):
    """Extract the chapter title and page number from a note heading.

    ``text`` is the plain text of a heading (markup already removed), in one
    of these shapes:

    * ``Highlight(orange) - Page xxvii · Location 381``   -> ``(None, 'xxvii')``
    * ``Highlight(orange) - I > Location 38``             -> ``('I', None)``
    * ``Highlight(orange) - I.1 Enter ... > Page 7 · Location 778``
                                                          -> ``('I.1 Enter ...', '7')``
    * ``Highlight(orange) - Location 12``                 -> ``(None, None)``

    Any highlight colour is accepted, not only orange.

    Returns:
        A ``(chapter, page)`` tuple of strings; either element is ``None``
        when the heading does not carry that piece of information.
    """
    # Drop the "<kind>(<colour>) - " prefix generically instead of matching
    # the literal 'Highlight(orange) -', which raised IndexError for every
    # other highlight colour.
    _, dash, remainder = text.partition(' - ')
    body = remainder if dash else text

    if '>' in body:
        # The chapter title is everything before the first '>'.
        chapter = body.split('>')[0].strip()
        # The page/location information follows the last '>'.
        location = body.rsplit('>', 1)[1]
    else:
        chapter = None
        location = body

    # Only look for the page in the location part, so that a chapter title
    # containing the word "Page" is not mistaken for a page number.
    page = None
    if 'Page' in location:
        page = location.split('Page', 1)[1].split('·')[0].strip()

    return chapter, page
def remove_brackets(text) -> str:
    """Strip every '[' and ']' from ``text``.

    Square brackets are Wikilink syntax and would clash with Obsidian.
    """
    return ''.join(char for char in text if char not in ('[', ']'))
def convert_notes(inputf) -> str:
    """Convert an exported HTML notes file into Markdown.

    The document is walked sibling by sibling, starting at the first
    ``<div class="sectionHeading">``, because headings and note bodies are
    flat siblings rather than nested elements.

    Output layout:
        * ``## Reflection`` / ``## Notes`` headers,
        * ``### <section>`` for each ``sectionHeading``,
        * ``#### <chapter>`` whenever the chapter reported by a heading changes,
        * ``+ ...`` list items for highlights (prefixed with the page if known),
        * tab-indented ``+ **Note**: ...`` items for notes attached to a highlight.

    Args:
        inputf: Path of the HTML file to convert.

    Returns:
        The Markdown text with all square brackets removed.

    Raises:
        ValueError: If the file contains no ``sectionHeading`` element.
    """
    # Decode explicitly as UTF-8 (presumed encoding of the export - confirm):
    # headings contain '·', which the platform default encoding may mangle.
    with open(inputf, 'r', encoding='utf-8') as htmlf:
        soup = BeautifulSoup(htmlf.read(), 'html.parser')

    # Find first section heading
    start_node = soup.find('div', {'class': 'sectionHeading'})
    if start_node is None:
        raise ValueError(
            f"No element with class 'sectionHeading' found in {inputf!r}")

    # Collect the pieces and join once at the end.
    parts = [
        "## Reflection\n",
        "## Notes\n",
        f"### {clean_text(start_node)}\n",
    ]

    # Notes must be parsed sequentially, because the export does not nest
    # note bodies under their headings.
    is_note = False
    chapter = ''
    page = None
    current = start_node
    while current.next_sibling:
        current = current.next_sibling
        # Skip text nodes and elements without a class attribute.
        if not current.name or not current.attrs or 'class' not in current.attrs:
            continue
        classes = current['class']
        if not classes:
            continue
        class_name = classes[0]

        if class_name == 'noteText':
            body = clean_text(current)
            if is_note:
                # Indent notes to make them a child of the preceding highlight.
                parts.append(f"\t+ **Note**: {body}\n")
            elif page:
                parts.append(f"+ *Page {page}*: {body}\n")
            else:
                # Not all books have page numbers. Some only use "Location".
                parts.append(f"+ {body}\n")
        elif class_name == 'noteHeading':
            # Notes added to highlights begin with "Note"; remembering that
            # lets the following noteText be rendered as a sublist item.
            heading = clean_text(current)
            is_note = heading.startswith('Note - ')
            if not is_note:
                new_chapter, page = parse_chapter(heading)
                if new_chapter is not None and new_chapter != chapter:
                    parts.append(f"#### {new_chapter}\n")
                chapter = new_chapter
        elif class_name == 'sectionHeading':
            parts.append(f"### {clean_text(current)}\n")

    return remove_brackets(''.join(parts))
def parse_args():
    """Read the command line and return the input filename.

    The script takes exactly one positional argument, ``f``.
    """
    cli = argparse.ArgumentParser(description="Process filename")
    cli.add_argument("f", type=str, help="The input filename")
    namespace = cli.parse_args()
    return namespace.f
if __name__ == '__main__':
    # Script entry point: convert the file named on the command line and
    # write the resulting Markdown to stdout.
    source_path = parse_args()
    markdown = convert_notes(source_path)
    print(markdown)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment