Last active
October 20, 2023 21:10
-
-
Save dempe/f26536e2d04c2e8e815e0abf7d8d2d69 to your computer and use it in GitHub Desktop.
Convert Kindle HTML notes to Markdown
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import re

from bs4 import BeautifulSoup
def clean_text(section) -> str:
    """Return the text of *section* flattened onto a single line.

    *section* is any object exposing ``get_text()`` (a BeautifulSoup node in
    this script). Leading whitespace is stripped, then every newline and tab
    is deleted wherever it occurs. Other trailing whitespace is left alone.
    """
    raw = section.get_text()
    # One pass that deletes both characters; nothing is substituted for them.
    drop_newlines_and_tabs = str.maketrans('', '', '\n\t')
    return raw.lstrip().translate(drop_newlines_and_tabs)
# Leading marker of a note heading, e.g. "Highlight(orange) - " or "Bookmark - ".
# The parenthesised colour is optional and may be any colour, not only orange.
_HEADING_PREFIX = re.compile(r'^\s*\w+(?:\s*\([^)]*\))?\s+-\s*')


def parse_chapter(text):
    """Extract the chapter and page from a note heading.

    The heading is expected in one of these shapes (the ``<span>`` around the
    colour has already been flattened to plain text by BeautifulSoup):

    * ``Highlight(orange) - Page xxvii · Location 381``  -> ``(None, 'xxvii')``
    * ``Highlight(orange) - I > Location 38``            -> ``('I', None)``
    * ``Highlight(orange) - I.1 Enter ... > Page 7 · Location 778``
      -> ``('I.1 Enter ...', '7')``
    * anything else, e.g. ``Highlight(orange) - Location 38`` -> ``(None, None)``

    Args:
        text: The heading text on a single line.

    Returns:
        A ``(chapter, page)`` tuple. Each element is a stripped string, or
        ``None`` when the heading does not carry that piece of information.
    """
    # Drop the "<Kind>(<colour>) -" marker whatever the colour is, so a
    # yellow/blue/pink highlight no longer fails the way a hard-coded
    # "Highlight(orange) -" split did.
    body = _HEADING_PREFIX.sub('', text, count=1)

    # The first '>' separates the chapter from the page/location segment.
    if '>' in body:
        chapter, _, location = body.partition('>')
        chapter = chapter.strip()
    else:
        chapter, location = None, body

    # Look for "Page" only in the location segment; a chapter title that
    # happens to contain the word "Page" must not be mistaken for a page number.
    page = None
    if 'Page' in location:
        page = location.partition('Page')[2].partition('·')[0].strip()

    return chapter, page
def remove_brackets(text) -> str:
    """Return *text* with every '[' and ']' character removed.

    Square brackets are Wikilink syntax in Obsidian, so they are dropped to
    keep the generated Markdown from being read as links.
    """
    return ''.join(char for char in text if char not in '[]')
def convert_notes(inputf) -> str:
    """Convert a Kindle HTML notes export into Markdown.

    The output starts with "## Reflection" and "## Notes" headings. Each
    ``sectionHeading`` element becomes a "###" heading, each new chapter found
    in a ``noteHeading`` becomes a "####" heading, and each ``noteText``
    becomes a "+" list item. The item is prefixed with its page when one is
    known. It is nested one level with a tab when its heading starts with
    "Note - ", i.e. when it is a note attached to the preceding highlight.

    Args:
        inputf: Path of the HTML file to read.

    Returns:
        The Markdown document with all square brackets removed. If the file
        contains no ``sectionHeading`` element, only the two fixed headings
        are returned.
    """
    # The heading parser relies on the non-ASCII '·' separator, so decode
    # explicitly rather than with the platform default.
    # NOTE(review): assumes the export is UTF-8 encoded - confirm.
    with open(inputf, 'r', encoding='utf-8') as htmlf:
        soup = BeautifulSoup(htmlf.read(), 'html.parser')

    # Collect fragments and join once at the end instead of repeated `+=`.
    parts = ["## Reflection\n", "## Notes\n"]

    # Find first section heading
    start_node = soup.find('div', {'class': 'sectionHeading'})
    if start_node is None:
        # Nothing to walk from; return the skeleton rather than crashing on None.
        return remove_brackets(''.join(parts))
    parts.append(f"### {clean_text(start_node)}\n")

    # Notes must be parsed sequentially, because the export lays headings and
    # note texts out as siblings rather than nesting them.
    is_note = False
    chapter = ''
    page = None
    for current in start_node.next_siblings:
        # Skip text nodes and elements without a class attribute.
        if not current.name or not current.attrs or 'class' not in current.attrs:
            continue
        class_name = current['class'][0]

        if class_name == 'noteText':
            body = clean_text(current)
            if is_note:
                # Indent notes to make them a child of the preceding highlight.
                parts.append(f"\t+ **Note**: {body}\n")
            elif page:
                parts.append(f"+ *Page {page}*: {body}\n")
            else:
                # Not all books have page numbers. Some only use "Location".
                parts.append(f"+ {body}\n")
        elif class_name == 'noteHeading':
            heading = clean_text(current)
            # Notes added to highlights begin with "Note". Remembering that
            # lets the following noteText be rendered as a sub-item.
            is_note = heading.startswith('Note - ')
            if not is_note:
                new_chapter, page = parse_chapter(heading)
                if new_chapter is not None and new_chapter != chapter:
                    parts.append(f"#### {new_chapter}\n")
                # NOTE: a heading without a chapter resets `chapter` to None,
                # so the next heading that has one is emitted again.
                chapter = new_chapter
        elif class_name == 'sectionHeading':
            parts.append(f"### {clean_text(current)}\n")

    return remove_brackets(''.join(parts))
def parse_args():
    """Parse the command line and return the input filename.

    The script takes exactly one positional argument, ``f``, and its value is
    returned as a string.
    """
    cli = argparse.ArgumentParser(description="Process filename")
    cli.add_argument("f", type=str, help="The input filename")
    namespace = cli.parse_args()
    return namespace.f
# Script entry point: convert the file named on the command line and write
# the resulting Markdown to standard output.
if __name__ == '__main__':
    input_path = parse_args()
    markdown = convert_notes(input_path)
    print(markdown)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment