Skip to content

Instantly share code, notes, and snippets.

@jaiamo
Created January 22, 2021 20:37
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jaiamo/e2e853174a8652231739c166f4b9fe4c to your computer and use it in GitHub Desktop.
Save jaiamo/e2e853174a8652231739c166f4b9fe4c to your computer and use it in GitHub Desktop.
Generate markdown notes from Kindle highlights
"""
kindle.py: Generate markdown notes from Kindle highlights
"""
import os
import sys
import re
import argparse
import requests
from bs4 import BeautifulSoup
# Parse command-line arguments: -l lists books; -a / -n (mutually
# exclusive) save notes for all books or for one book by number.
parser = argparse.ArgumentParser(description="Generate markdown notes from Kindle highlights")
parser.add_argument("-l", action="store_true", help="list all books")
group = parser.add_mutually_exclusive_group()
group.add_argument("-a", action="store_true", help="save notes from all books")
group.add_argument("-n", metavar="book_num", type=int, help="save notes from specified book number")
args = parser.parse_args()
# Invoked with no arguments at all: show usage and quit.
if len(sys.argv) == 1:
    # print_help() writes directly to stdout and returns None; the original
    # print(parser.print_help()) therefore emitted a stray "None" line.
    parser.print_help()
    sys.exit()
# Credentials come from environment variables mirroring the request headers
# of a browser that is logged in to the Kindle notebook site.  Bail out
# early with a message if either one is missing or empty.
user_agent = os.environ.get("KINDLE_USER_AGENT")
if not user_agent:
    sys.exit("Need $KINDLE_USER_AGENT environment variable.")
cookie = os.environ.get("KINDLE_COOKIE")
if not cookie:
    sys.exit("Need $KINDLE_COOKIE environment variable.")
# Fetch the Kindle notebook landing page using the captured browser headers.
base_url = "https://read.amazon.com/notebook"
session = requests.Session()
session.headers.update({
    "user-agent": user_agent,
    "cookie": cookie,
})
# Explicit timeout so a hung connection cannot block the script forever
# (requests has no default timeout).
response = session.get(base_url, timeout=30)
soup = BeautifulSoup(response.text, "lxml")
# A different (or missing) <title> means we did not get the notebook page —
# presumably a login/redirect page.  Dump the HTML for inspection and stop.
title_tag = soup.select_one("title")
if title_tag is None or title_tag.get_text() != "Kindle: Your Notes and Highlights":
    with open("/tmp/kindle.html", "w") as file:
        file.write(soup.prettify())
    sys.exit(f"Unexpected page. Check: {os.path.abspath(file.name)}")
# Each book in the library sidebar is one div; iterate them in page order.
books = soup.select("#kp-notebook-library .kp-notebook-library-each-book")
for i, book in enumerate(books):
    asin = book["id"]
    title = book.find("h2").get_text().strip()
    author = book.find("p").get_text().split(":")[-1].strip()
    short_title = title.split(":")[0]
    # Strip characters that are unsafe in filenames.  Raw string so the
    # escapes mean what they look like: the original non-raw pattern's '\/'
    # was an invalid string escape (SyntaxWarning on modern Python) and its
    # '\\{' collapsed to a regex-escaped '{', so backslashes were never
    # actually removed; '\\' here puts a literal backslash in the class.
    file_name = re.sub(r'[~"#%&*:<>?/\\{|}]+', "", short_title).lower()
    # List books if arguments have list flag
    if args.l:
        print(f"{i+1:2d}: {short_title} by {author}")
    # Write markdown to {file_name}.md for each book if arguments have save flags
    if args.n == i + 1 or args.a:
        with open(f"{file_name}.md", "w") as file:
            # YAML front matter, H1 title, and a metadata section.
            yaml = "---\ntags:\n - #book\n---\n\n"
            metadata = (
                f"## Metadata\n\n"
                f"- Title: {title}\n"
                f"- Author: {author}\n"
                f"- ASIN: [{asin}](kindle://book?action=openasin={asin})\n\n"
            )
            file.write(yaml)
            file.write(f"# {title} by {author}\n\n")
            file.write(metadata)
            file.write("## Notes")
            # Highlights/notes are split across multiple HTML pages; follow
            # the hidden pagination token until no next page is advertised.
            content_limit_state = "&"
            token = ""
            next_page = True
            while next_page:
                # Timeout so one stalled page fetch cannot hang the run.
                response = session.get(
                    f"{base_url}?asin={asin}&token={token}&contentLimitState={content_limit_state}",
                    timeout=30,
                )
                soup = BeautifulSoup(response.text, "lxml")
                highlights = soup.select(".a-spacing-base .kp-notebook-row-separator")
                # Write highlights / notes to file
                for highlight in highlights:
                    location = highlight.select_one("#kp-annotation-location")["value"]
                    highlight_select = highlight.select_one("#highlight")
                    note_text = highlight.select_one("#note").get_text()
                    file.write("\n\n---\n\n")
                    # Some locations don't have #highlight divs, so select returns None
                    if highlight_select:
                        file.write(highlight_select.get_text())
                    # All locations have #note divs, but most are empty
                    if note_text:
                        file.write(f"\n\nNote: {note_text}")
                    file.write(f" - (Loc: [{location}](kindle://book?action=open&location=#{location}&asin={asin}))")
                # Get URL parameters for next page if it exists
                token_select = soup.select_one(".kp-notebook-annotations-next-page-start")
                next_page = token_select.has_attr("value")
                if next_page:
                    token = token_select["value"]
                    content_limit_state = soup.select_one(".kp-notebook-content-limit-state")["value"]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment