|
#!/usr/bin/env python |
|
# encoding: utf-8 |
|
""" |
|
*Parse kindle book highlights from exported kindle HTML file and embed them in PDF version of the same book* |
|
|
|
:Author: |
|
David Young |
|
|
|
:Date Created: |
|
August 21, 2021 |
|
|
|
Usage: |
|
kindle_highlights_to_pdf <htmlPath> <pdfPath> |
|
|
|
Options: |
|
-h, --help show this help message |
|
htmlPath path to the kindle export of book highlights |
|
pdfPath path to the PDF version of the book |
|
""" |
|
################# GLOBAL IMPORTS #################### |
|
import sys |
|
import os |
|
from fundamentals import tools |
|
from os.path import expanduser |
|
import codecs |
|
from bs4 import BeautifulSoup |
|
from fuzzysearch import find_near_matches |
|
import fitz |
|
import pandas as pd |
|
from tabulate import tabulate |
|
import sqlite3 as sql |
|
import numpy as np |
|
import time |
|
# RGB stroke colours (0-1 range) used for the PDF highlight annotations, keyed
# by the colour names that appear in the kindle HTML export headings
rgbColors = {
    "blue": (0.75, 0.8, 0.95),
    "yellow": (1, 1, 0.6),
    "orange": (1, 0.75, 0.5),
    "pink": (1, 0.7, 0.9)
}
|
|
|
|
|
def main(arguments=None):
    """
    *The main function used when ``kindle_highlights_to_pdf.py`` is run as a single script from the cl*

    **Key Arguments:**

    - ``arguments`` -- optional pre-parsed command-line arguments (parsed from ``sys.argv`` when ``None``)
    """

    # SETUP THE COMMAND-LINE UTIL SETTINGS
    su = tools(
        arguments=arguments,
        docString=__doc__,
        logLevel="WARNING",
        options_first=False,
        projectName=False
    )
    arguments, settings, log, dbConn = su.setup()

    # UNPACK REMAINING CL ARGUMENTS INTO A DICT, CONVERTING DOCOPT KEYS
    # (`--flag`, `<positional>`) INTO PLAIN VARIABLE NAMES
    a = {}
    for arg, val in list(arguments.items()):
        if arg[0] == "-":
            varname = arg.replace("-", "") + "Flag"
        else:
            varname = arg.replace("<", "").replace(">", "")
        a[varname] = val
        if arg == "--dbConn":
            dbConn = val
            a["dbConn"] = val
        log.debug('%s = %s' % (varname, val,))

    htmlPath = a['htmlPath']
    pdfPath = a['pdfPath']

    # MAKE RELATIVE HOME PATH ABSOLUTE
    # expanduser only expands a *leading* tilde; a blanket str.replace("~", home)
    # would corrupt paths containing "~" elsewhere and crash on empty paths
    htmlPath = expanduser(htmlPath)
    pdfPath = expanduser(pdfPath)

    highlights = parse_html_highlights(
        log=log,
        htmlPath=htmlPath)

    output = embed_highlights_in_pdf(
        log=log,
        highlights=highlights,
        pdfPath=pdfPath)

    print(f"Highlighted PDF is here: '{output}'")

    return
|
|
|
|
|
def parse_html_highlights(
        log,
        htmlPath):
    """*return a dataframe of highlight content to embed in the PDF*

    **Key Arguments:**

    - ``log`` -- logger
    - ``htmlPath`` -- path to the kindle export of book highlights

    **Return:**

    - ``highlights`` -- pandas dataframe with ``highlight`` and ``color``
      columns plus empty placeholder columns filled in later by the matching
      passes (``page``, ``page_min``, ``page_max``, ``match``,
      ``split_match_bot``, ``split_match_top``, ``length``)

    **Usage:**

    ```python
    highlights = parse_html_highlights(
        log=log,
        htmlPath=htmlPath
    )
    ```
    """
    log.debug('starting the ``parse_html_highlights`` function')

    # READ AND PARSE THE EXPORTED HTML
    with open(htmlPath, encoding='utf-8') as readFile:
        soup = BeautifulSoup(readFile.read(), 'html.parser')

    noteDivs = soup.find_all("div", {"class": "noteText"})
    headings = soup.find_all("h3", {"class": "noteHeading"})

    # THE FIRST SPAN IN EACH HEADING HOLDS THE HIGHLIGHT COLOUR NAME
    colors = [h.find("span").text for h in headings]
    texts = [h.text.split("Highlight (")[0].strip() for h in noteDivs]

    # REMOVE HIGHLIGHTS OF LESS THAN 4 WORDS OR 20 CHARACTERS (TOO SHORT TO
    # MATCH RELIABLY) -- FILTER TEXT AND COLOUR TOGETHER IN A SINGLE PASS
    kept = [(t, c) for t, c in zip(texts, colors)
            if len(t) > 19 and len(t.split()) > 3]
    texts = [t for t, c in kept]
    colors = [c for t, c in kept]

    # CREATE DATA FRAME WITH PLACEHOLDER MATCH COLUMNS
    highlights = pd.DataFrame({
        "highlight": texts,
        "color": colors})
    highlights['page'] = 0
    highlights['page_min'] = 0
    highlights['page_max'] = 0
    highlights['match'] = ""
    highlights['split_match_bot'] = ""
    highlights['split_match_top'] = ""
    highlights['length'] = 0

    log.debug('completed the ``parse_html_highlights`` function')
    return highlights
|
|
|
|
|
def embed_highlights_in_pdf(
        log,
        highlights,
        pdfPath):
    """*annotate a copy of the PDF with the parsed highlights and record match data in a sqlite database*

    **Key Arguments:**

    - ``log`` -- logger
    - ``highlights`` -- dataframe of highlighted text content (as returned by ``parse_html_highlights``)
    - ``pdfPath`` -- path to the PDF version of the book

    **Return:**

    - ``outfile`` -- path to the newly written, highlighted PDF

    **Usage:**

    ```python
    output = embed_highlights_in_pdf(
        log=log,
        highlights=highlights,
        pdfPath=pdfPath)
    ```
    """
    log.debug('starting the ``embed_highlights_in_pdf`` function')

    # COPY THE CLEAN PDF INTO AN EMPTY DOCUMENT READY FOR ANNOTATION
    # NOTE: PyMuPDF renamed its camelCase API to snake_case
    # (insertPDF -> insert_pdf, pageCount -> page_count); page indices are
    # 0-based so the last page is page_count - 1
    cleanPDF = fitz.open(pdfPath)
    annotatedPDF = fitz.open()
    annotatedPDF.insert_pdf(
        cleanPDF,
        from_page=0,
        to_page=cleanPDF.page_count - 1)
    cleanPDF.close()

    total = len(highlights)
    count = 0

    # PASS 1: FIND HIGHLIGHTS CONTAINED WITHIN A SINGLE PAGE, PROGRESSIVELY
    # RELAXING THE LEVENSHTEIN TOLERANCE UNTIL 90% ARE MATCHED
    percent = 0
    matchTolerance = 4
    while percent < 90 and matchTolerance < 40:
        # brief pause between passes -- presumably to keep console output
        # readable; TODO confirm this is still wanted
        time.sleep(2)
        highlights = highlights.apply(
            find_in_single_page_hightlight, axis=1, pdf=annotatedPDF, matchTolerance=matchTolerance)

        # COUNT THE ROWS MATCHED SO FAR
        mask = (highlights['match'] != "")
        totalMatched = len(highlights.loc[mask])
        total = len(highlights.index)
        percent = int(totalMatched * 100 / total)
        print(f"Matched {totalMatched}/{total} annotations ({percent}%) (max levenshtein distance = {matchTolerance})")

        # HIGHLIGHTS APPEAR IN BOOK ORDER, SO AN UNMATCHED ROW MUST LIE
        # BETWEEN THE PAGES OF ITS MATCHED NEIGHBOURS -- DERIVE A [page_min,
        # page_max] SEARCH WINDOW FOR EACH ROW FROM RUNNING MAX/MIN OF THE
        # MATCHED PAGE NUMBERS (0 = UNMATCHED, TEMPORARILY REPLACED BY
        # SENTINELS SO IT NEVER WINS THE MAX/MIN)
        pages = np.copy(highlights["page"].values)
        pages[pages == 0] = -999999
        matchTolerance = matchTolerance + 5
        theseMin = []
        theseMin[:] = [np.max(pages[:i + 1])
                       for i in range(len(highlights.index))]
        pages[pages == -999999] = 999999
        theseMax = []
        theseMax[:] = [i for i in range(len(highlights.index))]
        theseMax[:] = [np.min(pages[i:])
                       for i in range(len(highlights.index))]
        highlights['page_min'] = theseMin
        highlights['page_max'] = theseMax
        if totalMatched == total:
            break

    # PASS 2: THE REMAINING HIGHLIGHTS LIKELY SPAN A PAGE BREAK -- MATCH THE
    # TWO HALVES SEPARATELY AND STITCH THEM TOGETHER
    matchTolerance = 4
    while matchTolerance < 40:
        if totalMatched == total:
            break
        time.sleep(0.5)
        highlights = highlights.apply(
            find_in_over_multiple_pages, axis=1, pdf=annotatedPDF, matchTolerance=matchTolerance)

        # STITCH BOTTOM-OF-PAGE AND TOP-OF-PAGE HALVES INTO A FULL MATCH
        mask = ((highlights['split_match_bot'] !=
                 "") & (highlights['split_match_top'] != ""))
        highlights.loc[mask, 'match'] = highlights[
            'split_match_bot'] + " " + highlights['split_match_top']
        mask = (highlights['match'] != "")
        totalMatched = len(highlights.loc[mask])
        total = len(highlights.index)
        percent = int(totalMatched * 100 / total)
        print(f"Matched {totalMatched}/{total} annotations ({percent}%) (max levenshtein distance = {matchTolerance})")
        if totalMatched == total:
            break
        matchTolerance = matchTolerance + 2

    # WRITE THE ANNOTATED PDF ALONGSIDE THE ORIGINAL
    outfile = pdfPath.replace(".pdf", "_highlighted.pdf")
    annotatedPDF.save(outfile, garbage=4, deflate=True, clean=True)
    annotatedPDF.close()

    # RECORD THE MATCH DATA IN A SQLITE DATABASE (AND CLOSE THE CONNECTION
    # ONCE WRITTEN)
    conn = sql.connect("highlights_export.db")
    highlights.to_sql('highlights', con=conn,
                      index=False, if_exists='replace')
    conn.close()

    log.debug('completed the ``embed_highlights_in_pdf`` function')
    return outfile
|
|
|
|
|
def find_in_single_page_hightlight(
        series,
        pdf,
        matchTolerance):
    """*given a highlighted text, find its location in the PDF (doesn't find highlights spanning multiple pages)*

    **Key Arguments:**

    - ``series`` -- the dataframe row/series to apply work on
    - ``pdf`` -- the fitz PDF object
    - ``matchTolerance`` -- lower tolerance is stricter, matching less, but faster

    **Return:**

    - ``series`` -- the same row with ``page``, ``match`` and ``length`` filled in when a match is found
    """
    h = series['highlight']
    c = series['color']
    color = rgbColors[c]
    series['length'] = len(h)
    matchText = None

    # ALREADY MATCHED ON AN EARLIER, STRICTER PASS
    if len(series['match']):
        return series

    # RESTRICT THE SEARCH TO THE PAGE WINDOW DERIVED FROM NEIGHBOURING MATCHES
    page_min = series['page_min']
    page_max = series['page_max'] + 1

    if page_max == 1 or page_max > 99998:
        # NO WINDOW YET (OR SENTINEL VALUE) -- SEARCH THE WHOLE DOCUMENT
        # NOTE: PyMuPDF renamed pageCount -> page_count
        page_max = pdf.page_count
    if page_min < 1:
        page_min = 1

    for p in range(page_min, page_max):
        page = pdf[p]
        pageText = page.get_text("text")
        matches = find_near_matches(
            h, pageText, max_l_dist=matchTolerance)
        for match in matches:
            candidate = match.matched
            textOnPage = page.search_for(candidate)
            if not textOnPage:
                # search_for CAN FAIL TO RELOCATE A FUZZY MATCH (e.g. ODD
                # LAYOUT/LIGATURES); SKIP IT RATHER THAN RETRYING FOREVER --
                # THE ORIGINAL RETRY LOOP COULD HANG THE SCRIPT HERE
                continue
            clMatch = candidate.replace("\n", " ")[:30]
            print(f"  MATCH: {clMatch}...")
            start = textOnPage[0].top_left
            end = textOnPage[-1].bottom_right
            series["page"] = p
            series['match'] = candidate
            annot = page.add_highlight_annot(None, start=start, stop=end)
            annot.set_colors({"stroke": color})
            # set_colors ONLY TAKES EFFECT AFTER update() IN CURRENT PyMuPDF
            annot.update()
            matchText = candidate
        if matchText:
            break

    return series
|
|
|
|
|
def find_in_over_multiple_pages(
        series,
        pdf,
        matchTolerance):
    """*given a highlighted text spanning a page break, find and annotate both halves in the PDF*

    The opening characters of the highlight are matched and extended to the
    bottom of their page (``split_match_bot``); the closing characters are
    matched and extended back to the top of their page (``split_match_top``).

    **Key Arguments:**

    - ``series`` -- the dataframe row/series to apply work on
    - ``pdf`` -- the fitz PDF object
    - ``matchTolerance`` -- lower tolerance is stricter, matching less, but faster

    **Return:**

    - ``series`` -- the same row with ``page``, ``page_min``, ``page_max``,
      ``split_match_bot`` and ``split_match_top`` filled in when matched
    """
    h = series['highlight']
    c = series['color']
    color = rgbColors[c]
    series['length'] = len(h)
    matchText = None

    # ALREADY FULLY MATCHED ON AN EARLIER PASS
    if len(series['match']):
        return series

    # RESTRICT THE SEARCH TO THE PAGE WINDOW DERIVED FROM NEIGHBOURING MATCHES
    page_min = series['page_min']
    page_max = series['page_max'] + 1

    if page_max == 1 or page_max > 99998:
        # NOTE: PyMuPDF renamed pageCount -> page_count
        page_max = pdf.page_count
    if page_min < 1:
        page_min = 1

    # FIND THE FIRST HALF: MATCH THE OPENING CHARACTERS AND EXTEND TO THE
    # BOTTOM OF THE PAGE
    if not len(series['split_match_bot']):

        snippet = h[:40]
        for p in range(page_min, page_max):
            if len(series['split_match_bot']):
                break
            page = pdf[p]
            pageText = page.get_text("text")
            matches = find_near_matches(
                snippet, pageText, max_l_dist=matchTolerance)
            for match in matches:
                matchText = match.matched
                if len(matchText):
                    # EXTEND THE MATCHED SNIPPET TO THE END OF THE PAGE TEXT
                    matchText = matchText + pageText.split(matchText)[-1]
                else:
                    matchText = None

            if matchText:
                # RELOCATE THE EXTENDED TEXT PRECISELY SO search_for CAN FIND IT
                matches = find_near_matches(
                    matchText, pageText, max_l_dist=4)
                for match in matches:
                    matchText = match.matched
                    textOnPage = page.search_for(matchText)
                    if not textOnPage:
                        # search_for CAN FAIL TO RELOCATE A FUZZY MATCH; SKIP
                        # RATHER THAN RETRYING FOREVER (THE ORIGINAL RETRY
                        # LOOP COULD HANG THE SCRIPT HERE)
                        continue
                    clMatch = matchText.replace("\n", " ")[:30]
                    print(f"  MATCH: {clMatch}...")
                    start = textOnPage[0].top_left
                    end = textOnPage[-1].bottom_right
                    series["page"] = p
                    series["split_match_bot"] = matchText
                    annot = page.add_highlight_annot(
                        None, start=start, stop=end)
                    annot.set_colors({"stroke": color})
                    # set_colors ONLY TAKES EFFECT AFTER update() IN CURRENT PyMuPDF
                    annot.update()

                # THE SECOND HALF CANNOT APPEAR BEFORE THIS PAGE
                series['page_min'] = p
                page_min = p
            if matchText:
                break

    # FIND THE SECOND HALF: MATCH THE CLOSING CHARACTERS AND EXTEND BACK TO
    # THE TOP OF THE PAGE
    if not len(series['split_match_top']):

        matchText = None
        snippet = h[-40:]
        for p in range(page_min, page_max):
            if len(series['split_match_top']):
                break
            page = pdf[p]
            pageText = page.get_text("text")
            matches = find_near_matches(
                snippet, pageText, max_l_dist=matchTolerance)
            for match in matches:
                matchText = match.matched
                # EXTEND THE MATCHED SNIPPET BACK TO THE START OF THE PAGE TEXT
                matchText = pageText.split(matchText)[0] + matchText

            if matchText:
                # RELOCATE THE EXTENDED TEXT PRECISELY SO search_for CAN FIND IT
                matches = find_near_matches(
                    matchText, pageText, max_l_dist=4)
                for match in matches:
                    matchText = match.matched
                    textOnPage = page.search_for(matchText)
                    if not textOnPage:
                        # SKIP RATHER THAN RETRYING FOREVER (SEE ABOVE)
                        continue
                    clMatch = matchText.replace("\n", " ")[:30]
                    print(f"  MATCH: {clMatch}...")
                    start = textOnPage[0].top_left
                    end = textOnPage[-1].bottom_right
                    series["page"] = p
                    series["split_match_top"] = matchText
                    annot = page.add_highlight_annot(
                        None, start=start, stop=end)
                    annot.set_colors({"stroke": color})
                    # set_colors ONLY TAKES EFFECT AFTER update() IN CURRENT PyMuPDF
                    annot.update()

                series['page_max'] = p
                page_max = p + 1
            if matchText:
                break

    return series
|
|
|
# use the tab-trigger below for new function |
|
# xt-def-function |
|
|
|
# RUN THE SCRIPT FROM THE COMMAND LINE
if __name__ == '__main__':
    main()
# NOTE (PyMuPDF compatibility): PyMuPDF changed from "camelCase" to "snake_case"
# naming (https://pymupdf.readthedocs.io/en/latest/znames.html). Easy to fix.
# NOTE (user-reported issues): some lines in the exported kindle HTML cause the
# script to hang (samples enclosed with the report), and
# `find_in_over_multiple_pages` can create strange highlighting in the target
# PDF file.