|
#!/usr/bin/env python |
|
# encoding: utf-8 |
|
""" |
|
*Parse kindle book highlights from exported kindle HTML file and embed them in PDF version of the same book* |
|
|
|
:Author: |
|
David Young |
|
|
|
:Date Created: |
|
August 21, 2021 |
|
|
|
Usage: |
|
kindle_highlights_to_pdf <htmlPath> <pdfPath> |
|
|
|
Options: |
|
-h, --help show this help message |
|
htmlPath path to the kindle export of book highlights |
|
pdfPath path to the PDF version of the book |
|
""" |
|
################# GLOBAL IMPORTS #################### |
|
import sys |
|
import os |
|
from fundamentals import tools |
|
from os.path import expanduser |
|
import codecs |
|
from bs4 import BeautifulSoup |
|
from fuzzysearch import find_near_matches |
|
import fitz |
|
import pandas as pd |
|
from tabulate import tabulate |
|
import sqlite3 as sql |
|
import numpy as np |
|
import time |
|
# RGB stroke colours (0-1 range) used for the PDF highlight annotations, keyed
# by the colour names that appear in the kindle HTML export headings
rgbColors = {
    "blue": (0.75, 0.8, 0.95),
    "yellow": (1, 1, 0.6),
    "orange": (1, 0.75, 0.5),
    "pink": (1, 0.7, 0.9)
}
|
|
|
|
|
def main(arguments=None):
    """
    *The main function used when ``kindle_highlights_to_pdf.py`` is run as a single script from the cl*

    **Key Arguments:**

    - ``arguments`` -- optional pre-parsed command-line arguments (parsed from ``sys.argv`` when ``None``)
    """

    # SETUP THE COMMAND-LINE UTIL SETTINGS
    su = tools(
        arguments=arguments,
        docString=__doc__,
        logLevel="WARNING",
        options_first=False,
        projectName=False
    )
    arguments, settings, log, dbConn = su.setup()

    # UNPACK REMAINING CL ARGUMENTS INTO A DICT, CONVERTING DOCOPT KEYS
    # (`--flag`, `<positional>`) INTO PLAIN VARIABLE NAMES
    a = {}
    for arg, val in list(arguments.items()):
        if arg[0] == "-":
            varname = arg.replace("-", "") + "Flag"
        else:
            varname = arg.replace("<", "").replace(">", "")
        a[varname] = val
        if arg == "--dbConn":
            dbConn = val
            a["dbConn"] = val
        log.debug('%s = %s' % (varname, val,))

    htmlPath = a['htmlPath']
    pdfPath = a['pdfPath']

    # MAKE RELATIVE HOME PATH ABSOLUTE
    # expanduser only expands a *leading* tilde; a blanket str.replace("~", home)
    # would corrupt paths containing "~" elsewhere and crash on empty paths
    htmlPath = expanduser(htmlPath)
    pdfPath = expanduser(pdfPath)

    highlights = parse_html_highlights(
        log=log,
        htmlPath=htmlPath)

    output = embed_highlights_in_pdf(
        log=log,
        highlights=highlights,
        pdfPath=pdfPath)

    print(f"Highlighted PDF is here: '{output}'")

    return
|
|
|
|
|
def parse_html_highlights(
        log,
        htmlPath):
    """*return a dataframe of highlight content to embed in the PDF*

    **Key Arguments:**

    - ``log`` -- logger
    - ``htmlPath`` -- path to the kindle export of book highlights

    **Return:**

    - ``highlights`` -- pandas dataframe with ``highlight`` and ``color``
      columns plus empty placeholder columns filled in later by the matching
      passes (``page``, ``page_min``, ``page_max``, ``match``,
      ``split_match_bot``, ``split_match_top``, ``length``)

    **Usage:**

    ```python
    highlights = parse_html_highlights(
        log=log,
        htmlPath=htmlPath
    )
    ```
    """
    log.debug('starting the ``parse_html_highlights`` function')

    # READ AND PARSE THE EXPORTED HTML
    with open(htmlPath, encoding='utf-8') as readFile:
        soup = BeautifulSoup(readFile.read(), 'html.parser')

    noteDivs = soup.find_all("div", {"class": "noteText"})
    headings = soup.find_all("h3", {"class": "noteHeading"})

    # THE FIRST SPAN IN EACH HEADING HOLDS THE HIGHLIGHT COLOUR NAME
    colors = [h.find("span").text for h in headings]
    texts = [h.text.split("Highlight (")[0].strip() for h in noteDivs]

    # REMOVE HIGHLIGHTS OF LESS THAN 4 WORDS OR 20 CHARACTERS (TOO SHORT TO
    # MATCH RELIABLY) -- FILTER TEXT AND COLOUR TOGETHER IN A SINGLE PASS
    kept = [(t, c) for t, c in zip(texts, colors)
            if len(t) > 19 and len(t.split()) > 3]
    texts = [t for t, c in kept]
    colors = [c for t, c in kept]

    # CREATE DATA FRAME WITH PLACEHOLDER MATCH COLUMNS
    highlights = pd.DataFrame({
        "highlight": texts,
        "color": colors})
    highlights['page'] = 0
    highlights['page_min'] = 0
    highlights['page_max'] = 0
    highlights['match'] = ""
    highlights['split_match_bot'] = ""
    highlights['split_match_top'] = ""
    highlights['length'] = 0

    log.debug('completed the ``parse_html_highlights`` function')
    return highlights
|
|
|
|
|
def embed_highlights_in_pdf(
        log,
        highlights,
        pdfPath):
    """*annotate a copy of the PDF with the parsed highlights and record match data in a sqlite database*

    **Key Arguments:**

    - ``log`` -- logger
    - ``highlights`` -- dataframe of highlighted text content (as returned by ``parse_html_highlights``)
    - ``pdfPath`` -- path to the PDF version of the book

    **Return:**

    - ``outfile`` -- path to the newly written, highlighted PDF

    **Usage:**

    ```python
    output = embed_highlights_in_pdf(
        log=log,
        highlights=highlights,
        pdfPath=pdfPath)
    ```
    """
    log.debug('starting the ``embed_highlights_in_pdf`` function')

    # COPY THE CLEAN PDF INTO AN EMPTY DOCUMENT READY FOR ANNOTATION
    # NOTE: PyMuPDF renamed its camelCase API to snake_case
    # (insertPDF -> insert_pdf, pageCount -> page_count); page indices are
    # 0-based so the last page is page_count - 1
    cleanPDF = fitz.open(pdfPath)
    annotatedPDF = fitz.open()
    annotatedPDF.insert_pdf(
        cleanPDF,
        from_page=0,
        to_page=cleanPDF.page_count - 1)
    cleanPDF.close()

    total = len(highlights)
    count = 0

    # PASS 1: FIND HIGHLIGHTS CONTAINED WITHIN A SINGLE PAGE, PROGRESSIVELY
    # RELAXING THE LEVENSHTEIN TOLERANCE UNTIL 90% ARE MATCHED
    percent = 0
    matchTolerance = 4
    while percent < 90 and matchTolerance < 40:
        # brief pause between passes -- presumably to keep console output
        # readable; TODO confirm this is still wanted
        time.sleep(2)
        highlights = highlights.apply(
            find_in_single_page_hightlight, axis=1, pdf=annotatedPDF, matchTolerance=matchTolerance)

        # COUNT THE ROWS MATCHED SO FAR
        mask = (highlights['match'] != "")
        totalMatched = len(highlights.loc[mask])
        total = len(highlights.index)
        percent = int(totalMatched * 100 / total)
        print(f"Matched {totalMatched}/{total} annotations ({percent}%) (max levenshtein distance = {matchTolerance})")

        # HIGHLIGHTS APPEAR IN BOOK ORDER, SO AN UNMATCHED ROW MUST LIE
        # BETWEEN THE PAGES OF ITS MATCHED NEIGHBOURS -- DERIVE A [page_min,
        # page_max] SEARCH WINDOW FOR EACH ROW FROM RUNNING MAX/MIN OF THE
        # MATCHED PAGE NUMBERS (0 = UNMATCHED, TEMPORARILY REPLACED BY
        # SENTINELS SO IT NEVER WINS THE MAX/MIN)
        pages = np.copy(highlights["page"].values)
        pages[pages == 0] = -999999
        matchTolerance = matchTolerance + 5
        theseMin = []
        theseMin[:] = [np.max(pages[:i + 1])
                       for i in range(len(highlights.index))]
        pages[pages == -999999] = 999999
        theseMax = []
        theseMax[:] = [i for i in range(len(highlights.index))]
        theseMax[:] = [np.min(pages[i:])
                       for i in range(len(highlights.index))]
        highlights['page_min'] = theseMin
        highlights['page_max'] = theseMax
        if totalMatched == total:
            break

    # PASS 2: THE REMAINING HIGHLIGHTS LIKELY SPAN A PAGE BREAK -- MATCH THE
    # TWO HALVES SEPARATELY AND STITCH THEM TOGETHER
    matchTolerance = 4
    while matchTolerance < 40:
        if totalMatched == total:
            break
        time.sleep(0.5)
        highlights = highlights.apply(
            find_in_over_multiple_pages, axis=1, pdf=annotatedPDF, matchTolerance=matchTolerance)

        # STITCH BOTTOM-OF-PAGE AND TOP-OF-PAGE HALVES INTO A FULL MATCH
        mask = ((highlights['split_match_bot'] !=
                 "") & (highlights['split_match_top'] != ""))
        highlights.loc[mask, 'match'] = highlights[
            'split_match_bot'] + " " + highlights['split_match_top']
        mask = (highlights['match'] != "")
        totalMatched = len(highlights.loc[mask])
        total = len(highlights.index)
        percent = int(totalMatched * 100 / total)
        print(f"Matched {totalMatched}/{total} annotations ({percent}%) (max levenshtein distance = {matchTolerance})")
        if totalMatched == total:
            break
        matchTolerance = matchTolerance + 2

    # WRITE THE ANNOTATED PDF ALONGSIDE THE ORIGINAL
    outfile = pdfPath.replace(".pdf", "_highlighted.pdf")
    annotatedPDF.save(outfile, garbage=4, deflate=True, clean=True)
    annotatedPDF.close()

    # RECORD THE MATCH DATA IN A SQLITE DATABASE (AND CLOSE THE CONNECTION
    # ONCE WRITTEN)
    conn = sql.connect("highlights_export.db")
    highlights.to_sql('highlights', con=conn,
                      index=False, if_exists='replace')
    conn.close()

    log.debug('completed the ``embed_highlights_in_pdf`` function')
    return outfile
|
|
|
|
|
def find_in_single_page_hightlight(
        series,
        pdf,
        matchTolerance):
    """*given a highlighted text, find its location in the PDF (doesn't find highlights spanning multiple pages)*

    **Key Arguments:**

    - ``series`` -- the dataframe row/series to apply work on
    - ``pdf`` -- the fitz PDF object
    - ``matchTolerance`` -- lower tolerance is stricter, matching less, but faster

    **Return:**

    - ``series`` -- the same row with ``page``, ``match`` and ``length`` filled in when a match is found
    """
    h = series['highlight']
    c = series['color']
    color = rgbColors[c]
    series['length'] = len(h)
    matchText = None

    # ALREADY MATCHED ON AN EARLIER, STRICTER PASS
    if len(series['match']):
        return series

    # RESTRICT THE SEARCH TO THE PAGE WINDOW DERIVED FROM NEIGHBOURING MATCHES
    page_min = series['page_min']
    page_max = series['page_max'] + 1

    if page_max == 1 or page_max > 99998:
        # NO WINDOW YET (OR SENTINEL VALUE) -- SEARCH THE WHOLE DOCUMENT
        # NOTE: PyMuPDF renamed pageCount -> page_count
        page_max = pdf.page_count
    if page_min < 1:
        page_min = 1

    for p in range(page_min, page_max):
        page = pdf[p]
        pageText = page.get_text("text")
        matches = find_near_matches(
            h, pageText, max_l_dist=matchTolerance)
        for match in matches:
            candidate = match.matched
            textOnPage = page.search_for(candidate)
            if not textOnPage:
                # search_for CAN FAIL TO RELOCATE A FUZZY MATCH (e.g. ODD
                # LAYOUT/LIGATURES); SKIP IT RATHER THAN RETRYING FOREVER --
                # THE ORIGINAL RETRY LOOP COULD HANG THE SCRIPT HERE
                continue
            clMatch = candidate.replace("\n", " ")[:30]
            print(f"  MATCH: {clMatch}...")
            start = textOnPage[0].top_left
            end = textOnPage[-1].bottom_right
            series["page"] = p
            series['match'] = candidate
            annot = page.add_highlight_annot(None, start=start, stop=end)
            annot.set_colors({"stroke": color})
            # set_colors ONLY TAKES EFFECT AFTER update() IN CURRENT PyMuPDF
            annot.update()
            matchText = candidate
        if matchText:
            break

    return series
|
|
|
|
|
def find_in_over_multiple_pages(
        series,
        pdf,
        matchTolerance):
    """*given a highlighted text spanning a page break, find and annotate both halves in the PDF*

    The opening characters of the highlight are matched and extended to the
    bottom of their page (``split_match_bot``); the closing characters are
    matched and extended back to the top of their page (``split_match_top``).

    **Key Arguments:**

    - ``series`` -- the dataframe row/series to apply work on
    - ``pdf`` -- the fitz PDF object
    - ``matchTolerance`` -- lower tolerance is stricter, matching less, but faster

    **Return:**

    - ``series`` -- the same row with ``page``, ``page_min``, ``page_max``,
      ``split_match_bot`` and ``split_match_top`` filled in when matched
    """
    h = series['highlight']
    c = series['color']
    color = rgbColors[c]
    series['length'] = len(h)
    matchText = None

    # ALREADY FULLY MATCHED ON AN EARLIER PASS
    if len(series['match']):
        return series

    # RESTRICT THE SEARCH TO THE PAGE WINDOW DERIVED FROM NEIGHBOURING MATCHES
    page_min = series['page_min']
    page_max = series['page_max'] + 1

    if page_max == 1 or page_max > 99998:
        # NOTE: PyMuPDF renamed pageCount -> page_count
        page_max = pdf.page_count
    if page_min < 1:
        page_min = 1

    # FIND THE FIRST HALF: MATCH THE OPENING CHARACTERS AND EXTEND TO THE
    # BOTTOM OF THE PAGE
    if not len(series['split_match_bot']):

        snippet = h[:40]
        for p in range(page_min, page_max):
            if len(series['split_match_bot']):
                break
            page = pdf[p]
            pageText = page.get_text("text")
            matches = find_near_matches(
                snippet, pageText, max_l_dist=matchTolerance)
            for match in matches:
                matchText = match.matched
                if len(matchText):
                    # EXTEND THE MATCHED SNIPPET TO THE END OF THE PAGE TEXT
                    matchText = matchText + pageText.split(matchText)[-1]
                else:
                    matchText = None

            if matchText:
                # RELOCATE THE EXTENDED TEXT PRECISELY SO search_for CAN FIND IT
                matches = find_near_matches(
                    matchText, pageText, max_l_dist=4)
                for match in matches:
                    matchText = match.matched
                    textOnPage = page.search_for(matchText)
                    if not textOnPage:
                        # search_for CAN FAIL TO RELOCATE A FUZZY MATCH; SKIP
                        # RATHER THAN RETRYING FOREVER (THE ORIGINAL RETRY
                        # LOOP COULD HANG THE SCRIPT HERE)
                        continue
                    clMatch = matchText.replace("\n", " ")[:30]
                    print(f"  MATCH: {clMatch}...")
                    start = textOnPage[0].top_left
                    end = textOnPage[-1].bottom_right
                    series["page"] = p
                    series["split_match_bot"] = matchText
                    annot = page.add_highlight_annot(
                        None, start=start, stop=end)
                    annot.set_colors({"stroke": color})
                    # set_colors ONLY TAKES EFFECT AFTER update() IN CURRENT PyMuPDF
                    annot.update()

                # THE SECOND HALF CANNOT APPEAR BEFORE THIS PAGE
                series['page_min'] = p
                page_min = p
            if matchText:
                break

    # FIND THE SECOND HALF: MATCH THE CLOSING CHARACTERS AND EXTEND BACK TO
    # THE TOP OF THE PAGE
    if not len(series['split_match_top']):

        matchText = None
        snippet = h[-40:]
        for p in range(page_min, page_max):
            if len(series['split_match_top']):
                break
            page = pdf[p]
            pageText = page.get_text("text")
            matches = find_near_matches(
                snippet, pageText, max_l_dist=matchTolerance)
            for match in matches:
                matchText = match.matched
                # EXTEND THE MATCHED SNIPPET BACK TO THE START OF THE PAGE TEXT
                matchText = pageText.split(matchText)[0] + matchText

            if matchText:
                # RELOCATE THE EXTENDED TEXT PRECISELY SO search_for CAN FIND IT
                matches = find_near_matches(
                    matchText, pageText, max_l_dist=4)
                for match in matches:
                    matchText = match.matched
                    textOnPage = page.search_for(matchText)
                    if not textOnPage:
                        # SKIP RATHER THAN RETRYING FOREVER (SEE ABOVE)
                        continue
                    clMatch = matchText.replace("\n", " ")[:30]
                    print(f"  MATCH: {clMatch}...")
                    start = textOnPage[0].top_left
                    end = textOnPage[-1].bottom_right
                    series["page"] = p
                    series["split_match_top"] = matchText
                    annot = page.add_highlight_annot(
                        None, start=start, stop=end)
                    annot.set_colors({"stroke": color})
                    # set_colors ONLY TAKES EFFECT AFTER update() IN CURRENT PyMuPDF
                    annot.update()

                series['page_max'] = p
                page_max = p + 1
            if matchText:
                break

    return series
|
|
|
# use the tab-trigger below for new function |
|
# xt-def-function |
|
|
|
# RUN THE SCRIPT FROM THE COMMAND LINE
if __name__ == '__main__':
    main()
# NOTE (PyMuPDF compatibility): PyMuPDF changed from "camelCase" to "snake_case"
# naming (https://pymupdf.readthedocs.io/en/latest/znames.html). Easy to fix.
# NOTE (user-reported issues): some lines in the exported kindle HTML cause the
# script to hang (samples enclosed with the report), and
# `find_in_over_multiple_pages` can create strange highlighting in the target
# PDF file.