Skip to content

Instantly share code, notes, and snippets.

@reagle reagle/gr-fix.py
Last active Mar 24, 2020

Embed
What would you like to do?
Transform GoodReader PDF app annotations to a more useful format while correcting common OCR scanning issues
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# (c) Copyright 2019 by Joseph Reagle
# Licensed under the GPLv3, see <http://www.gnu.org/licenses/gpl-3.0.html>
import argparse # http://docs.python.org/dev/library/argparse.html
import difflib
import logging
from os.path import basename, splitext
import re
import sys
from enchant.checker import SpellChecker # https://pypi.org/project/pyenchant/
# Short aliases for the logging module's module-level functions, so the rest
# of the file can call e.g. info(...) instead of logging.info(...).
debug = logging.debug
info = logging.info
warning = logging.warning
error = logging.error
critical = logging.critical
exception = logging.exception
def restore_spaces(text):
    """Re-insert spaces that OCR dropped between words, using pyenchant.

    Hyphens are stripped first. Then, for every word the en_US spell
    checker flags, the first suggestion that equals the flagged word once
    spaces are ignored is substituted for it; flagged words without such a
    suggestion are left untouched.

    Approach taken from
    https://stackoverflow.com/questions/23314834/tokenizing-unsplit-words-from-ocr-using-nltk
    """
    # Meant to remove spurious hyphens only, but {,2} also matches zero
    # letters, so this is too aggressive right now...
    dehyphenated = re.sub(r"([a-zA-Z]{,2})(-)([a-zA-Z]{,2})", r"\1\3", text)

    spell_checker = SpellChecker("en_US")
    spell_checker.set_text(dehyphenated)
    for misspelling in spell_checker:
        squeezed = misspelling.word.replace(" ", "")
        # Only accept a suggestion that differs from the original by spaces.
        split_form = next(
            (
                candidate
                for candidate in misspelling.suggest()
                if candidate.replace(" ", "") == squeezed
            ),
            None,
        )
        if split_form is not None:
            misspelling.replace(split_form)
    return spell_checker.get_text()
def uncurly(text):
    """Return *text* with curly quotes straightened and en/em dashes
    rewritten as the markdown forms "--" and "---"."""
    # One translation table instead of a chain of replace() calls; none of
    # the replacement strings contains a character that is itself a key.
    substitutions = str.maketrans(
        {
            "“": '"',
            "”": '"',
            "‘": "'",
            "’": "'",
            "–": "--",
            "—": "---",
        }
    )
    return text.translate(substitutions)
def process_text(text):
    """Convert an annotation export into one formatted line per annotation.

    Sentences split across a blank line (lowercase letter, optional space,
    two newlines, lowercase letter) are joined first. The result is then
    walked line by line:

    * blank lines are dropped, as is a "Bookmark:" line and the line after it;
    * a "--- Page N ---" line sets the current page number, offset by the
      module-level ``args.number``;
    * a "Kind (color)," header chooses the prefix for the text that follows:
      a Note has none, a yellow Highlight is "excerpt.", a blue Highlight is
      "section." and is printed without a page number;
    * any other line is run through restore_spaces() and uncurly() and
      emitted as "<page> <prefix> <text>", stripped of outer whitespace.

    Returns the emitted lines joined with newlines.
    """
    page_marker = re.compile(r"--- Page (\d+) ---")
    annotation_header = re.compile(r"^(?P<kind>\w+) \((?P<color>\w+)\),")
    broken_sentence = re.compile(r"([a-z] ?)\n\n([a-z])")

    page_num = 0  # page number as parsed from the export
    page_num_adjusted = 0  # parsed page number plus the command-line offset
    page_num_print = ""  # what is printed; empty for section headings
    prefix = ""
    skip_line = False
    results = []

    for line in broken_sentence.sub(r"\1\2", text).split("\n"):
        info(f"{line=}")
        info(f"{page_num=}")
        info(f"{page_num_print=}")

        # A pending skip is consumed by whatever line comes next, blank or not.
        if skip_line or not line.strip():
            skip_line = False
            continue
        if line.startswith("Bookmark:"):
            skip_line = True
            continue

        page_match = page_marker.match(line)
        if page_match is not None:
            info("RE_PAGE_NUM match")
            page_num = page_match.group(1)
            info(f"{page_num=} SET")
            page_num_adjusted = str(int(page_num) + args.number)
            info(f"{page_num_adjusted=}")
            continue

        header = annotation_header.match(line)
        if header is not None:
            info("RE_ANNOTATION match")
            page_num_print = page_num_adjusted
            kind = header["kind"]
            color = header["color"]
            # Kinds/colors not listed here keep the prefix set by an earlier
            # annotation header.
            if kind == "Note":
                prefix = ""
            elif kind == "Highlight":
                if color == "yellow":
                    prefix = "excerpt."
                elif color == "blue":
                    prefix = "section."
                    page_num_print = ""
            continue

        cleaned = uncurly(restore_spaces(line))
        entry = f"{page_num_print} {prefix} {cleaned}".strip()
        info(entry)
        results.append(entry)

    return "\n".join(results)
def main(argv):
    """Parse command-line arguments and configure logging.

    Args:
        argv: list of argument strings, without the program name.

    Returns:
        The argparse.Namespace of parsed options, with attributes
        file_name, number, output_to_file, log_to_file, test and verbose.
    """
    # https://docs.python.org/3/library/argparse.html
    arg_parser = argparse.ArgumentParser(
        description="""Format GoodReader annotation export.
        Annotations exported via email https://youtu.be/jh8tebAPWTc?t=117
        Can then be used by dictation-extract.py in
        https://github.com/reagle/thunderdell
        """
    )

    # positional arguments
    arg_parser.add_argument("file_name", nargs="?", metavar="FILE_NAME")

    # optional arguments
    arg_parser.add_argument(
        "-n",
        "--number",
        type=int,
        default=0,
        help="sum positive/negative number with pagination",
    )
    arg_parser.add_argument(
        "-o",
        "--output-to-file",
        action="store_true",
        default=False,
        help="output to FILE-fixed.EXT",
    )
    arg_parser.add_argument(
        "-L",
        "--log-to-file",
        action="store_true",
        default=False,
        help="log to file %(prog)s.log",
    )
    arg_parser.add_argument(
        "-T",
        "--test",
        action="store_true",
        default=False,
        help="run doc tests",
    )
    arg_parser.add_argument(
        "-V",
        "--verbose",
        action="count",
        default=0,
        help="Increase verbosity (specify multiple times for more)",
    )
    arg_parser.add_argument("--version", action="version", version="0.1")
    args = arg_parser.parse_args(argv)

    # 100 is above logging.CRITICAL (50), so nothing is emitted by default.
    log_level = 100
    if args.verbose >= 3:
        log_level = logging.DEBUG  # 10
    elif args.verbose == 2:
        log_level = logging.INFO  # 20
    elif args.verbose == 1:
        log_level = logging.ERROR  # 40

    LOG_FORMAT = "%(levelno)s %(funcName).5s: %(message)s"
    if args.log_to_file:
        # Name the log after the program, as the -L help text promises
        # (previously the template placeholder "PROG-TEMPLATE.log").
        logging.basicConfig(
            filename=f"{arg_parser.prog}.log",
            filemode="w",
            level=log_level,
            format=LOG_FORMAT,
        )
    else:
        logging.basicConfig(level=log_level, format=LOG_FORMAT)

    return args
# Sample annotation export used by the --test option: it is passed to
# process_text() in place of a file's contents.
TEST_IN = """
--- Page 36 ---
Bookmark:
Dec 9
Highlight (blue), reagle:
3 A World Brain as an “Education System”
Highlight (blue), reagle:
3.1 A World BrainasaLearning System
Note (yellow), reagle:
Little discussion of HG Wells directly—rather a literaturereview of work related to motifs in Wells
--- Page 39 ---
Highlight (blue), reagle:
3.2 A World Brain asaTeaching System
--- Page 71 ---
Highlight (yellow), reagle:
In my own view, there is an urgentneed for a sudden surge of
understanding, positive thinking andaltruistic attitudes.
"""
# Reference output for TEST_IN; the --test option prints a context diff of
# the actual process_text(TEST_IN) result against this text.
TEST_OUT = """section. 3 A World Brain as an "Education System"
section. 3.1 A World BrainasaLearning System
36 Little discussion of HG Wells directly---rather a literature review of work related to motifs in Wells
section. 3.2 A World Brain asaTeaching System
71 excerpt. In my own view, there is an urgent need for a sudden surge of understanding, positive thinking and altruistic attitudes."""
# Script entry point: fix the named file, or with --test run the built-in
# sample through process_text() and diff it against the expected output.
if __name__ == "__main__":
    # `args` stays a module-level name because process_text() reads
    # args.number from it.
    args = main(sys.argv[1:])
    critical("==================================")
    critical(f"{args=}")
    file_name = args.file_name
    if args.file_name:
        with open(file_name) as f:
            text = f.read()
        # Process before opening the output so a failure cannot leave a
        # truncated, empty result file behind.
        fixed_text = process_text(text)
        if args.output_to_file:
            fixed_fn = splitext(args.file_name)[0] + "-fixed.txt"
            # The context manager closes the file even if the write fails.
            with open(fixed_fn, "w") as fixed_fd:
                fixed_fd.write(fixed_text)
        else:
            # stdout belongs to the interpreter: write to it, never close it.
            sys.stdout.write(fixed_text)
    elif args.test:
        # Compute the result once and reuse it for both the print and the diff.
        TEST_RESULTS = process_text(TEST_IN)
        print(TEST_RESULTS)
        for diff in difflib.context_diff(
            TEST_RESULTS.split("\n"), TEST_OUT.split("\n")
        ):
            print(diff)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.