Skip to content

Instantly share code, notes, and snippets.

@reagle reagle/gr-fix.py
Created Dec 18, 2019

Embed
What would you like to do?
Transform GoodReader PDF app annotations to a more useful format while correcting common OCR scanning issues
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# (c) Copyright 2019 by Joseph Reagle
# Licensed under the GPLv3, see <http://www.gnu.org/licenses/gpl-3.0.html>
import argparse # http://docs.python.org/dev/library/argparse.html
from os.path import basename, splitext
import re
import sys
from enchant.checker import SpellChecker
def restore_spaces(text):
    """Re-insert spaces that OCR dropped between words, using pyenchant.

    Hyphens are stripped from the text first; then each word flagged by
    the en_US spell checker is replaced with the first suggestion that
    differs from it only by spaces.  Approach taken from
    https://stackoverflow.com/questions/23314834/tokenizing-unsplit-words-from-ocr-using-nltk
    """
    spell = SpellChecker("en_US")
    # Drop spurious hyphens.  The {,2} bounds also match zero letters on
    # either side, so this is known to be too aggressive right now.
    dehyphenated = re.sub(r"([a-zA-Z]{,2})(-)([a-zA-Z]{,2})", r"\1\3", text)
    spell.set_text(dehyphenated)
    for flagged in spell:
        squeezed = flagged.word.replace(' ', '')
        # Only accept a suggestion that equals the original word once
        # spaces are removed, i.e. one that merely splits the word.
        split_form = next(
            (candidate for candidate in flagged.suggest()
             if candidate.replace(' ', '') == squeezed),
            None,
        )
        if split_form is not None:
            flagged.replace(split_form)
    return spell.get_text()
def uncurly(text):
    """Swap typographic quotes for straight ones and dashes for markdown.

    Curly double and single quotes become ``"`` and ``'``; an en dash
    becomes ``--`` and an em dash becomes ``---``.
    """
    plain_for = {
        '“': '"',
        '”': '"',
        '‘': "'",
        '’': "'",
        '–': "--",
        '—': '---',
    }
    # One pass over the text; no replacement contains a mapped character,
    # so the result matches applying the substitutions one after another.
    return text.translate(str.maketrans(plain_for))
def process_lines(lines, number=None):
    """Process each line for annotation kind, color, and page number.

    Walks the lines of an annotation export and returns the annotation
    text lines, each formatted as ``"<page> <prefix> <text>"`` (stripped),
    joined with newlines.

    Line handling:
      * blank lines are skipped;
      * a line starting with ``Bookmark:`` is skipped together with the
        line that follows it;
      * ``--- Page N ---`` sets the current page number (plus `number`);
      * ``Kind (color),`` headers select the prefix for the text lines
        that follow: yellow highlights get ``excerpt.``, blue highlights
        get ``section.`` and no page number, everything else gets no
        prefix;
      * any other line is annotation text; it is passed through
        `restore_spaces` and `uncurly` and appended to the output.

    Args:
        lines: iterable of strings (trailing newlines are tolerated).
        number: integer offset added to every page number.  When None
            (the default), the module-level ``args.number`` set by the
            script entry point is used if it exists, otherwise 0.

    Returns:
        The formatted annotation lines joined with ``'\\n'``.
    """
    RE_PAGE_NUM = re.compile(r'--- Page (\d+) ---')
    RE_ANNOTATION = re.compile(r'^(?P<kind>\w+) \((?P<color>\w+)\),')

    if number is None:
        # Backward compatibility: the script entry point stores the parsed
        # command line in a module-level `args`.  Fall back to no offset
        # when the function is used without it (e.g. imported).
        try:
            number = args.number
        except NameError:
            number = 0

    new_text = []
    adjusted_page_num = 0
    # Initialised so text appearing before any annotation header does not
    # raise UnboundLocalError.
    print_page_num = ''
    prefix = ''
    ignore_next_line = False
    for line in lines:
        if not line.strip() or ignore_next_line:
            ignore_next_line = False
            continue
        if line.startswith('Bookmark:'):
            ignore_next_line = True
            continue

        page_match = RE_PAGE_NUM.match(line)
        if page_match:
            # Always track the page, even when the offset is 0; guarding
            # this on a truthy offset left every page number at 0.
            adjusted_page_num = str(int(page_match.group(1)) + number)
            continue

        annotation_match = RE_ANNOTATION.match(line)
        if annotation_match:
            kind, color = annotation_match.group('kind', 'color')
            print_page_num = adjusted_page_num
            # Reset so an unrecognised kind/color does not inherit the
            # prefix of the previous annotation.
            prefix = ''
            if kind == 'Highlight':
                if color == 'yellow':
                    prefix = 'excerpt.'
                elif color == 'blue':
                    prefix = 'section.'
                    # Section headings are emitted without a page number.
                    print_page_num = ''
            continue

        fixed_line = uncurly(restore_spaces(line))
        new_text.append(
            f'{print_page_num} {prefix} {fixed_line}'.strip())
    return '\n'.join(new_text)
def main(argv):
    """Parse the command line and return the resulting namespace.

    `argv` is the argument list without the program name.  The namespace
    carries `file_name` (optional positional, None when omitted) and
    `number` (integer page offset, 0 by default).
    """
    parser = argparse.ArgumentParser(
        description="Format GoodRead annotation export for de.py")

    # positional argument (may be omitted)
    parser.add_argument("file_name", metavar="FILE_NAME", nargs="?")

    # optional arguments
    parser.add_argument(
        "-n", "--number",
        default=0,
        type=int,
        help="sum positive/negative number with pagination",
    )
    parser.add_argument("--version", version="0.1", action="version")

    namespace = parser.parse_args(argv)
    return namespace
# Sample annotation export, split into a list of lines.  It is the input
# used when the script is run without a FILE_NAME argument.  It exercises
# a bookmark, blue and yellow highlights, a note, curly quotes, an em dash
# and words run together without spaces.
TEST = """
--- Page 36 ---
Bookmark:
Dec 9
Highlight (blue), reagle:
3 A World Brain as an “Education System”
Highlight (blue), reagle:
3.1 A World BrainasaLearning System
Note (yellow), reagle:
Little discussion of HG Wells directly—rather a literaturereview of work related to motifs in Wells
--- Page 39 ---
Highlight (blue), reagle:
3.2 A World Brain asaTeaching System
--- Page 71 ---
Highlight (yellow), reagle:
In my own view, there is an urgentneed for a sudden surge of understanding, positive thinking andaltruistic attitudes.
""".split('\n')
if __name__ == '__main__':
    # `args` is deliberately module-level: process_lines reads it.
    args = main(sys.argv[1:])
    if not args.file_name:
        # No input file given: run the built-in sample and show the result.
        print(process_lines(TEST))
    else:
        with open(args.file_name) as source:
            lines = source.readlines()
        # Write next to the input, e.g. notes.txt -> notes-fix.txt
        stem, _extension = splitext(args.file_name)
        fixed_fn = stem + '-fix.txt'
        with open(fixed_fn, 'w') as sink:
            sink.write(process_lines(lines))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.