Created
July 5, 2023 02:05
-
-
Save edcottrell/ba62d6a64a65f405806a34b40d49dd98 to your computer and use it in GitHub Desktop.
TextExpander Snippet for Cleaning OCR Output
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Attempt to clean up messy OCR output (in English) from the clipboard by fixing broken paragraphs,
// removing clutter from page headers and footers, and correcting a few other common glitches.
// It's always a good idea to read over whatever OCRed content you're pasting, but this helps me often.

// Start with the clipboard content.
var text = TextExpander.pasteboardText;
/**
 * Escape regular-expression metacharacters so that `text` can be embedded in a
 * `new RegExp(...)` pattern and matched literally.
 *
 * Modified from https://stackoverflow.com/questions/3115150/how-to-escape-regular-expression-special-characters-using-javascript
 *
 * Note: `-`, `,` and `#` are escaped as well; those are harmless identity escapes
 * in a regex built WITHOUT the `u`/`v` flag, which is how this file uses the result.
 *
 * @param {string} text - Literal text to escape.
 * @returns {string} `text` with each of `- [ ] { } ( ) * + ? . , \ ^ $ | #` prefixed by a backslash.
 */
function escapeRegExp(text) {
  return text.replace(/[-[\]{}()*+?.,\\^$|#]/g, '\\$&');
}
/**
 * Replace every line that is a given running header/footer with a single space.
 *
 * A line is removed when it consists of optional leading spaces, `prefix`, the
 * literal `title`, `suffix`, and then one or more line-break characters.
 *
 * @param {string} text - Text to clean.
 * @param {string} title - Literal title text (escaped here via the file's `escapeRegExp`).
 * @param {string} prefix - Regex source that must precede the title (e.g. a page number).
 * @param {string} suffix - Regex source that must follow the title.
 * @returns {string} The text with all matching lines collapsed to ' '.
 */
function removeRunningTitle(text, title, prefix, suffix) {
  // Each space in the title may match any run of spaces (including none),
  // so occurrences with different spacing are still recognised.
  var flexibleTitle = escapeRegExp(title).replace(/ /g, ' *');
  // `m` is essential: the header can appear at the start of ANY line.
  var lineRegex = new RegExp('^ *' + prefix + flexibleTitle + suffix + '[\\r\\n]+', 'gm');
  return text.replace(lineRegex, ' ');
}

/**
 * Clean up OCR output: strip page numbers and repeated page headers, re-join
 * lines broken mid-paragraph, straighten curly quotes, and fix a few common glitches.
 *
 * @param {string} rawText - The OCR text; `null`/`undefined` is treated as ''.
 * @returns {string} The cleaned, trimmed text.
 */
function cleanOcrText(rawText) {
  var text = rawText == null ? '' : String(rawText);
  var header;

  // Remove page numbers (including simple, number-only lines, and more complex repetitions
  // of titles or book/journal names on every page or alternating pages).

  // Number-only lines. The `m` flag makes `^` match at every line start, not just
  // at the start of the whole text.
  text = text.replace(/^[ \t]*\d+[ \t]*[\r\n]+/gm, '');

  // Lines with a leading page number ("12 Title"): only treated as a header when the
  // same title shows up again later behind another leading number.
  header = text.match(/^\d+ ([A-Z].{0,70})(?=[\r\n]+[\s\S]+(^\d+ \1))/m);
  if (header) {
    text = removeRunningTitle(text, header[1], '\\d+ ', ' *');
  }

  // Lines with a trailing page number ("Title 12"). `m` is required so the `^`/`$`
  // inside the lookahead can match a later line.
  header = text.match(/^([A-Z].{0,70}) \d+(?=[\r\n]+[\s\S]+(^\1 \d+$))/m);
  if (header) {
    text = removeRunningTitle(text, header[1], '', ' +\\d+');
  }

  // Titles without page numbers: a whole line (starting with a capital) that is
  // repeated verbatim later in the text.
  header = text.match(/^([A-Z].{0,70})(?=[\r\n]+[\s\S]+(^\1$))/m);
  if (header) {
    text = removeRunningTitle(text, header[1], '', ' *');
  }

  // In the joins below, `(?:\r\n?|\n)` matches exactly ONE line break in any of the
  // LF / CR / CRLF conventions, so blank-line paragraph breaks are left alone.

  // Handle line breaks using hyphens: re-join hyphenated words, keep the hyphen in number ranges.
  text = text.replace(/([a-z])-(?:\r\n?|\n)([a-z])/g, '$1$2');
  text = text.replace(/(\d+)-(?:\r\n?|\n)(\d+)/g, '$1-$2');

  // Handle line breaks with other non-sentence punctuation.
  text = text.replace(/([A-Za-z0-9])([,;:])(?:\r\n?|\n)([a-z]|[A-Z]{2,})/g, '$1$2 $3');

  // Handle line breaks with no punctuation.
  text = text.replace(/([A-Za-z0-9])(?:\r\n?|\n)([A-Za-z])/g, '$1 $2');

  // Make quotation marks dumb and fix em-dashes.
  text = text
    .replace(/\u2018\b/g, "'") // Opening singles (only when directly followed by a word character)
    .replace(/\u2019/g, "'") // Closing singles
    .replace(/\u201c/g, '"') // Opening doubles
    .replace(/\u201d/g, '"') // Closing doubles
    .replace(/--/g, '\u2014'); // em-dashes

  // Fix breaks in two-letter words ("o f" -> "of"). The first letter must itself be a
  // stand-alone token (start of text or preceded by whitespace); otherwise ordinary text
  // such as "that i.e." would be mangled. Abbreviations like "p." / "v." are skipped.
  text = text.replace(
    /(^|\s)([a-z]) (?![bvpw]\.)([b-zàáâäæãåāçććèéêëēėęîïíīįìłñńôöòóœøōõßśšûüùúūÿžźż])(?!'[a-z])(?=[\s.'])/g,
    '$1$2$3'
  );

  // Fix "w." that should be "vv."
  text = text.replace(/\bw\. (?=\d+)/g, 'vv. ');

  return text.trim();
}

// Return the result. The guard lets this file be loaded outside TextExpander
// (e.g. for testing) without throwing on the missing host object.
if (typeof TextExpander !== 'undefined') {
  TextExpander.appendOutput(cleanOcrText(text));
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment