Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save edcottrell/ba62d6a64a65f405806a34b40d49dd98 to your computer and use it in GitHub Desktop.
Save edcottrell/ba62d6a64a65f405806a34b40d49dd98 to your computer and use it in GitHub Desktop.
TextExpander Snippet for Cleaning OCR Output
// Attempt to clean up messy OCR output (in English) from the clipboard by fixing broken paragraphs,
// removing clutter from page headers and footers, and correcting a few other common glitches.
// It's always a good idea to read over whatever OCRed content you're pasting, but this helps me often.
// Start with the clipboard content: the text to clean is read from the
// TextExpander host object's pasteboardText property. The cleanup steps below
// repeatedly reassign this same `text` variable and finally output it.
var text = TextExpander.pasteboardText;
// Helper: backslash-escape every character of `text` that is in the special-character
// set below, so the result can be embedded literally in a pattern passed to `new RegExp`.
// Modified from https://stackoverflow.com/questions/3115150/how-to-escape-regular-expression-special-characters-using-javascript
function escapeRegExp(text) {
    // Characters that are escaped: - [ ] { } ( ) * + ? . , \ ^ $ | #
    var specialChars = /[-[\]{}()*+?.,\\^$|#]/g;
    return text.replace(specialChars, function (ch) {
        return '\\' + ch;
    });
}
// Remove page numbers (including simple, number-only lines, and more complex repetitions of
// titles or book/journal names on every page or alternating pages).
// Every pattern that uses ^ or $ to mean "start/end of a line" carries the "m" flag; without
// it those anchors only match at the very start/end of the whole text.
// Working variables are declared here so they do not become implicit globals.
var pattern;
var replacementRegex;

// Number-only lines (optionally padded with spaces/tabs), together with the line break(s) after them.
text = text.replace(/^[ \t]*\d+[ \t]*[\r\n]+/gm, '');

// Lines with a leading page number: find a line of the form "<digits> <Title>" (title starts with
// a capital letter, at most 71 characters) whose title appears again later after another
// "<digits> " line prefix.
pattern = text.match(/^\d+ ([A-Z].{0,70})(?=[\r\n]+[\s\S]+(^\d+ \1))/m);
if (pattern) {
    pattern = pattern[1];
    // Spaces in the title become " *" so occurrences with inconsistent spacing still match.
    // Each matching line (and its trailing line breaks) is replaced by a single space.
    replacementRegex = new RegExp('^ *\\d+ ' + escapeRegExp(pattern).replace(/ /g, ' *') + ' *[\r\n]+', 'gm');
    text = text.replace(replacementRegex, ' ');
}

// Lines with a trailing page number: "<Title> <digits>" that occurs again later as a whole line.
pattern = text.match(/^([A-Z].{0,70}) \d+(?=[\r\n]+[\s\S]+(^\1 \d+$))/m);
if (pattern) {
    pattern = pattern[1];
    replacementRegex = new RegExp('^ *' + escapeRegExp(pattern).replace(/ /g, ' *') + ' +\\d+[\r\n]+', 'gm');
    text = text.replace(replacementRegex, ' ');
}

// Titles without page numbers: a capitalized line that is repeated verbatim as a whole line later on.
pattern = text.match(/^([A-Z].{0,70})(?=[\r\n]+[\s\S]+(^\1$))/m);
if (pattern) {
    pattern = pattern[1];
    replacementRegex = new RegExp('^ *' + escapeRegExp(pattern).replace(/ /g, ' *') + ' *[\r\n]+', 'gm');
    text = text.replace(replacementRegex, ' ');
}

// In the joins below a line break is "\r\n", "\r" or "\n", so CRLF text is handled as well as
// single-character line endings.

// Handle line breaks using hyphens: re-join a lowercase word split across lines (dropping the
// hyphen) and re-join number ranges (keeping the hyphen).
text = text.replace(/([a-z])-(?:\r\n|[\r\n])([a-z])/g, "$1$2");
text = text.replace(/(\d+)-(?:\r\n|[\r\n])(\d+)/g, "$1-$2");

// Handle line breaks with other non-sentence punctuation (, ; :) when the next line starts with
// a lowercase letter or a run of two or more capitals.
text = text.replace(/([A-Za-z0-9])([,;:])(?:\r\n|[\r\n])([a-z]|[A-Z]{2,})/g, "$1$2 $3");

// Handle line breaks with no punctuation: a letter/digit directly followed by a line break and a letter.
text = text.replace(/([A-Za-z0-9])(?:\r\n|[\r\n])([A-Za-z])/g, "$1 $2");

// Make quotation marks dumb and fix em-dashes
text = text.replace(/\u2018\b/g, "'") // Opening singles
    .replace(/\u2019/g, "'") // Closing singles
    .replace(/\u201c/g, '"') // Opening doubles
    .replace(/\u201d/g, '"') // Closing doubles
    .replace(/--/g, "\u2014"); // em-dashes

// Fix breaks in two-letter words ("o f" -> "of").
// The first letter must itself be a stand-alone token: at the start of the text or right after
// whitespace, a double quote or an opening bracket. Without that anchor the final letter of the
// preceding word is pulled in instead ("out o f" would become "ou to f").
// The second letter must not be "a", must not be one of the abbreviations b. v. p. w., must not
// be followed by an apostrophe plus a letter, and must be followed by whitespace, "." or "'".
text = text.replace(/(^|[\s"(\[])([a-z]) (?![bvpw]\.)([b-zàáâäæãåāçććèéêëēėęîïíīįìłñńôöòóœøōõßśšûüùúūÿžźż])(?!'[a-z])(?=[\s.'])/g, "$1$2$3");

// Fix "w." that should be "vv." (only when followed by a space and a number)
text = text.replace(/\bw\. (?=\d+)/g, "vv. ");

// Return the result
TextExpander.appendOutput(text.trim());
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment