edcottrell/textexpander-snippet-for-cleaning-ocr-output.js

## textexpander-snippet-for-cleaning-ocr-output.js
// Attempt to clean up messy OCR output (in English) from the clipboard by fixing broken paragraphs,
// removing clutter from page headers and footers, and correcting a few other common glitches.

// It's always a good idea to read over whatever OCRed content you're pasting, but this helps me often.

// Start with the clipboard content. Fall back to an empty string so the cleanup
// steps never call .replace() on a missing value.
// NOTE(review): assumes pasteboardText can be empty/undefined when the clipboard holds no text — confirm.
var text = TextExpander.pasteboardText || '';


// Helper that backslash-escapes regular-expression special characters (plus "," and "#") so the
// given text can be embedded in a RegExp source and matched literally.
// Modified from https://stackoverflow.com/questions/3115150/how-to-escape-regular-expression-special-characters-using-javascript
function escapeRegExp(text) {
  var specialCharacters = /[-[\]{}()*+?.,\\^$|#]/g;
  return text.replace(specialCharacters, function (character) {
    return '\\' + character;
  });
}


// Remove page numbers (including simple, number-only lines, and more complex repetitions of titles or book/journal names on every page or alternating pages)
// Scratch variables, declared so they do not leak as implicit globals
var pattern;
var replacementRegex;

// number-only lines ("m" makes ^ match at the start of every line, not only at the start of the text)
text = text.replace(/^[ \t]*\d+[ \t]*[\r\n]+/gm, '');

// lines with a leading page number: look for a "<number> <Title>" line whose title appears again
// later in the text behind another leading number
pattern = text.match(/^\d+ ([A-Z].{0,70})(?=[\r\n]+[\s\S]+(^\d+ \1))/m);
if (pattern) {
  pattern = pattern[1];
  // each space in the title becomes " *" so occurrences with different spacing still match
  replacementRegex = new RegExp('^ *\\d+ ' + escapeRegExp(pattern).replace(/ /g, ' *') + ' *[\\r\\n]+', 'gm');
  text = text.replace(replacementRegex, ' ');
}

// lines with a trailing page number ("<Title> <number>"); the "m" flag is required because the ^
// inside the lookahead has to match at the start of a later line
pattern = text.match(/^([A-Z].{0,70}) \d+(?=[\r\n]+[\s\S]+(^\1 \d+$))/m);
if (pattern) {
  pattern = pattern[1];
  replacementRegex = new RegExp('^ *' + escapeRegExp(pattern).replace(/ /g, ' *') + ' +\\d+[\\r\\n]+', 'gm');
  text = text.replace(replacementRegex, ' ');
}

// titles without page numbers: a line that appears again, verbatim, later in the text
pattern = text.match(/^([A-Z].{0,70})(?=[\r\n]+[\s\S]+(^\1$))/m);
if (pattern) {
  pattern = pattern[1];
  // "gm" so every occurrence of the title line is removed, not only one at the start of the text
  replacementRegex = new RegExp('^ *' + escapeRegExp(pattern).replace(/ /g, ' *') + ' *[\\r\\n]+', 'gm');
  text = text.replace(replacementRegex, ' ');
}


// Handle line breaks using hyphens
// A break is matched as "\r\n", "\n" or "\r" so Windows-style line endings are joined as a unit,
// and the character after the break is only checked with a lookahead so that it can also
// start the next match (e.g. on one-character lines).
text = text.replace(/([a-z])-(?:\r\n|[\r\n])(?=[a-z])/g, "$1");
// between digits the hyphen is kept (number ranges); only the break is dropped
text = text.replace(/(\d)-(?:\r\n|[\r\n])(?=\d)/g, "$1-");


// Handle line breaks with other non-sentence punctuation
// (joined only when the next line starts with a lowercase letter or a run of 2+ capitals)
text = text.replace(/([A-Za-z0-9][,;:])(?:\r\n|[\r\n])(?=[a-z]|[A-Z]{2,})/g, "$1 ");


// Handle line breaks with no punctuation
text = text.replace(/([A-Za-z0-9])(?:\r\n|[\r\n])(?=[A-Za-z])/g, "$1 ");


// Replace curly quotation marks with straight ones and turn double hyphens into em-dashes
text = text.replace(/\u2018\b/g, "'");   // left single quote, only when a word character follows
text = text.replace(/\u2019/g, "'");     // right single quote / apostrophe
text = text.replace(/\u201c/g, '"');     // left double quote
text = text.replace(/\u201d/g, '"');     // right double quote
text = text.replace(/--/g, "\u2014");    // "--" becomes an em-dash


// Fix breaks in two-letter words
// Matches a lowercase letter, a space, and then a single letter (b-z or one of the listed accented
// letters) that is followed by whitespace, "." or "'". The space is moved in front of the first letter.
// Skipped when the second letter is "b.", "v.", "p." or "w.", or when it is followed by an apostrophe and a letter.
// NOTE(review): presumably targets OCR output where the space landed one letter early
// (e.g. "thati s" -> "that is") — confirm.
text = text.replace(/([a-z]) (?![bvpw]\.)([b-zàáâäæãåāçććèéêëēėęîïíīįìłñńôöòóœøōõßśšûüùúūÿžźż])(?!'[a-z])(?=[\s.'])/g, function (whole, first, second) {
  return ' ' + first + second;
});


// Fix "w." that should be "vv." (only when a digit follows the space)
text = text.replace(/\bw\. (?=\d+)/g, function () {
  return "vv. ";
});


// Hand the cleaned-up text, without leading/trailing whitespace, back to TextExpander
TextExpander.appendOutput(text.trim());
	// Attempt to clean up messy OCR output (in English) from the clipboard by fixing broken paragraphs,
	// removing clutter from page headers and footers, and correcting a few other common glitches.

	// It's always a good idea to read over whatever OCRed content you're pasting, but this helps me often.

	// Start with the clipboard content. Fall back to an empty string so the cleanup
	// steps never call .replace() on a missing value.
	// NOTE(review): assumes pasteboardText can be empty/undefined when the clipboard holds no text — confirm.
	var text = TextExpander.pasteboardText || '';


	// Helper that backslash-escapes regular-expression special characters (plus "," and "#") so the
	// given text can be embedded in a RegExp source and matched literally.
	// Modified from https://stackoverflow.com/questions/3115150/how-to-escape-regular-expression-special-characters-using-javascript
	function escapeRegExp(text) {
		var specialCharacters = /[-[\]{}()*+?.,\\^$\|#]/g;
		return text.replace(specialCharacters, function (character) {
			return '\\' + character;
		});
	}


	// Remove page numbers (including simple, number-only lines, and more complex repetitions of titles or book/journal names on every page or alternating pages)
	// Scratch variables, declared so they do not leak as implicit globals
	var pattern;
	var replacementRegex;

	// number-only lines: optional surrounding spaces/tabs ("*"), and "m" so that ^ matches at the
	// start of every line, not only at the start of the text
	text = text.replace(/^[ \t]*\d+[ \t]*[\r\n]+/gm, '');

	// lines with a leading page number: look for a "<number> <Title>" line whose title appears again
	// later in the text behind another leading number
	pattern = text.match(/^\d+ ([A-Z].{0,70})(?=[\r\n]+[\s\S]+(^\d+ \1))/m);
	if (pattern) {
		pattern = pattern[1];
		// each space in the title becomes " *" so occurrences with different spacing still match
		replacementRegex = new RegExp('^ *\\d+ ' + escapeRegExp(pattern).replace(/ /g, ' *') + ' *[\\r\\n]+', 'gm');
		text = text.replace(replacementRegex, ' ');
	}

	// lines with a trailing page number ("<Title> <number>"); the "m" flag is required because the ^
	// inside the lookahead has to match at the start of a later line
	pattern = text.match(/^([A-Z].{0,70}) \d+(?=[\r\n]+[\s\S]+(^\1 \d+$))/m);
	if (pattern) {
		pattern = pattern[1];
		replacementRegex = new RegExp('^ *' + escapeRegExp(pattern).replace(/ /g, ' *') + ' +\\d+[\\r\\n]+', 'gm');
		text = text.replace(replacementRegex, ' ');
	}

	// titles without page numbers: a line that appears again, verbatim, later in the text
	pattern = text.match(/^([A-Z].{0,70})(?=[\r\n]+[\s\S]+(^\1$))/m);
	if (pattern) {
		pattern = pattern[1];
		// "gm" so every occurrence of the title line is removed, not only one at the start of the text
		replacementRegex = new RegExp('^ *' + escapeRegExp(pattern).replace(/ /g, ' *') + ' *[\\r\\n]+', 'gm');
		text = text.replace(replacementRegex, ' ');
	}


	// Handle line breaks using hyphens
	// A break is matched as "\r\n", "\n" or "\r" so Windows-style line endings are joined as a unit,
	// and the character after the break is only checked with a lookahead so that it can also
	// start the next match (e.g. on one-character lines).
	text = text.replace(/([a-z])-(?:\r\n|[\r\n])(?=[a-z])/g, "$1");
	// between digits the hyphen is kept (number ranges); only the break is dropped
	text = text.replace(/(\d)-(?:\r\n|[\r\n])(?=\d)/g, "$1-");


	// Handle line breaks with other non-sentence punctuation
	// (joined only when the next line starts with a lowercase letter OR a run of 2+ capitals;
	// the "|" must be an unescaped alternation, not a literal pipe character)
	text = text.replace(/([A-Za-z0-9][,;:])(?:\r\n|[\r\n])(?=[a-z]|[A-Z]{2,})/g, "$1 ");


	// Handle line breaks with no punctuation
	text = text.replace(/([A-Za-z0-9])(?:\r\n|[\r\n])(?=[A-Za-z])/g, "$1 ");


	// Replace curly quotation marks with straight ones and turn double hyphens into em-dashes
	text = text.replace(/\u2018\b/g, "'");   // left single quote, only when a word character follows
	text = text.replace(/\u2019/g, "'");     // right single quote / apostrophe
	text = text.replace(/\u201c/g, '"');     // left double quote
	text = text.replace(/\u201d/g, '"');     // right double quote
	text = text.replace(/--/g, "\u2014");    // "--" becomes an em-dash


	// Fix breaks in two-letter words
	// Matches a lowercase letter, a space, and then a single letter (b-z or one of the listed accented
	// letters) that is followed by whitespace, "." or "'". The space is moved in front of the first letter.
	// Skipped when the second letter is "b.", "v.", "p." or "w.", or when it is followed by an apostrophe and a letter.
	// NOTE(review): presumably targets OCR output where the space landed one letter early
	// (e.g. "thati s" -> "that is") — confirm.
	text = text.replace(/([a-z]) (?![bvpw]\.)([b-zàáâäæãåāçććèéêëēėęîïíīįìłñńôöòóœøōõßśšûüùúūÿžźż])(?!'[a-z])(?=[\s.'])/g, function (whole, first, second) {
		return ' ' + first + second;
	});


	// Fix "w." that should be "vv." (only when a digit follows the space)
	text = text.replace(/\bw\. (?=\d+)/g, function () {
		return "vv. ";
	});


	// Hand the cleaned-up text, without leading/trailing whitespace, back to TextExpander
	TextExpander.appendOutput(text.trim());