joewiz/check-text-for-ocr-typo-patterns.xq

## check-text-for-ocr-typo-patterns.xq
xquery version "3.1";

(:~
 : Find possible OCR errors in a text by checking for patterns that an OCR
 : process is known to misread, e.g., "day" misread as "clay", or "France"
 : misread as "Prance." If the OCR engine just misread some instances of these
 : words but got other instances correct, then this query will highlight
 : candidates for correction.
 :
 : The query lets you configure a source text and define pattern sets to be used.
 :
 : Pattern sets consist of four parameters:
 :
 : 1. "sic": the OCR error
 : 2. "corr": the correct version
 : 3. "anchored": whether the error appears at the start of a word, the end of
 :    the word, or anywhere in the word
 : 4. "case-sensitive": whether the pattern should be treated as case-sensitive
 :    or case-insensitive
 :
 : In the case of the day/clay example above, the parameters would be:
 :
 : 1. "sic": "cl"
 : 2. "corr": "d"
 : 3. "anchored": "start"
 : 4. "case-sensitive": true
 :
 : If we apply this pattern to the following text:
 :
 : "clay clown day down"
 :
 : ... the expected result would be:
 :
 : [
 :    {
 :        "sic": "clay"
 :        "corr": "day"
 :    },
 :    {
 :        "sic": "clown",
 :        "corr": "down"
 :    }
 : ]
 :
 :)

declare namespace array="http://www.w3.org/2005/xpath-functions/array";
declare namespace map="http://www.w3.org/2005/xpath-functions/map";
declare namespace output="http://www.w3.org/2010/xslt-xquery-serialization";

declare namespace tei="http://www.tei-c.org/ns/1.0";

declare option output:method "adaptive";

declare variable $local:pattern-sets := (
    (: States > Slates :)
    map {
        "sic-literal": "Sl",
        "corr-literal": "St",
        "anchored": "start",
        "case-sensitive": true()
    },
    (: Mexiro > Mexico :)
    map {
        "sic-literal": "ro",
        "corr-literal": "co",
        "anchored": "end",
        "case-sensitive": true()
    },
    (: Prance > France :)
    map {
        "sic-literal": "Pr",
        "corr-literal": "Fr",
        "anchored": "start",
        "case-sensitive": true()
    },
    (: clay > day :)
    map {
        "sic-literal": "cl",
        "corr-literal": "d",
        "anchored": "start",
            "case-sensitive": true()
    },
    (: elate > date :)
    map {
        "sic-literal": "el",
        "corr-literal": "d",
        "anchored": "start",
        "case-sensitive": true()
    }
);

(:~ Convert a string literal into a regular expression for the position :)
declare function local:literal-to-regex($literal as xs:string, $position as xs:string) {
    switch ($position)
        case "start" return "^(" || $literal || ")(.+)$"
        case "end" return "^(.+)(" || $literal || ")$"
        default (: anywhere :) return "^(.*)(" || $literal || ")(.*)$"
};

(:~ Find matches for a regular expression in a sequence of words :)
declare function local:find-matches($words as xs:string*, $regex as xs:string, $flags as xs:string) as map(*)* {
    for $word in $words
    let $analysis := analyze-string($word, $regex, $flags)[fn:match]
    return
        if (exists($analysis)) then
            map {
                "word": $word,
                "match": $analysis/fn:match
            }
        else
            ()
};

(:~ Check a sequence of words for sic/corr pairs that match known OCR error patterns :)
declare function local:check-words($words as xs:string*, $pattern-set as map(*)) {
    (: Transform the literals into regular expressions :)
    let $sic-regex := local:literal-to-regex($pattern-set?sic-literal, $pattern-set?anchored)
    let $corr-regex := local:literal-to-regex($pattern-set?corr-literal, $pattern-set?anchored)
    let $regex-flags := if ($pattern-set?case-sensitive) then "" else "i"
    (: Find any cases matching the known OCR errors :)
    let $sics := local:find-matches($words, $sic-regex, $regex-flags)
    return
        if (exists($sics)) then
            (: Find any cases matching the known correct versions :)
            let $corrs := local:find-matches($words, $corr-regex, $regex-flags)
            return
                if (exists($corrs)) then
                    (: Compare every sic to every corr :)
                    for $sic in $sics, $corr in $corrs
                    let $sic-groups := $sic?match/fn:group
                    let $corr-groups := $corr?match/fn:group
                    return
                        (: Only compare if the OCR error occurs in the same place in the word :)
                        (: This is currently naive - stripping out all of the literals :)
                        if (count($sic-groups) eq count($corr-groups)) then
                            let $sic-segs := $sic-groups[. ne $pattern-set?sic-literal]/string()
                            let $corr-segs := $corr-groups[. ne $pattern-set?corr-literal]/string()
                            (: Do the non-error portions of the two words match? :)
                            let $segs-align :=
                                for-each-pair(
                                    $sic-segs,
                                    $corr-segs,
                                    function($sic-seg, $corr-seg) {
                                        $sic-seg eq $corr-seg
                                    }
                                )
                            return
                                (: The first two conditions here are the result of the naive approach described above :)
                                if (exists($sic-segs) and exists($corr-segs) and (every $seg-align in $segs-align satisfies $seg-align)) then
                                    map {
                                        "sic": $sic?word,
                                        "corr": $corr?word
                                    }
                                else
                                    ()
                        else
                            ()
                else
                    ()
        else
            ()
};

(: Define the text here or reference an external document :)
let $text :=
    "United Slates? of America United States: of America slates Mexico!
    Mexiro. France) Prance day clay; down clown date elate*"
    (:
    (: Note: in BaseX, enter `SET XINCLUDE false` in the command window to ignore the XInclude in the FRUS vol :)
    doc("https://github.com/HistoryAtState/frus/raw/eaaf9957b4e1ec2b0039db6d2ef8bc54864bdd1c/volumes/frus1861.xml")//tei:text//text() => string-join(" ")
    :)
(: Split the text into words :)
let $words :=
    $text
    => tokenize()
    (: strip off trailing punctuation :)
    => for-each( function($item) { replace($item, "\p{P}+$", "") } )
    => distinct-values()
(: Attempt to find matches :)
let $attempts :=
    for $pattern-set in $local:pattern-sets
    return
        map {
            (: optionally return the original pattern set :)
            (: "pattern-set": $pattern-set, :)
            "results": array { local:check-words($words, $pattern-set) }
        }
(: Filter out attempts that returned no results :)
(: Note: on eXist 5.2, comment out the predicate; see https://github.com/eXist-db/exist/issues/3240 :)
let $candidates := $attempts[array:size(?results) gt 0]
return
    map {
        "distinct-word-count": count($words),
        "pattern-match-count": count($candidates?results?*),
        "candidates": array {
            (: Display just the result pairs, sorted alphabetically :)
            $candidates?results?*
            => sort((), function($match) { ($match?sic, $match?corr) ! lower-case(.) })
        }
    }

## check-text-for-ocr-typo-patterns_results-1.json
{
    "distinct-word-count": 16,
    "pattern-match-count": 6,
    "candidates": [
        {
            "corr": "day",
            "sic": "clay"
        },
        {
            "corr": "down",
            "sic": "clown"
        },
        {
            "corr": "date",
            "sic": "elate"
        },
        {
            "corr": "Mexico",
            "sic": "Mexiro"
        },
        {
            "corr": "France",
            "sic": "Prance"
        },
        {
            "corr": "States",
            "sic": "Slates"
        }
    ]
}

## check-text-for-ocr-typo-patterns_results-2.json
{
    "distinct-word-count": 10357,
    "pattern-match-count": 5,
    "candidates": [
        {
            "corr": "day",
            "sic": "clay"
        },
        {
            "corr": "dear",
            "sic": "clear"
        },
        {
            "corr": "date",
            "sic": "elate"
        },
        {
            "corr": "France",
            "sic": "Prance"
        },
        {
            "corr": "States",
            "sic": "Slates"
        }
    ]
}
	xquery version "3.1";

	(:~
	: Find possible OCR errors in a text by checking for patterns that an OCR
	: process is known to misread, e.g., "day" misread as "clay", or "France"
	: misread as "Prance." If the OCR engine just misread some instances of these
	: words but got other instances correct, then this query will highlight
	: candidates for correction.
	:
	: The query lets you configure a source text and define pattern sets to be used.
	:
	: Pattern sets consist of four parameters:
	:
	: 1. "sic": the OCR error
	: 2. "corr": the correct version
	: 3. "anchored": whether the error appears at the start of a word, the end of
	: the word, or anywhere in the word
	: 4. "case-sensitive": whether the pattern should be treated as case-sensitive
	: or case-insensitive
	:
	: In the case of the day/clay example above, the parameters would be:
	:
	: 1. "sic": "cl"
	: 2. "corr": "d"
	: 3. "anchored": "start"
	: 4. "case-sensitive": true
	:
	: If we apply this pattern to the following text:
	:
	: "clay clown day down"
	:
	: ... the expected result would be:
	:
	: [
	: {
	: "sic": "clay"
	: "corr": "day"
	: },
	: {
	: "sic": "clown",
	: "corr": "down"
	: }
	: ]
	:
	:)

	declare namespace array="http://www.w3.org/2005/xpath-functions/array";
	declare namespace map="http://www.w3.org/2005/xpath-functions/map";
	declare namespace output="http://www.w3.org/2010/xslt-xquery-serialization";

	declare namespace tei="http://www.tei-c.org/ns/1.0";

	declare option output:method "adaptive";

	declare variable $local:pattern-sets := (
	(: States > Slates :)
	map {
	"sic-literal": "Sl",
	"corr-literal": "St",
	"anchored": "start",
	"case-sensitive": true()
	},
	(: Mexiro > Mexico :)
	map {
	"sic-literal": "ro",
	"corr-literal": "co",
	"anchored": "end",
	"case-sensitive": true()
	},
	(: Prance > France :)
	map {
	"sic-literal": "Pr",
	"corr-literal": "Fr",
	"anchored": "start",
	"case-sensitive": true()
	},
	(: clay > day :)
	map {
	"sic-literal": "cl",
	"corr-literal": "d",
	"anchored": "start",
	"case-sensitive": true()
	},
	(: elate > date :)
	map {
	"sic-literal": "el",
	"corr-literal": "d",
	"anchored": "start",
	"case-sensitive": true()
	}
	);

	(:~ Convert a string literal into a regular expression for the position :)
	declare function local:literal-to-regex($literal as xs:string, $position as xs:string) {
	switch ($position)
	case "start" return "^(" \|\| $literal \|\| ")(.+)$"
	case "end" return "^(.+)(" \|\| $literal \|\| ")$"
	default (: anywhere :) return "^(.)(" \|\| $literal \|\| ")(.)$"
	};

	(:~ Find matches for a regular expression in a sequence of words :)
	declare function local:find-matches($words as xs:string, $regex as xs:string, $flags as xs:string) as map()* {
	for $word in $words
	let $analysis := analyze-string($word, $regex, $flags)[fn:match]
	return
	if (exists($analysis)) then
	map {
	"word": $word,
	"match": $analysis/fn:match
	}
	else
	()
	};

	(:~ Check a sequence of words for sic/corr pairs that match known OCR error patterns :)
	declare function local:check-words($words as xs:string, $pattern-set as map()) {
	(: Transform the literals into regular expressions :)
	let $sic-regex := local:literal-to-regex($pattern-set?sic-literal, $pattern-set?anchored)
	let $corr-regex := local:literal-to-regex($pattern-set?corr-literal, $pattern-set?anchored)
	let $regex-flags := if ($pattern-set?case-sensitive) then "" else "i"
	(: Find any cases matching the known OCR errors :)
	let $sics := local:find-matches($words, $sic-regex, $regex-flags)
	return
	if (exists($sics)) then
	(: Find any cases matching the known correct versions :)
	let $corrs := local:find-matches($words, $corr-regex, $regex-flags)
	return
	if (exists($corrs)) then
	(: Compare every sic to every corr :)
	for $sic in $sics, $corr in $corrs
	let $sic-groups := $sic?match/fn:group
	let $corr-groups := $corr?match/fn:group
	return
	(: Only compare if the OCR error occurs in the same place in the word :)
	(: This is currently naive - stripping out all of the literals :)
	if (count($sic-groups) eq count($corr-groups)) then
	let $sic-segs := $sic-groups[. ne $pattern-set?sic-literal]/string()
	let $corr-segs := $corr-groups[. ne $pattern-set?corr-literal]/string()
	(: Do the non-error portions of the two words match? :)
	let $segs-align :=
	for-each-pair(
	$sic-segs,
	$corr-segs,
	function($sic-seg, $corr-seg) {
	$sic-seg eq $corr-seg
	}
	)
	return
	(: The first two conditions here are the result of the naive approach described above :)
	if (exists($sic-segs) and exists($corr-segs) and (every $seg-align in $segs-align satisfies $seg-align)) then
	map {
	"sic": $sic?word,
	"corr": $corr?word
	}
	else
	()
	else
	()
	else
	()
	else
	()
	};

	(: Define the text here or reference an external document :)
	let $text :=
	"United Slates? of America United States: of America slates Mexico!
	Mexiro. France) Prance day clay; down clown date elate*"
	(:
	(: Note: in BaseX, enter `SET XINCLUDE false` in the command window to ignore the XInclude in the FRUS vol :)
	doc("https://github.com/HistoryAtState/frus/raw/eaaf9957b4e1ec2b0039db6d2ef8bc54864bdd1c/volumes/frus1861.xml")//tei:text//text() => string-join(" ")
	:)
	(: Split the text into words :)
	let $words :=
	$text
	=> tokenize()
	(: strip off trailing punctuation :)
	=> for-each( function($item) { replace($item, "\p{P}+$", "") } )
	=> distinct-values()
	(: Attempt to find matches :)
	let $attempts :=
	for $pattern-set in $local:pattern-sets
	return
	map {
	(: optionally return the original pattern set :)
	(: "pattern-set": $pattern-set, :)
	"results": array { local:check-words($words, $pattern-set) }
	}
	(: Filter out attempts that returned no results :)
	(: Note: on eXist 5.2, comment out the predicate; see https://github.com/eXist-db/exist/issues/3240 :)
	let $candidates := $attempts[array:size(?results) gt 0]
	return
	map {
	"distinct-word-count": count($words),
	"pattern-match-count": count($candidates?results?*),
	"candidates": array {
	(: Display just the result pairs, sorted alphabetically :)
	$candidates?results?*
	=> sort((), function($match) { ($match?sic, $match?corr) ! lower-case(.) })
	}
	}
	{
	"distinct-word-count": 16,
	"pattern-match-count": 6,
	"candidates": [
	{
	"corr": "day",
	"sic": "clay"
	},
	{
	"corr": "down",
	"sic": "clown"
	},
	{
	"corr": "date",
	"sic": "elate"
	},
	{
	"corr": "Mexico",
	"sic": "Mexiro"
	},
	{
	"corr": "France",
	"sic": "Prance"
	},
	{
	"corr": "States",
	"sic": "Slates"
	}
	]
	}
	{
	"distinct-word-count": 10357,
	"pattern-match-count": 5,
	"candidates": [
	{
	"corr": "day",
	"sic": "clay"
	},
	{
	"corr": "dear",
	"sic": "clear"
	},
	{
	"corr": "date",
	"sic": "elate"
	},
	{
	"corr": "France",
	"sic": "Prance"
	},
	{
	"corr": "States",
	"sic": "Slates"
	}
	]
	}