Created
November 21, 2017 18:38
-
-
Save joewiz/d46dc4a490590a9f1e6fe80b12b12d31 to your computer and use it in GitHub Desktop.
Enrich dates in mixed content, with XQuery
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "3.1"; | |
(: Turning "December 7, 1941" into <date>December 7, 1941</date> isn't too hard with XPath 3.0's
fn:analyze-string() function, but if the date string occurs in mixed content, such as:
<p>Pearl Harbor was attacked on <em>December</em> 7, 1941.</p>
and you want to preserve the existing element structure to return:
<p>Pearl Harbor was attacked on <date><em>December</em> 7, 1941</date>.</p>
it's quite a bit more challenging.
This query uses string processing to align the results of fn:analyze-string() with the input's
original node structure.
Caveats: The local:hack() function may result in non-well-formed XML. This isn't guaranteed to
work with all source XML. I think it should work in cases where the date components are wrapped in
"inline" elements, but I haven't tested this extensively.
The regex and many functions used here are drawn from @WaxCylinderRevival's date-matching query:
https://gist.github.com/WaxCylinderRevival/b61b9843f118909bf6cf41c922632559
:)
declare boundary-space preserve; | |
(:~
 : Maps a month name (English, French or Spanish spellings listed in the regex
 : below) to a two-digit month number string.
 :
 : The input is lower-cased before matching, so only lower-case spellings need to
 : be listed in the switch. The regex is unanchored, so the month name may be
 : embedded in a longer string; when several month names occur, the first one
 : wins (a multi-item switch operand would otherwise raise a type error).
 :
 : @param $m the text containing a month name
 : @return "01" .. "12", or the string "error" when no known month name is found
 :)
declare function local:month-to-mm($m) {
    let $first-match :=
        analyze-string(lower-case($m),'(january|janvier|enero|february|février|fevrier|febrero|march|mart|marzo|april|avril|abril|may|mai|mayo|june|juin|junio|july|juillet|julio|august|août|aout|agosto|september|septembre|septiembre|setiembre|october|octobre|octubre|november|novembre|noviembre|december|décembre|decembre|diciembre)',"i")/fn:match[1]
    return
        switch ($first-match)
            case "january"
            case "janvier"
            case "enero" return "01"
            case "february"
            case "février"
            case "fevrier"
            case "febrero" return "02"
            case "march"
            case "mart"
            case "marzo" return "03"
            case "april"
            case "avril"
            case "abril" return "04"
            case "may"
            case "mai"
            case "mayo" return "05"
            case "june"
            case "juin"
            case "junio" return "06"
            case "july"
            case "juillet"
            case "julio" return "07"
            case "august"
            case "août"
            case "aout"
            case "agosto" return "08"
            case "september"
            case "septembre"
            case "septiembre"
            case "setiembre" return "09"
            case "october"
            case "octobre"
            case "octubre" return "10"
            case "november"
            case "novembre"
            case "noviembre" return "11"
            case "december"
            case "décembre"
            case "decembre"
            case "diciembre" return "12"
            (: no recognised month name in the input :)
            default return "error"
};
(:~
 : Walks the output of fn:analyze-string() and rewrites it: each fn:match
 : element is handed to the supplied callback, each fn:non-match element is
 : reduced to its string value, any other element is descended into, and
 : everything else is passed through unchanged.
 :
 : @param $nodes  the items to process (typically an fn:analyze-string-result)
 : @param $enrich callback invoked with each fn:match element
 : @return the flattened sequence of strings and callback results
 :)
declare function local:enrich-matches($nodes, $enrich as function(*)) {
    $nodes ! (
        typeswitch (.)
            case element(fn:match) return $enrich(.)
            case element(fn:non-match) return string(.)
            (: wrapper elements such as fn:analyze-string-result: recurse into children :)
            case element() return local:enrich-matches(./node(), $enrich)
            default return .
    )
};
(:~
 : Applies the first pattern that matches anywhere in the text.
 :
 : Patterns are tried in order. As soon as one pattern's regex produces at least
 : one match, the text is rewritten with that pattern's "enrich-match" callback
 : and the remaining patterns are not tried. When no pattern matches (or the
 : pattern list is empty) the text is returned unchanged.
 :
 : @param $text     the text to analyze
 : @param $patterns sequence of maps with "regex" and "enrich-match" entries
 :)
declare function local:analyze-text($text, $patterns) {
    if (empty($patterns)) then
        $text
    else
        let $current := $patterns[1]
        let $result := analyze-string($text, $current?regex)
        return
            if (exists($result/fn:match)) then
                local:enrich-matches($result, $current?enrich-match)
            else
                (: this pattern found nothing; fall through to the next one :)
                local:analyze-text($text, subsequence($patterns, 2))
};
(:~
 : Recursively copies a node tree, running every text node through
 : local:analyze-text() with the given patterns. Elements are rebuilt with the
 : same name and attributes; other items are returned as they are.
 :
 : @param $nodes         the items to process
 : @param $text-patterns pattern maps forwarded to local:analyze-text()
 :)
declare function local:analyze($nodes, $text-patterns) {
    for $item in $nodes
    return
        if ($item instance of text()) then
            local:analyze-text($item, $text-patterns)
        else if ($item instance of element()) then
            element { node-name($item) } {
                $item/@*,
                local:analyze($item/node(), $text-patterns)
            }
        else
            $item
};
(: Named regex fragments; local:compile-pattern() looks fragments up by key
   and concatenates them into a full pattern. :)
declare variable $local:regex-primitives :=
    map {
        (: separators :)
        "space": " ",
        "comma-space": ", ",
        (: date components, each wrapped in its own capture group :)
        "year": "(\d{4})",
        "day": "(\d{1,2})",
        "months-en": "(January|February|March|April|May|June|July|August|September|October|November|December)"
    };
(:~
 : Builds a regex string by looking up each member of $components in
 : $local:regex-primitives and concatenating the results in order.
 :
 : @param $components an array of keys into $local:regex-primitives
 : @return the concatenated regex string
 :)
declare function local:compile-pattern($components) {
    string-join(
        for $key in $components?*
        return $local:regex-primitives($key)
    )
};
(:~
 : Finds the text node that contains a given character offset.
 :
 : For each candidate node, the start position is the total length of all text
 : nodes preceding it in its tree and the end position is start + the node's own
 : length. The first node whose [start, end] range contains $offset is returned.
 :
 : The last candidate additionally accepts an offset one past its end, so that a
 : position immediately after the final character of the text can still be
 : addressed (the returned position is then string-length + 1).
 :
 : @param $text-nodes candidate text nodes, in document order
 : @param $offset     character offset to locate
 : @return a map with "node" (the text node) and "position" (offset relative to
 :         the start of that node), or the empty sequence when no candidate
 :         contains the offset
 :)
declare function local:get-text-node-at-offset($text-nodes, $offset) {
    if (empty($text-nodes)) then
        ()
    else
        let $node := head($text-nodes)
        let $remaining := tail($text-nodes)
        let $start-pos := string-length(string-join($node/preceding::text()))
        let $end-pos := $start-pos + string-length($node)
        (: only the final node may be addressed one character past its end :)
        let $upper-bound := if (empty($remaining)) then $end-pos + 1 else $end-pos
        return
            if ($start-pos le $offset and $offset le $upper-bound) then
                map { "node": $node, "position": $offset - $start-pos }
            else
                local:get-text-node-at-offset($remaining, $offset)
};
(:~
 : Copies a node tree, inserting empty "milestone" elements at the text
 : positions described by $milestones.
 :
 : Each milestone is a map with:
 :   "node"     - the text node the milestone falls in
 :   "position" - character position within that node; the milestone is placed
 :                immediately before the character at this position
 :   "name"     - the element name to use for the milestone
 :
 : Placement rules:
 :   - Element: if a milestone sits at position 1 of the element's first text
 :     node and the element has a preceding sibling, the milestone is emitted
 :     just before the element (i.e. hoisted out of it).
 :   - Text node: the text is split around its milestones. A milestone at
 :     position 1 of a text node without a preceding sibling is skipped here,
 :     because the element branch is responsible for hoisting it.
 :
 : Several milestones may fall in the same text node; they are emitted in
 : position order and no characters of the text are dropped.
 :
 : @param $nodes      the items to copy
 : @param $milestones the milestone maps described above
 :)
declare function local:insert-milestones($nodes, $milestones) {
    for $node in $nodes
    return
        typeswitch ($node)
            case element() return
                let $milestone := $milestones[?node is ($node//text())[1] and ?position eq 1]
                return
                    (
                        if (exists($milestone) and exists($node/preceding-sibling::node())) then
                            element { $milestone?name } { () }
                        else
                            ()
                        ,
                        element { node-name($node) } { $node/@*, local:insert-milestones($node/node(), $milestones) }
                    )
            case text() return
                let $has-preceding := exists($node/preceding-sibling::node())
                let $hits :=
                    for $m in $milestones[?node is $node]
                    (: position 1 with no preceding sibling is left to the element branch :)
                    where $m?position gt 1 or $has-preceding
                    stable order by $m?position
                    return $m
                return
                    if (empty($hits)) then
                        $node
                    else
                        (
                            for $hit at $i in $hits
                            (: each slice starts where the previous milestone was placed :)
                            let $from := if ($i eq 1) then 1 else $hits[$i - 1]?position
                            return
                                (
                                    text { substring($node, $from, $hit?position - $from) },
                                    element { $hit?name } { () }
                                )
                            ,
                            (: remainder after the last milestone; empty when it sits past the end :)
                            text { substring($node, $hits[last()]?position) }
                        )
            default return $node
};
(:~
 : Inserts a pair of milestone elements into $node for every date in $dates.
 :
 : For each date, the start offset is the length of all text preceding the date
 : element in its own tree plus one, and the end offset is start + the date's
 : string length. Both offsets are resolved against the text nodes of $node and
 : marked with elements named "date-start-N" and "date-end-N", where N starts at
 : $pos and increases by one per date.
 :
 : @param $node  the tree to receive milestones
 : @param $dates date elements whose preceding text determines the offsets
 : @param $pos   number used for the first date's milestone names
 : @return the tree with all milestones inserted
 :)
declare function local:process-dates($node, $dates, $pos) {
    if (empty($dates)) then
        $node
    else
        let $current := head($dates)
        let $first-offset := string-length(string-join($current/preceding::text())) + 1
        let $past-offset := $first-offset + string-length($current)
        let $texts := $node//text()
        let $marks :=
            (
                map:put(local:get-text-node-at-offset($texts, $first-offset), "name", "date-start-" || $pos),
                map:put(local:get-text-node-at-offset($texts, $past-offset), "name", "date-end-" || $pos)
            )
        let $marked := local:insert-milestones($node, $marks)
        return
            local:process-dates($marked, tail($dates), $pos + 1)
};
(:~
 : Turns serialized milestone pairs into real date elements by string
 : replacement: "<date-start-N/>" becomes an opening date tag carrying the
 : corresponding date's @when value, and "<date-end-N/>" becomes "</date>".
 : N starts at $pos for the first date and increases by one per date.
 :
 : The result is a string and is not checked for well-formedness.
 :
 : @param $serialized the serialized XML containing milestone elements
 : @param $dates      date elements supplying the @when values, in order
 : @param $pos        milestone number belonging to the first date
 :)
declare function local:hack($serialized, $dates, $pos) {
    fold-left(
        1 to count($dates),
        $serialized,
        function($markup, $i) {
            let $n := $pos + $i - 1
            let $open-tag := '<date when="' || $dates[$i]/@when || '">'
            let $opened := replace($markup, "<date-start-" || $n || "/>", $open-tag)
            return
                replace($opened, "<date-end-" || $n || "/>", "</date>")
        }
    )
};
(: Date patterns to look for. Each entry supplies the compiled regex and a
   callback that turns a match into a <date> element with an ISO-style @when. :)
let $patterns :=
    (
        map {
            "name": "month-day-year",
            "regex": local:compile-pattern(["months-en", "space", "day", "comma-space", "year"]),
            "enrich-match": function($match) {
                (: capture groups: 1 = month name, 2 = day, 3 = year :)
                let $year := $match/fn:group[@nr='3']
                let $month := local:month-to-mm($match/fn:group[@nr='1'])
                let $day := format-number($match/fn:group[@nr='2'], '00')
                let $when := string-join(($year, $month, $day), "-")
                return <date when="{$when}">{$match/string()}</date>
            }
        }
    )
(: Sample input. Its whitespace is significant because of "boundary-space preserve". :)
let $node :=
<div>
<p>
<em>Pearl Harbor</em> was attacked on <strong><em>December</em></strong> <strong>7</strong>, 1941. My birthday is <birthday>September 29, <year>1976</year></birthday>.
</p>
</div>
(: 1. Find the dates in a flattened, text-only copy of the input. :)
let $simple := local:analyze(element { node-name($node) } { $node/string() }, $patterns)
let $dates := $simple//date
return
    (: 2. Mark date boundaries in the original tree with milestone elements,
       3. serialize and swap the milestones for real date tags,
       4. parse the resulting string back into XML. :)
    serialize(local:process-dates($node, $dates, 1))
    => local:hack($dates, 1)
    => parse-xml()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<div> | |
<p> | |
<em>Pearl Harbor</em> was attacked on <date when="1941-12-07"> | |
<strong> | |
<em>December</em> | |
</strong> <strong>7</strong>, 1941</date>. My birthday is <date when="1976-09-29"> | |
<birthday>September 29, <year>1976</year> | |
</birthday> | |
</date>. | |
</p> | |
</div> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment