Skip to content

Instantly share code, notes, and snippets.

@joewiz
Created November 21, 2017 18:38
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joewiz/d46dc4a490590a9f1e6fe80b12b12d31 to your computer and use it in GitHub Desktop.
Save joewiz/d46dc4a490590a9f1e6fe80b12b12d31 to your computer and use it in GitHub Desktop.
Enrich dates in mixed content, with XQuery
xquery version "3.1";
(: Turning "December 7, 1941" into <date>December 7, 1941</date> isn't too hard, with XPath 3.0's
fn:analyze-string() function, but if the date string occurs in mixed text, such as:
<p>Pearl Harbor was attacked on <em>December</em> 7, 1941.</p>
and you want to preserve the existing element structure to return:
<p>Pearl Harbor was attacked on <date><em>December</em> 7, 1941</date>.</p>
it's quite a bit more challenging.
This query uses string processing to align the results of fn:analyze-string() with the input's
original node structure.
Caveats: The local:hack() function may result in non-well-formed XML. This isn't guaranteed to
work with all source XML. I think it should work in cases where the date components are wrapped in
"inline" elements, but haven't tested this extensively.
The regex and many functions used here are drawn from @WaxCylinderRevival's date-matching query:
https://gist.github.com/WaxCylinderRevival/b61b9843f118909bf6cf41c922632559
:)
declare boundary-space preserve;
(:~
 : Convert a month name to a two-digit month number.
 :
 : The input is lower-cased and scanned for the month names listed in the
 : pattern below (English, French and Spanish spellings, with and without
 : accents). Only the FIRST month name found is used, so input that happens to
 : contain several names (for example "May or June") no longer raises a type
 : error from handing a multi-item sequence to the switch expression.
 :
 : @param $m a string containing a month name, e.g. "December" or "diciembre"
 : @return "01" to "12", or the literal string "error" when no month name is found
 :)
declare function local:month-to-mm($m) {
    let $month-regex := '(january|janvier|enero|february|février|fevrier|febrero|march|mart|marzo|april|avril|abril|may|mai|mayo|june|juin|junio|july|juillet|julio|august|août|aout|agosto|september|septembre|septiembre|setiembre|october|octobre|octubre|november|novembre|noviembre|december|décembre|decembre|diciembre)'
    (: switch needs a single atomic operand, so keep only the first match :)
    let $month := (analyze-string(lower-case($m), $month-regex, "i")/fn:match)[1]
    return
        (: the operand is always lower case, so only lower-case labels are needed :)
        switch ($month)
            case "january" case "janvier" case "enero" return "01"
            case "february" case "février" case "fevrier" case "febrero" return "02"
            (: NOTE(review): "mart" may have been meant as French "mars"; confirm with the pattern's author :)
            case "march" case "mart" case "marzo" return "03"
            case "april" case "avril" case "abril" return "04"
            case "may" case "mai" case "mayo" return "05"
            case "june" case "juin" case "junio" return "06"
            case "july" case "juillet" case "julio" return "07"
            case "august" case "août" case "aout" case "agosto" return "08"
            case "september" case "septembre" case "septiembre" case "setiembre" return "09"
            case "october" case "octobre" case "octubre" return "10"
            case "november" case "novembre" case "noviembre" return "11"
            case "december" case "décembre" case "decembre" case "diciembre" return "12"
            (: no month name found: callers receive the sentinel string rather than a dynamic error :)
            default return "error"
};
(:~
 : Walk the output of fn:analyze-string() and rebuild it as mixed content.
 :
 : fn:non-match elements become their string value, fn:match elements are
 : handed to the $enrich callback, any other element is unwrapped and its
 : children processed recursively, and every other item is passed through.
 :
 : @param $nodes  the items to process, typically an fn:analyze-string-result element
 : @param $enrich a function called with each fn:match element
 : @return the resulting sequence, in the order of the input
 :)
declare function local:enrich-matches($nodes, $enrich as function(*)) {
    $nodes ! (
        typeswitch (.)
            case $unmatched as element(fn:non-match) return
                string($unmatched)
            case $hit as element(fn:match) return
                $enrich($hit)
            case $wrapper as element() return
                local:enrich-matches($wrapper/node(), $enrich)
            default $other return
                $other
    )
};
(:~
 : Apply the first pattern that matches anywhere in $text.
 :
 : Patterns are tried in order. As soon as one produces at least one match the
 : text is rewritten with that pattern's "enrich-match" function and the
 : remaining patterns are not consulted. When no pattern matches, $text is
 : returned unchanged.
 :
 : @param $text     the string (or text node) to analyze
 : @param $patterns a sequence of maps with "regex" and "enrich-match" entries
 : @return the enriched content, or $text itself when nothing matched
 :)
declare function local:analyze-text($text, $patterns) {
    if (empty($patterns)) then
        $text
    else
        let $current := $patterns[1]
        let $result := analyze-string($text, $current?regex)
        return
            if (exists($result/fn:match)) then
                local:enrich-matches($result, $current?enrich-match)
            else
                (: this pattern found nothing; fall through to the next one :)
                local:analyze-text($text, subsequence($patterns, 2))
};
(:~
 : Recursively copy a node tree, running every text node through
 : local:analyze-text().
 :
 : Elements are rebuilt with the same name and attributes; items that are
 : neither text nodes nor elements are returned as they are.
 :
 : @param $nodes         the nodes to process
 : @param $text-patterns the patterns passed on to local:analyze-text()
 : @return the rebuilt node sequence
 :)
declare function local:analyze($nodes, $text-patterns) {
    $nodes ! (
        typeswitch (.)
            case $t as text() return
                local:analyze-text($t, $text-patterns)
            case $e as element() return
                element { node-name($e) } {
                    $e/@*,
                    local:analyze($e/node(), $text-patterns)
                }
            default $other return
                $other
    )
};
(:~
 : Named regular-expression fragments that local:compile-pattern() joins
 : into a complete pattern. Each of the month, day and year fragments is
 : wrapped in its own capturing group.
 :)
declare variable $local:regex-primitives :=
    map:merge((
        map:entry("months-en", "(January|February|March|April|May|June|July|August|September|October|November|December)"),
        map:entry("day", "(\d{1,2})"),
        map:entry("year", "(\d{4})"),
        map:entry("space", " "),
        map:entry("comma-space", ", ")
    ))
;
(:~
 : Build a regular expression by concatenating named primitives.
 :
 : @param $components an array of keys into $local:regex-primitives, in the
 :                    order in which the fragments should appear
 : @return the fragments joined with no separator; a key that is not in the
 :         map contributes nothing
 :)
declare function local:compile-pattern($components) {
    string-join(
        for $key in $components?*
        return $local:regex-primitives($key)
    )
};
(:~
 : Find the text node that contains a given character offset.
 :
 : Offsets are measured over the concatenated string values of $text-nodes, in
 : the order supplied. They are computed with a running total instead of
 : re-joining all preceding text for every node, so the result does not depend
 : on text that lies outside the supplied sequence.
 :
 : @param $text-nodes the text nodes to search, in document order
 : @param $offset     the character offset to locate
 : @return a map with "node" (the text node) and "position" (the offset relative
 :         to the start of that node), or the empty sequence when $text-nodes is
 :         empty. When the offset lies beyond every node, the last node is
 :         returned with a position past its end, so callers always receive a map.
 :)
declare function local:get-text-node-at-offset($text-nodes, $offset) {
    local:locate-offset($text-nodes, $offset, 0)
};

(:~
 : Helper for local:get-text-node-at-offset(): $consumed is the number of
 : characters in the text nodes already skipped.
 :)
declare %private function local:locate-offset($text-nodes, $offset, $consumed) {
    if (empty($text-nodes)) then
        ()
    else
        let $node := head($text-nodes)
        let $rest := tail($text-nodes)
        let $end-pos := $consumed + string-length($node)
        return
            (: the last remaining node is the fallback, so the result is a map
               and never a bare text node :)
            if (empty($rest) or ($consumed le $offset and $end-pos ge $offset)) then
                map { "node": $node, "position": $offset - $consumed }
            else
                local:locate-offset($rest, $offset, $end-pos)
};
(:~
 : Copy a node tree, inserting an empty "milestone" element at each requested
 : text position.
 :
 : Each milestone is a map with "node" (a text node of the input tree),
 : "position" (1-based character position inside that node; the milestone is
 : placed before that character) and "name" (the element name to create).
 :
 : A milestone at position 1 of an element's first descendant text node is
 : placed before that element, provided the element has a preceding sibling,
 : so that the milestone ends up outside inline wrappers.
 :
 : Fixes relative to the earlier implementation:
 :   - the character at "position" is no longer lost when position equals the
 :     length of the text node;
 :   - a milestone at position greater than 1 is no longer dropped when its
 :     text node has no preceding sibling;
 :   - several milestones inside one text node are handled, in position order.
 :
 : NOTE(review): a position-1 milestone whose text node has no preceding
 : sibling, and none of whose enclosing elements has one either, is still not
 : emitted. Confirm whether that case needs handling.
 :
 : @param $nodes      the nodes to copy
 : @param $milestones the milestone maps described above
 : @return the copied nodes with milestone elements inserted
 :)
declare function local:insert-milestones($nodes, $milestones) {
    for $node in $nodes
    return
        typeswitch ($node)
            case element() return
                let $hoisted := $milestones[?node is ($node//text())[1] and ?position eq 1]
                return (
                    if (exists($node/preceding-sibling::node())) then
                        for $m in $hoisted
                        return element { $m?name } { () }
                    else
                        ()
                    ,
                    element { node-name($node) } {
                        $node/@*,
                        local:insert-milestones($node/node(), $milestones)
                    }
                )
            case text() return
                let $has-preceding-sibling := exists($node/preceding-sibling::node())
                let $here :=
                    (: position-1 milestones on a text node without a preceding
                       sibling are left to the element branch above :)
                    for $m in $milestones[?node is $node][?position gt 1 or $has-preceding-sibling]
                    order by $m?position
                    return $m
                return
                    if (exists($here)) then
                        local:split-text-at-milestones(string($node), $here, 1)
                    else
                        $node
            default return
                $node
};

(:~
 : Helper for local:insert-milestones(): emit $text from character $from
 : onwards, interleaved with the milestone elements, which must be sorted by
 : position. Empty string pieces are never emitted, so no two atomic values
 : end up adjacent in the result.
 :)
declare %private function local:split-text-at-milestones($text, $milestones, $from) {
    if (empty($milestones)) then
        if ($from le string-length($text)) then
            substring($text, $from)
        else
            ()
    else
        let $m := head($milestones)
        return (
            if ($m?position gt $from) then
                substring($text, $from, $m?position - $from)
            else
                ()
            ,
            element { $m?name } { () }
            ,
            local:split-text-at-milestones($text, tail($milestones), max(($from, $m?position)))
        )
};
(:~
 : Insert a pair of milestone elements into $node for every date in $dates.
 :
 : For each date, the start and end character offsets are derived from the text
 : preceding the date element in its own tree, then mapped onto the text nodes
 : of $node. The milestones are named "date-start-N" and "date-end-N", where N
 : starts at $pos and grows by one per date.
 :
 : @param $node  the tree that receives the milestones
 : @param $dates the date elements still to be processed
 : @param $pos   the number used in the milestone names for the first date
 : @return $node with all milestones inserted
 :)
declare function local:process-dates($node, $dates, $pos) {
    if (empty($dates)) then
        $node
    else
        let $current := $dates[1]
        (: 1-based offset of the first character of the date :)
        let $first-char := string-length(string-join($current/preceding::text())) + 1
        (: offset of the character that follows the date :)
        let $after-last-char := $first-char + string-length($current)
        let $text-nodes := $node//text()
        let $markers := (
            local:get-text-node-at-offset($text-nodes, $first-char)
                => map:put("name", "date-start-" || $pos),
            local:get-text-node-at-offset($text-nodes, $after-last-char)
                => map:put("name", "date-end-" || $pos)
        )
        return
            local:process-dates(
                local:insert-milestones($node, $markers),
                subsequence($dates, 2),
                $pos + 1
            )
};
(:~
 : Turn serialized milestone elements into real date start and end tags by
 : plain string replacement.
 :
 : For the N-th date, "<date-start-N/>" becomes an opening date tag carrying the
 : date's @when value and "<date-end-N/>" becomes the closing tag. Because this
 : edits markup as text, the result is not guaranteed to be well-formed when a
 : start and end milestone do not share the same parent element.
 :
 : Both replacements use the "q" flag so that neither the milestone tag nor the
 : replacement text is interpreted as regular-expression syntax (a "$" or "\"
 : in @when would otherwise be treated as a group reference or escape). The
 : @when value is also escaped before being written into the attribute.
 :
 : NOTE(review): this relies on the serializer writing empty elements in the
 : minimized "<name/>" form. Confirm for the serializer in use.
 :
 : @param $serialized the serialized XML containing milestone elements
 : @param $dates      the date elements whose @when values are to be used
 : @param $pos        the milestone number that belongs to the first date
 : @return the string with all milestone pairs replaced
 :)
declare function local:hack($serialized, $dates, $pos) {
    if (exists($dates)) then
        let $date := $dates => head()
        (: escape the characters that are not allowed raw in a double-quoted attribute :)
        let $when :=
            string($date/@when)
            => replace("&amp;", "&amp;amp;", "q")
            => replace("<", "&amp;lt;", "q")
            => replace('"', "&amp;quot;", "q")
        let $processed :=
            $serialized
            => replace("<date-start-" || $pos || "/>", '<date when="' || $when || '">', "q")
            => replace("<date-end-" || $pos || "/>", "</date>", "q")
        return
            local:hack($processed, $dates => tail(), $pos + 1)
    else
        $serialized
};
(: Main query: find dates in the flattened text of the sample, insert milestone
   elements at the matching positions of the original tree, then turn the
   milestones into date elements by editing the serialized XML and parsing the
   result. The sample element below is kept flush left on purpose: boundary
   whitespace is preserved, so indenting it would change the data. :)
let $date-patterns :=
    (
        map {
            "name": "month-day-year",
            "regex": local:compile-pattern(["months-en", "space", "day", "comma-space", "year"]),
            "enrich-match": function($match) {
                (: groups are 1 = month name, 2 = day, 3 = year; assemble year-month-day :)
                let $iso :=
                    string-join(
                        (
                            $match/fn:group[@nr='3'],
                            local:month-to-mm($match/fn:group[@nr='1']),
                            format-number($match/fn:group[@nr='2'], '00')
                        ),
                        "-"
                    )
                return
                    element date { attribute when { $iso }, string($match) }
            }
        }
    )
let $source :=
<div>
<p>
<em>Pearl Harbor</em> was attacked on <strong><em>December</em></strong> <strong>7</strong>, 1941. My birthday is <birthday>September 29, <year>1976</year></birthday>.
</p>
</div>
(: a copy of the sample reduced to one text node, so dates split across elements can match :)
let $plain := element { node-name($source) } { string($source) }
let $found-dates := local:analyze($plain, $date-patterns)//date
let $with-milestones := local:process-dates($source, $found-dates, 1)
return
    parse-xml(local:hack(serialize($with-milestones), $found-dates, 1))
<div>
<p>
<em>Pearl Harbor</em> was attacked on <date when="1941-12-07">
<strong>
<em>December</em>
</strong> <strong>7</strong>, 1941</date>. My birthday is <date when="1976-09-29">
<birthday>September 29, <year>1976</year>
</birthday>
</date>.
</p>
</div>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment