Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save WaxCylinderRevival/36e7ffa53317a0f5255596aaee18a8e5 to your computer and use it in GitHub Desktop.
Save WaxCylinderRevival/36e7ffa53317a0f5255596aaee18a8e5 to your computer and use it in GitHub Desktop.
Find FRUS documents whose English dates do not match the date metadata
xquery version "3.1";

(:
 : Find FRUS documents whose English dates do not match the date metadata.
 :
 : Find cases such as this:
 :
 :   <dateline>
 :     <placeName>Washington</placeName>,
 :     <date when="1971-10-05">October 15, 1971</date>.
 :   </dateline>
 :
 : ... where the supplied English date, October 15, 1971, does not match the
 : supplied machine-readable date, 1971-10-05.
 :
 : For every tei:dateline that contains a tei:date with a @when attribute, the
 : first such date is parsed from its English text and compared with the date
 : portion of @when. One <doc> element is returned per dateline that does not
 : match (including datelines whose English text could not be parsed at all).
 :)

declare namespace tei="http://www.tei-c.org/ns/1.0";

import module namespace dates="http://xqdev.com/dateparser" at "/db/apps/twitter/modules/date-parser.xqm";

let $vols :=
    doc('/db/apps/frus/volumes/frus1969-76v17.xml')
    (: collection('/db/apps/frus/volumes') :)
let $datelines := $vols//tei:dateline[.//tei:date/@when]
for $dateline in $datelines
(: nearest enclosing div that carries an xml:id :)
let $div-id := $dateline/ancestor::tei:div[@xml:id][1]/@xml:id
let $vol-id := util:document-name($dateline) ! substring-before(., '.xml')
(: Parenthesize before applying [1]: "//tei:date[@when][1]" would select the
   first date *per parent element*, which can yield several nodes and make the
   substring() call below fail with a type error. :)
let $supplied-english-date := ($dateline//tei:date[@when])[1]
let $supplied-when-attribute := $supplied-english-date/@when
(: e.g. "October 15, 1971" :)
let $english-date-regex := '[A-Z][a-z]+\s+\d{1,2},\s+\d{4}'
(: Only the first regex match is used, so a date element containing two
   English dates still hands a single string to the parser. :)
let $parsed-english-date := (analyze-string($supplied-english-date, $english-date-regex)//fn:match)[1]
let $parsed-iso-date :=
    try {
        (: To improve likelihood of success, we'll try parsing the pre-processed date :)
        if (exists($parsed-english-date)) then
            dates:parseDate($parsed-english-date)/string()
        (: But if the date doesn't match our regex, we might as well let the dates module try on the raw date :)
        else
            dates:parseDate($supplied-english-date)
    } catch * {
        "parseDate had problems with " || $supplied-english-date
    }
(: date portion (yyyy-mm-dd, or shorter for reduced-precision values) of @when :)
let $when-date := substring($supplied-when-attribute, 1, 10)
let $matches :=
    typeswitch ($parsed-iso-date)
        case $parsed as element() return
            (: The raw-text fallback returns the parser's element rather than a
               string. For a month-resolution result (e.g. "November 1971") the
               value child holds the first day of the month, so compare only the
               yyyy-mm part; otherwise when="1971-11" would always be reported. :)
            if ($parsed/@resolution = 'month') then
                $when-date = substring(($parsed/value)[1], 1, 7)
            else
                $when-date = $parsed
        default return
            $when-date = $parsed-iso-date
(: limit results to cases where the date portion of the supplied when attribute doesn't match the iso-date :)
where not($matches)
return
    element doc {
        element vol-id { $vol-id },
        element div-id { $div-id/string() },
        element supplied-english-date { $supplied-english-date/string() },
        element supplied-when-attribute { $supplied-when-attribute/string() },
        element parsed-english-date { $parsed-english-date/string() },
        element parsed-iso-date { $parsed-iso-date },
        element matches { $matches }
    }
<!-- Sample results for volume frus1969-76v17: one <doc> per dateline whose
     English date text and @when attribute were not judged to match. -->
<!-- d142: the English text reads "July11" (no space after the month);
     parsed-english-date is empty and the parser reported a problem. -->
<doc>
<vol-id>frus1969-76v17</vol-id>
<div-id>d142</div-id>
<supplied-english-date>July11, 1971, midnight–1:40 a.m. and 9:50–10:35 a.m.</supplied-english-date>
<supplied-when-attribute>1971-07-11T10:35:00</supplied-when-attribute>
<parsed-english-date/>
<parsed-iso-date>parseDate had problems with July11, 1971, midnight–1:40 a.m. and 9:50–10:35 a.m.</parsed-iso-date>
<matches>false</matches>
</doc>
<!-- d159: the English text says October 15 but @when says 1971-10-05. -->
<doc>
<vol-id>frus1969-76v17</vol-id>
<div-id>d159</div-id>
<supplied-english-date>October 15, 1971</supplied-english-date>
<supplied-when-attribute>1971-10-05</supplied-when-attribute>
<parsed-english-date>October 15, 1971</parsed-english-date>
<parsed-iso-date>1971-10-15</parsed-iso-date>
<matches>false</matches>
</doc>
<!-- d164: @when has month precision (1971-11); the parser returned a
     month-resolution <date> element with a range instead of a single day. -->
<doc>
<vol-id>frus1969-76v17</vol-id>
<div-id>d164</div-id>
<supplied-english-date>November 1971</supplied-english-date>
<supplied-when-attribute>1971-11</supplied-when-attribute>
<parsed-english-date/>
<parsed-iso-date>
<date resolution="month">
<range>
<start>1971-11-01</start>
<end>1971-11-30</end>
</range>
<value>1971-11-01</value>
</date>
</parsed-iso-date>
<matches>false</matches>
</doc>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment