Skip to content

Instantly share code, notes, and snippets.

@WaxCylinderRevival
Last active October 30, 2017 04:53
Show Gist options
  • Save WaxCylinderRevival/677233e2baae9a5e414d57b8cdc2ed08 to your computer and use it in GitHub Desktop.
Save WaxCylinderRevival/677233e2baae9a5e414d57b8cdc2ed08 to your computer and use it in GitHub Desktop.
analyze-text-for-date-patterns.xq
(: declare namespace dp='https://history.state.gov/ns/xquery/date-processing' :)
(:~
 : Reusable regular-expression fragments for recognising dates in running text.
 :
 : Each entry is a fragment, not a complete pattern: callers concatenate the
 : fragments with their own separators and are expected to match with the
 : case-insensitive flag, since the word lists are written in lower case
 : apart from the English month names.
 :
 : Keys:
 :   month-regex            English month names
 :   month-regex-fr         French month names, with and without accents
 :   month-regex-sp         Spanish month names, including the variant setiembre
 :   day-regex              one or two digits with an optional English ordinal suffix
 :   day-range-regex        two day-regex values joined by a hyphen or dash
 :   year-regex             four digits
 :   day-spelled-out-regex  a numeric day or an English ordinal word, first to thirty-first
 :   year-spelled-out-regex a year written out in words, eighteen or nineteen hundred and up
 :)
declare variable $local:regexes :=
map {
"month-regex" : "(?:January|February|March|April|May|June|July|August|September|October|November|December)",
(: March in French is mars :)
"month-regex-fr" : "(?:janvier|février|fevrier|mars|avril|mai|juin|juillet|août|aout|septembre|octobre|novembre|décembre|decembre)",
"month-regex-sp" : "(?:enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|setiembre|octubre|noviembre|diciembre)",
"day-regex" : "(?:\d{1,2})(?:st|d|nd|rd|th)?",
"day-range-regex" : "(?:\d{1,2})(?:st|d|nd|rd|th)?\s*[-–—]\s*(?:\d{1,2})(?:st|d|nd|rd|th)?",
"year-regex" : "(?:\d{4})",
(: An ordinal word is mandatory and the tens separator is only consumed when a
   tens word precedes it, so the fragment can neither match the empty string
   nor swallow whitespace in front of the day. :)
"day-spelled-out-regex" : "(?:\d{1,2}(?:st|d|nd|rd|th)?|thirtieth|twentieth|(?:(?:thirty|twenty)(?:\s*[-–]\s*|\s+))?(?:nineteenth|eighteenth|seventeenth|sixteenth|fifteenth|fourteenth|thirteenth|twelfth|eleventh|tenth|ninth|eighth|seventh|sixth|fifth|fourth|third|second|first))",
(: Everything after the word hundred is optional as a unit. Tens and units may
   be joined by a hyphen, a dash or plain whitespace, and the separator is only
   consumed when a units word follows, so the match never ends in whitespace.
   An optional "and" after hundred is accepted. :)
"year-spelled-out-regex" : "one\s+thousand\s+(?:nine|eight)\s+hundred(?:\s+(?:and\s+)?(?:(?:ninety|eighty|seventy|sixty|fifty|forty|thirty|twenty)(?:(?:\s*[-–]\s*|\s+)(?:nine|eight|seven|six|five|four|three|two|one))?|nineteen|eighteen|seventeen|sixteen|fifteen|fourteen|thirteen|twelve|eleven|ten|nine|eight|seven|six|five|four|three|two|one))?"
}
;
(:~
 : Scans a text for dates using every supported date layout.
 :
 : Six patterns are assembled from the fragments in $local:regexes and each one
 : is applied independently, case-insensitively, to the whole input:
 :   1. English month, day, year
 :   2. English month, day range, year
 :   3. English day, month, year
 :   4. English formal wording: day "day of" month, "in the year of our/the lord", spelled-out year
 :   5. French day, month, year
 :   6. Spanish day, optional "de", month, optional "de" or "del", year
 :
 : @param $source the text to analyse
 : @return six results of fn:analyze-string, one per pattern, in the order listed above
 :)
declare function local:get-dates($source) {
    let $rx := $local:regexes
    let $day := $rx?day-regex
    let $year := $rx?year-regex
    let $month-en := $rx?month-regex
    let $patterns := (
        (: English month, day, year :)
        concat($month-en, '\s+', $day, ',?\s+', $year),
        (: English month, day range, year :)
        concat($month-en, '\s+', $rx?day-range-regex, ',?\s+', $year),
        (: English day, month, year :)
        concat($day, '\s+', $month-en, ',?\s+', $year),
        (: English formal wording with spelled-out year :)
        concat(
            $rx?day-spelled-out-regex,
            '\s+day\s+of\s+',
            $month-en,
            ',\s+in\s+the\s+year\s+of\s+(?:our|the)\s+lord\s+',
            $rx?year-spelled-out-regex
        ),
        (: French day, month, year :)
        concat($day, '\s+', $rx?month-regex-fr, ',?\s+', $year),
        (: Spanish day, month, year with optional connecting words :)
        concat($day, '\s+(?:de\s+)?', $rx?month-regex-sp, ',?\s+(?:(?:de|del)\s+)?', $year)
    )
    return
        (: the simple map operator keeps the pattern order in the result :)
        $patterns ! analyze-string($source, ., "i")
};
@WaxCylinderRevival
Copy link
Author

WaxCylinderRevival commented Oct 26, 2017

Results:

<results>
  <fn:match xmlns:fn="http://www.w3.org/2005/xpath-functions">February 2d, 1865</fn:match>
  <fn:match xmlns:fn="http://www.w3.org/2005/xpath-functions">march 1 2010</fn:match>
  <fn:match xmlns:fn="http://www.w3.org/2005/xpath-functions">September 1-3rd, 1929</fn:match>
  <fn:match xmlns:fn="http://www.w3.org/2005/xpath-functions">31st July 2015</fn:match>
  <fn:match xmlns:fn="http://www.w3.org/2005/xpath-functions">2d day of May, in the year of the lord one thousand nine hundred nineteen</fn:match>
  <fn:match xmlns:fn="http://www.w3.org/2005/xpath-functions"> fifth day of June, in the year of our Lord one thousand eight hundred sixty</fn:match>
  <fn:match xmlns:fn="http://www.w3.org/2005/xpath-functions">6 août
                            1902</fn:match>
  <fn:match xmlns:fn="http://www.w3.org/2005/xpath-functions">6 de Agosto de 1902</fn:match>
  <fn:match xmlns:fn="http://www.w3.org/2005/xpath-functions">8 de septiembre del 2010</fn:match>
</results>

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment