Last active
August 29, 2015 14:06
-
-
Save joewiz/9cf9a19aeb113c574572 to your computer and use it in GitHub Desktop.
Generate a Culturomics Bookworm file from FRUS TEI XML data, using XQuery and eXist
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "3.0"; | |
(: Transform volumes from the FRUS series (TEI XML) into a zip file containing JSON and text files | |
formatted for Bookworm http://bookworm.culturomics.org/. The resulting data can be accessed at: | |
http://static.history.state.gov/temp/frus-all.zip (140 MB) | |
A link to the running demo of the resulting data awaits fixes to the bookworm server, | |
but @bmschmidt kindly posted one volume's worth of data at | |
http://benschmidt.org/joewiz/ | |
:) | |
import module namespace frus="http://history.state.gov/xquery/frus" at "xmldb:exist:///db/history/modules/frus.xq"; | |
import module namespace xqjson="http://xqilla.sourceforge.net/lib/xqjson"; | |
declare namespace tei="http://www.tei-c.org/ns/1.0"; | |
(:~
 : Build the zip entry for metadata/field_descriptions.json.
 :
 : Each field-description child of $field-descriptions becomes one JSON object
 : in a top-level array. The function first assembles the intermediate XML
 : vocabulary that xqjson serializes (see https://github.com/joewiz/xqjson),
 : for example:
 :
 :   <json type="array">
 :     <item type="object">
 :       <pair name="field" type="string">date</pair>
 :       <pair name="datatype" type="string">time</pair>
 :       <pair name="type" type="string">numeric</pair>
 :       <pair name="unique" type="boolean">true</pair>
 :       <pair name="derived" type="array">
 :         <item type="object">
 :           <pair name="resolution" type="string">month</pair>
 :         </item>
 :       </pair>
 :     </item>
 :     <item type="object">
 :       <pair name="field" type="string">series</pair>
 :       <pair name="datatype" type="string">categorical</pair>
 :       <pair name="type" type="string">text</pair>
 :       <pair name="unique" type="boolean">false</pair>
 :     </item>
 :   </json>
 :
 : Typing rules for each child element of a field-description:
 :   - an element named "unique" is emitted as a boolean;
 :   - an element that has child elements is emitted as an array holding a
 :     single object;
 :   - anything else is emitted as a string.
 :
 : @param $field-descriptions wrapper element holding field-description children
 : @return an entry element named metadata/field_descriptions.json whose
 :         content is the serialized JSON text
 :)
declare function local:field-descriptions-entry($field-descriptions as element(field-descriptions)) {
    let $items :=
        for $description in $field-descriptions/field-description
        let $pairs :=
            for $child in $description/*
            let $key := name($child)
            let $nested := $child/*
            let $kind :=
                if ($key = 'unique') then 'boolean'
                else if ($nested) then 'array'
                else 'string'
            let $content :=
                if ($nested) then
                    (: assumes a single nested element; several nested elements
                       would be collapsed into one space-joined pair :)
                    <item type="object">
                        <pair name="{$nested/name()}" type="string">{$nested/string()}</pair>
                    </item>
                else
                    string($child)
            return
                <pair name="{$key}" type="{$kind}">{$content}</pair>
        return
            <item type="object">{$pairs}</item>
    let $xqjson-tree := <json type="array">{$items}</json>
    let $serialized := xqjson:serialize-json($xqjson-tree)
    return
        <entry name="metadata/field_descriptions.json" type="text">{$serialized}</entry>
};
(:~
 : Derive the text-file name for a document: the name of the stored resource
 : containing it (with the '.xml' suffix dropped), an underscore, the
 : document's xml:id, and a '.txt' suffix.
 :
 : @param $doc the TEI document div (callers in this module pass one div at a time)
 : @return the file name as a string
 :)
declare function local:doc-filename($doc as element(tei:div)+) {
    let $volume := substring-before(util:document-name($doc), '.xml')
    return
        $volume || '_' || $doc/@xml:id || '.txt'
};
(:~
 : Build the zip entry for metadata/jsoncatalog.txt: one serialized JSON
 : object per document, written one object per line.
 :
 : Each object is assembled in the XML vocabulary that xqjson serializes
 : (see https://github.com/joewiz/xqjson) and looks like this:
 :
 :   <json type="object">
 :     <pair name="filename" type="string">...</pair>
 :     <pair name="date" type="string">1977-01-05</pair>
 :     <pair name="searchstring" type="string">1. Memorandum From ...</pair>
 :     <pair name="series" type="array">
 :       <item type="string">...</item>
 :     </pair>
 :     <pair name="persons" type="array">
 :       <item type="string">Ford, Gerald R.</item>
 :       <item type="string">Habib, Philip C.</item>
 :     </pair>
 :   </json>
 :
 : @param $docs the TEI document divs to catalog
 : @return an entry element named metadata/jsoncatalog.txt whose content is
 :         the newline-delimited JSON catalog
 :)
declare function local:json-catalog-entry($docs as element(tei:div)+) {
    let $hashmaps :=
        for $doc in $docs
        let $filename :=
            (: bookworm docs say filenames shouldn't have .txt in the catalog :)
            substring-before(local:doc-filename($doc), '.txt')
        let $date :=
            (: strip out time; use only the first date in the document :)
            substring(($doc/tei:dateline/tei:date)[1]/@when/string(), 1, 10)
        let $searchstring :=
            (: heading text with footnote content stripped out :)
            normalize-space(string-join($doc/tei:head//text()[not(ancestor::tei:note)], ' '))
        let $series :=
            (: grab the series from the volume id for now, e.g. a filename
               starting 'frus1977-80' yields '1977-80'; when the pattern does
               not match, replace() returns the filename unchanged :)
            replace($filename, '^frus(\d{4}-\d{2}).*$', '$1')
        let $persons :=
            (: resolve the persName/@corresp pointers used in this document
               against the volume's persons list and keep the listed names :)
            let $names := $doc//tei:persName
            let $distinct-corresps := distinct-values($names/@corresp)
            let $distinct-corresps-sans-pound := $distinct-corresps ! substring-after(., '#')
            let $vol := root($doc)
            let $persons-list-entries := $vol/id('persons')//tei:persName
            return
                $persons-list-entries[@xml:id = $distinct-corresps-sans-pound] ! normalize-space(.)
        let $pre-json :=
            <json type="object">
                <pair name="filename" type="string">{$filename}</pair>
                <pair name="date" type="string">{$date}</pair>
                <pair name="searchstring" type="string">{$searchstring}</pair>
                <pair name="series" type="array">
                    <item type="string">{$series}</item>
                </pair>
                <pair name="persons" type="array">
                {
                    for $person in $persons
                    return
                        <item type="string">{$person}</item>
                }
                </pair>
            </json>
        return
            xqjson:serialize-json($pre-json)
    (: one JSON object per line: join with a newline rather than a space so
       that each document's record sits on its own line :)
    let $json-catalog := string-join($hashmaps, '&#10;')
    return
        <entry name="metadata/jsoncatalog.txt" type="text">{$json-catalog}</entry>
};
(:~
 : Build one zip entry per document under texts/raw/, holding the document's
 : body text.
 :
 : The entry name comes from local:doc-filename(), the same helper the JSON
 : catalog uses, so the catalog's filename values and the text file names
 : cannot drift apart.
 :
 : @param $docs the TEI document divs to export
 : @return one entry element per document, named texts/raw/ plus the file name
 :)
declare function local:raw-text-entries($docs as element(tei:div)+) {
    for $doc in $docs
    let $filename := local:doc-filename($doc)
    let $raw-text :=
        (: strip document heading, dateline, and footnotes :)
        string-join($doc//text()[not(ancestor::tei:note | ancestor::tei:head | ancestor::tei:dateline)], ' ')
    let $clean-up :=
        (: replace each newline plus the whitespace that follows it with a
           single space, dropping the indentation at the start of each line :)
        string-join(tokenize($raw-text, '\n\s+'), ' ')
    return
        <entry name="texts/raw/{$filename}" type="text">{$clean-up}</entry>
};
(:~
 : Package the given documents as a Bookworm zip archive and stream it to the
 : HTTP client as a download.
 :
 : The archive contains the field descriptions entry, the JSON catalog entry,
 : and one raw text entry per document, in that order.
 :
 : @param $docs the TEI document divs to export
 : @param $field-descriptions the field descriptions to serialize
 : @param $project-name base name of the download; '.zip' is appended
 : @return the results of setting the Content-Disposition header and of
 :         streaming the binary zip
 :)
declare function local:docs-to-bookworm($docs as element(tei:div)+, $field-descriptions as element(field-descriptions), $project-name as xs:string) {
    let $zip-name := $project-name || '.zip'
    let $entries := (
        local:field-descriptions-entry($field-descriptions),
        local:json-catalog-entry($docs),
        local:raw-text-entries($docs)
    )
    (: NOTE(review): the second argument is presumably eXist's use-hierarchy
       flag, so entry names such as texts/raw/... become folders - confirm :)
    let $archive := compression:zip($entries, true())
    return (
        response:set-header("Content-Disposition", "attachment; filename=" || $zip-name),
        response:stream-binary($archive, 'application/zip', $zip-name)
    )
};
(:~
 : Collect the dated documents from the given volumes and hand them to
 : local:docs-to-bookworm() for packaging.
 :
 : A div qualifies when it has an xml:id, has type 'document', and the first
 : tei:date in its dateline carries a @when value that starts with a full
 : YYYY-MM-DD date. This excludes editorial notes and undated documents.
 :
 : Only the first date is tested, for two reasons: it is the date that
 : local:json-catalog-entry() writes into the catalog, and passing every
 : @when of a multi-date dateline to matches() at once raises a type error,
 : because matches() accepts at most one input item.
 :
 : @param $vol-ids volume ids; each is resolved to a stored document named
 :        after the id with an '.xml' suffix in the frus-volumes collection
 : @param $field-descriptions the field descriptions to serialize
 : @param $project-name base name of the resulting zip download
 : @return whatever local:docs-to-bookworm() returns
 :)
declare function local:vols-to-bookworm($vol-ids as xs:string+, $field-descriptions as element(field-descriptions), $project-name as xs:string) {
    let $docs :=
        for $vol-id in $vol-ids
        let $vol := doc(concat('/db/cms/apps/tei-content/data/frus-volumes/', $vol-id, '.xml'))
        return
            (: only include dated documents; an absent @when is treated by
               matches() as the empty string and therefore fails the test :)
            $vol//tei:div[@xml:id and @type = 'document']
                [matches((tei:dateline/tei:date)[1]/@when, '^\d{4}-\d{2}-\d{2}')]
    return
        local:docs-to-bookworm($docs, $field-descriptions, $project-name)
};
(: Main query: export the volumes returned by frus:fulltext-volumes-in-db()
   as a Bookworm archive named frus-all.zip. The field descriptions below
   declare the four metadata fields written for every document in the JSON
   catalog: date, searchstring, series and persons. :)
let $field-descriptions :=
    <field-descriptions>
        <field-description>
            <field>date</field><datatype>time</datatype><type>numeric</type><unique>true</unique>
            <derived><resolution>month</resolution></derived>
        </field-description>
        <field-description>
            <field>searchstring</field><datatype>searchstring</datatype><type>text</type><unique>true</unique>
        </field-description>
        <field-description>
            <field>series</field><datatype>categorical</datatype><type>text</type><unique>false</unique>
        </field-description>
        <field-description>
            <field>persons</field><datatype>categorical</datatype><type>text</type><unique>false</unique>
        </field-description>
    </field-descriptions>
let $project-name := 'frus-all'
return
    local:vols-to-bookworm(frus:fulltext-volumes-in-db(), $field-descriptions, $project-name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment