Skip to content

Instantly share code, notes, and snippets.

@joewiz
Last active August 29, 2015 14:06
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joewiz/9cf9a19aeb113c574572 to your computer and use it in GitHub Desktop.
Save joewiz/9cf9a19aeb113c574572 to your computer and use it in GitHub Desktop.
Generate a Culturomics Bookworm file from FRUS TEI XML data, using XQuery and eXist
xquery version "3.0";
(: Transform volumes from the FRUS series (TEI XML) into a zip file containing JSON and text files
formatted for Bookworm http://bookworm.culturomics.org/. The resulting data can be accessed at:
http://static.history.state.gov/temp/frus-all.zip (140 MB)
A link to the running demo of the resulting data awaits fixes to the bookworm server,
but @bmschmidt kindly posted one volume's worth of data at
http://benschmidt.org/joewiz/
:)
import module namespace frus="http://history.state.gov/xquery/frus" at "xmldb:exist:///db/history/modules/frus.xq";
import module namespace xqjson="http://xqilla.sourceforge.net/lib/xqjson";
declare namespace tei="http://www.tei-c.org/ns/1.0";
(:~
 : Build the zip entry for Bookworm's metadata/field_descriptions.json.
 :
 : Each field-description child of $field-descriptions is turned into the
 : intermediate XML understood by xqjson (see https://github.com/joewiz/xqjson),
 : which is then serialized to a JSON string. The intermediate XML looks like:
 :
 :   <json type="array">
 :     <item type="object">
 :       <pair name="field" type="string">date</pair>
 :       <pair name="datatype" type="string">time</pair>
 :       <pair name="type" type="string">numeric</pair>
 :       <pair name="unique" type="boolean">true</pair>
 :       <pair name="derived" type="array">
 :         <item type="object">
 :           <pair name="resolution" type="string">month</pair>
 :         </item>
 :       </pair>
 :     </item>
 :     ...
 :   </json>
 :
 : Typing rules: a child named "unique" becomes a boolean pair, a child that
 : has element children becomes an array holding one object, and anything else
 : becomes a string pair.
 :
 : @param $field-descriptions wrapper element containing field-description elements
 : @return an entry element named metadata/field_descriptions.json whose content is the JSON text
 :)
declare function local:field-descriptions-entry($field-descriptions as element(field-descriptions)) {
    let $json-xml :=
        <json type="array">
        {
            for $description in $field-descriptions/field-description
            return
                <item type="object">
                {
                    for $setting in $description/*
                    let $key := name($setting)
                    let $nested := $setting/*
                    let $is-nested := exists($nested)
                    let $json-type :=
                        if ($key = 'unique') then 'boolean'
                        else if ($is-nested) then 'array'
                        else 'string'
                    let $value :=
                        if ($is-nested) then
                            <item type="object">
                            {
                                (: only a single nested child is expected here :)
                                <pair name="{$nested ! name(.)}" type="string">{$nested ! string(.)}</pair>
                            }
                            </item>
                        else
                            string($setting)
                    return
                        <pair name="{$key}" type="{$json-type}">{$value}</pair>
                }
                </item>
        }
        </json>
    let $serialized := xqjson:serialize-json($json-xml)
    return
        <entry name="metadata/field_descriptions.json" type="text">{$serialized}</entry>
};
(:~
 : Compute the raw-text filename for each supplied FRUS document div.
 :
 : A filename is the name of the resource that contains the div with its
 : ".xml" suffix removed, followed by "_", the div's @xml:id, and ".txt".
 :
 : The parameter is declared as one-or-more divs, so the function maps over
 : the sequence and returns one filename per div, in input order. Passing the
 : whole sequence straight to util:document-name and concat would raise an
 : error as soon as more than one div was supplied.
 :
 : @param $doc one or more tei:div elements stored in the database
 : @return one filename string per input div
 :)
declare function local:doc-filename($doc as element(tei:div)+) {
    for $div in $doc
    let $vol-id := substring-before(util:document-name($div), '.xml')
    let $doc-id := $div/@xml:id
    return
        concat($vol-id, '_', $doc-id, '.txt')
};
(:~
 : Build the zip entry for Bookworm's metadata/jsoncatalog.txt: one JSON
 : object per document, one object per line.
 :
 : For each document an intermediate xqjson structure of this shape is built
 : and serialized (values are illustrative):
 :
 :   <json type="object">
 :     <pair name="filename" type="string">frus1977-80v02_d1</pair>
 :     <pair name="date" type="string">1977-01-05</pair>
 :     <pair name="searchstring" type="string">1. Memorandum From ...</pair>
 :     <pair name="series" type="array">
 :       <item type="string">1977-80</item>
 :     </pair>
 :     <pair name="persons" type="array">
 :       <item type="string">Ford, Gerald R.</item>
 :       <item type="string">Wilson, James M.</item>
 :     </pair>
 :   </json>
 :
 : @param $docs the tei:div document elements to catalog
 : @return an entry element named metadata/jsoncatalog.txt whose content is the newline-joined JSON objects
 :)
declare function local:json-catalog-entry($docs as element(tei:div)+) {
    let $hashmaps :=
        for $doc in $docs
        let $filename :=
            (: bookworm docs say filenames shouldn't have .txt in the json catalog :)
            substring-before(local:doc-filename($doc), '.txt')
        let $date :=
            (: strip out time; use only the first date in the document :)
            substring(($doc/tei:dateline/tei:date)[1]/@when/string(), 1, 10)
        let $searchstring :=
            (: heading text with footnotes stripped out :)
            normalize-space(string-join($doc/tei:head//text()[not(ancestor::tei:note)], ' '))
        let $series :=
            (: Grab the series from the volume id for now. The year-range suffix
               is optional so that single-year ids such as frus1945v01 yield
               "1945"; without that, replace() finds no match and returns the
               whole filename, giving every document its own bogus series.
               Names that do not start with "frus" plus four digits are still
               passed through unchanged. :)
            replace($filename, '^frus(\d{4}(-\d{2})?).*$', '$1')
        let $persons :=
            (: resolve the @corresp pointers on persName elements in this
               document against the volume's persons list :)
            let $names := $doc//tei:persName
            let $distinct-corresps := distinct-values($names/@corresp)
            let $distinct-corresps-sans-pound := $distinct-corresps ! substring-after(., '#')
            let $vol := root($doc)
            let $persons-list-entries := $vol/id('persons')//tei:persName
            let $people-to-show := $persons-list-entries[@xml:id = $distinct-corresps-sans-pound] ! normalize-space(.)
            return
                $people-to-show
        let $pre-json :=
            <json type="object">
                <pair name="filename" type="string">{$filename}</pair>
                <pair name="date" type="string">{$date}</pair>
                <pair name="searchstring" type="string">{$searchstring}</pair>
                <pair name="series" type="array">
                    <item type="string">{$series}</item>
                </pair>
                <pair name="persons" type="array">
                {
                    for $person in $persons
                    return
                        <item type="string">{$person}</item>
                }
                </pair>
            </json>
        return
            xqjson:serialize-json($pre-json)
    let $json-catalog := string-join($hashmaps, '&#10;')
    return
        <entry name="metadata/jsoncatalog.txt" type="text">{$json-catalog}</entry>
};
(:~
 : Build one zip entry per document containing its body text, stored under
 : texts/raw/.
 :
 : The filename comes from local:doc-filename, the same helper the JSON
 : catalog uses, so the catalog's "filename" values and the text files in the
 : zip cannot drift apart.
 :
 : @param $docs the tei:div document elements to export
 : @return one entry element (type="text") per document
 :)
declare function local:raw-text-entries($docs as element(tei:div)+) {
    for $doc in $docs
    let $filename := local:doc-filename($doc)
    let $raw-text :=
        (: strip document heading, dateline, and footnotes :)
        string-join($doc//text()[not(ancestor::tei:note | ancestor::tei:head | ancestor::tei:dateline)], ' ')
    let $clean-up :=
        (: collapse each newline plus the whitespace that follows it into a
           single newline; this removes leading spaces on lines and also
           squeezes out blank lines :)
        replace($raw-text, '\n\s+', '&#10;')
    return
        <entry name="texts/raw/{$filename}" type="text">{$clean-up}</entry>
};
(:~
 : Package the given documents as a Bookworm zip archive and stream it to the
 : HTTP response as a download.
 :
 : The archive is assembled from three groups of entries, in this order: the
 : field descriptions JSON, the JSON catalog, and the per-document raw text
 : files. The download is named after the project with a ".zip" suffix.
 :
 : @param $docs the tei:div document elements to include
 : @param $field-descriptions the field descriptions to serialize into the archive
 : @param $project-name base name for the zip file
 : @return the results of setting the response header and streaming the binary
 :)
declare function local:docs-to-bookworm($docs as element(tei:div)+, $field-descriptions as element(field-descriptions), $project-name as xs:string) {
    let $zip-name := $project-name || '.zip'
    let $entries :=
        (
            local:field-descriptions-entry($field-descriptions),
            local:json-catalog-entry($docs),
            local:raw-text-entries($docs)
        )
    (: second argument is passed as true(), as in the eXist compression module's
       use-collection-hierarchy flag; confirm against the module docs :)
    let $archive := compression:zip($entries, true())
    let $disposition := "attachment; filename=" || $zip-name
    return
        (
            response:set-header("Content-Disposition", $disposition),
            response:stream-binary($archive, 'application/zip', $zip-name)
        )
};
(:~
 : Collect the dated documents from the given FRUS volumes and hand them to
 : local:docs-to-bookworm for packaging.
 :
 : Only divs with an @xml:id, @type of "document", and a first dateline date
 : whose @when starts with a full yyyy-mm-dd date are included; this excludes
 : editorial notes and undated documents. The test looks at the first
 : tei:dateline/tei:date only, for two reasons: the JSON catalog takes its
 : date from that same first element, and matches() accepts at most one item
 : as input, so handing it every @when in a document with several dates is a
 : type error.
 :
 : @param $vol-ids volume ids, each naming a resource {id}.xml in the frus-volumes collection
 : @param $field-descriptions the field descriptions to serialize into the archive
 : @param $project-name base name for the zip file
 : @return whatever local:docs-to-bookworm returns
 :)
declare function local:vols-to-bookworm($vol-ids as xs:string+, $field-descriptions as element(field-descriptions), $project-name as xs:string) {
    let $docs :=
        for $vol-id in $vol-ids
        let $vol := doc(concat('/db/cms/apps/tei-content/data/frus-volumes/', $vol-id, '.xml'))
        return
            $vol//tei:div[
                @xml:id
                and @type = 'document'
                and matches((tei:dateline/tei:date)[1]/@when, '^\d{4}-\d{2}-\d{2}')
            ]
    return
        local:docs-to-bookworm($docs, $field-descriptions, $project-name)
};
(: Main query body: export every volume reported by the frus module as a single
   Bookworm project. :)
let $all-volume-ids := frus:fulltext-volumes-in-db()
(: base name of the generated zip file :)
let $bookworm-project := 'frus-all'
(: metadata fields declared to Bookworm; serialized into
   metadata/field_descriptions.json :)
let $bookworm-fields :=
    <field-descriptions>
        <field-description>
            <field>date</field>
            <datatype>time</datatype>
            <type>numeric</type>
            <unique>true</unique>
            <derived>
                <resolution>month</resolution>
            </derived>
        </field-description>
        <field-description>
            <field>searchstring</field>
            <datatype>searchstring</datatype>
            <type>text</type>
            <unique>true</unique>
        </field-description>
        <field-description>
            <field>series</field>
            <datatype>categorical</datatype>
            <type>text</type>
            <unique>false</unique>
        </field-description>
        <field-description>
            <field>persons</field>
            <datatype>categorical</datatype>
            <type>text</type>
            <unique>false</unique>
        </field-description>
    </field-descriptions>
return
    local:vols-to-bookworm($all-volume-ids, $bookworm-fields, $bookworm-project)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment