Created
May 19, 2017 13:33
-
-
Save kefo/62526e8aee6bf50a569580595ad7e3fa to your computer and use it in GitHub Desktop.
XQuery: Split large XML into parts; Extract LCC Letter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "3.0"; | |
(:~ | |
: Takes a file of LCC MARC records and extracts all the records per Class, | |
: as indicated by letter. | |
: | |
: Example invocation: | |
: zorba -i --serialize-text -q file:///Users/kefo/Work/mlenv/lcc/extract_schedule.xqy -e sourceuri:="/Users/kefo/Work/mlenv/lcc/source/d140612.records.xml" -e savedir:="/Users/kefo/Work/mlenv/lcc/schedules/" -e letter:="a" | |
: | |
: @author Kevin Ford (kefo@3windmills.com) | |
: @since May 6, 2017 | |
: @version 1.0 | |
:) | |
(: IMPORTED MODULES :) | |
import module namespace file = "http://expath.org/ns/file"; | |
import module namespace parsexml = "http://zorba.io/modules/xml"; | |
import schema namespace parseoptions = "http://zorba.io/modules/xml-options"; | |
(: NAMESPACES :) | |
declare namespace marcxml = "http://www.loc.gov/MARC21/slim"; | |
declare namespace output = "http://www.w3.org/2010/xslt-xquery-serialization"; | |
declare namespace err = "http://www.w3.org/2005/xqt-errors"; | |
(:~ | |
: This variable is to indicate which file of MARC records should be read. | |
:) | |
declare variable $sourceuri external; | |
(:~ | |
: This variable is to indicate where the resulting files will be saved. | |
:) | |
declare variable $savedir external; | |
(:~ | |
: This variable is to indicate which Class letter's records should be extracted. | |
:) | |
declare variable $letter external; | |
(:~
 : Main expression.
 :
 : Reads the file at $sourceuri, parses it as XML, and copies every
 : marcxml:record whose 153 code starts with the requested letter into a
 : new marcxml:collection element. The collection is serialized and
 : written to $savedir as "<letter>.xml".
 :
 : Raises an error ("Invalid letter.") unless $letter is exactly one
 : character long. Returns a two-item message naming the file written.
 :)
(: Normalize the requested letter to lower case; reject anything that is
   not a single character. :)
let $LETTER :=
    if (fn:string-length($letter) eq 1) then
        fn:lower-case($letter)
    else
        fn:error(xs:QName("ERROR"), "Invalid letter.")
let $xmlstr := file:read-text($sourceuri)
let $marcxml := parsexml:parse($xmlstr, <parseoptions:options/>)
let $marccollection :=
    element marcxml:collection {
        for $mrecord in $marcxml//marcxml:record
        (: When the first subfield of the 153 field is $z, the code is taken
           from the first $z subfield; otherwise from the first $a. :)
        let $code :=
            if ($mrecord/marcxml:datafield[@tag="153"]/marcxml:subfield[1][@code eq "z"]) then
                xs:string($mrecord/marcxml:datafield[@tag eq "153"]/marcxml:subfield[@code eq "z"][1])
            else
                xs:string($mrecord/marcxml:datafield[@tag eq "153"]/marcxml:subfield[@code eq "a"][1])
        (: A record without a usable 153 value yields "" here. :)
        let $code_letter := fn:lower-case(fn:substring($code, 1, 1))
        (: Plain equality only. The previous fn:contains($LETTER, $code_letter)
           test was true for the empty string, so records with no 153 code
           were copied into every letter's output. :)
        where $code_letter eq $LETTER
        return $mrecord
    }
(: Build the output path once and reuse it for the write and the message. :)
let $savepath := fn:concat($savedir, $LETTER, ".xml")
let $xml := serialize($marccollection,
    <output:serialization-parameters>
        <output:indent value="yes"/>
        <output:method value="xml"/>
        <output:omit-xml-declaration value="no"/>
    </output:serialization-parameters>
)
let $_ := file:write-text($savepath, $xml)
return ("File written to: ", $savepath)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "3.0"; | |
(:~ | |
: Takes a generic XML file and splits it into parts. | |
: | |
: Example invocation: | |
: zorba -i --serialize-text -q file:///Users/kefo/Work/mlenv/lcc/xml_split.xqy -e savedir:="/Users/kefo/Work/mlenv/lcc/schedules/" -e sourceuri:="/Users/kefo/Work/mlenv/lcc/schedules/a.xml" -e number_parts:="4" | |
: | |
: @author Kevin Ford (kefo@3windmills.com) | |
: @since May 8, 2017 | |
: @version 1.0 | |
:) | |
(: IMPORTED MODULES :) | |
import module namespace file = "http://expath.org/ns/file"; | |
import module namespace parsexml = "http://zorba.io/modules/xml"; | |
import schema namespace parseoptions = "http://zorba.io/modules/xml-options"; | |
(: NAMESPACES :) | |
declare namespace marcxml = "http://www.loc.gov/MARC21/slim"; | |
declare namespace output = "http://www.w3.org/2010/xslt-xquery-serialization"; | |
declare namespace err = "http://www.w3.org/2005/xqt-errors"; | |
(:~ | |
: This variable is to indicate which file should be split. | |
:) | |
declare variable $sourceuri external; | |
(:~ | |
: This variable is to indicate where the resulting files will be saved. | |
:) | |
declare variable $savedir external; | |
(:~ | |
: This variable is to indicate how many parts the file should be split into. | |
:) | |
declare variable $number_parts external; | |
(:~
 : Main expression.
 :
 : Reads the file at $sourceuri, parses it as XML, and distributes the
 : child nodes of the root element over $number_parts chunks of equal
 : size (the last chunks may be shorter or empty). Each chunk is wrapped
 : in a copy of the root element (same name, same attributes), serialized,
 : and written to $savedir as "<basename>-<n>.xml".
 :
 : Raises an error ("Invalid number of parts.") when $number_parts is
 : less than 1. Returns, per chunk, a two-item message naming the file
 : written.
 :)
let $xmlstr := file:read-text($sourceuri)
let $xml := parsexml:parse($xmlstr, <parseoptions:options/>)
(: Select the first element child. The earlier node()[fn:name()] test
   would also match a processing instruction placed before the root. :)
let $root := $xml/*[1]
(: Reject a part count that would divide by zero or produce no output. :)
let $parts := xs:integer($number_parts)
let $parts :=
    if ($parts ge 1) then
        $parts
    else
        fn:error(xs:QName("ERROR"), "Invalid number of parts.")
let $children := $root/child::node()
(: Chunk size, rounded up so that $parts chunks cover every child node. :)
let $size := fn:ceiling(fn:count($children) div $parts)
(: Strip only a trailing ".xml". The dot is escaped and the pattern is
   anchored so that "xml" elsewhere in the file name is left alone. :)
let $basefilename :=
    fn:replace(fn:tokenize($sourceuri, "/")[fn:last()], "\.xml$", "")
return
    for $part in (1 to $parts)
    (: 1-based position of the first child node belonging to this part. :)
    let $start := $size * ($part - 1) + 1
    (: fn:node-name keeps the root's namespace; building the element from
       the lexical fn:name string would drop a default namespace or fail
       on a prefix that this query does not declare. :)
    let $chunk :=
        element {fn:node-name($root)} {
            $root/@*,
            fn:subsequence($children, $start, $size)
        }
    let $savepath :=
        fn:concat($savedir, $basefilename, "-", xs:string($part), ".xml")
    let $content := serialize($chunk,
        <output:serialization-parameters>
            <output:indent value="yes"/>
            <output:method value="xml"/>
            <output:omit-xml-declaration value="no"/>
        </output:serialization-parameters>
    )
    let $_ := file:write-text($savepath, $content)
    return ("Chunk written to: ", $savepath)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment