Skip to content

Instantly share code, notes, and snippets.

@kefo
Created May 19, 2017 13:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kefo/62526e8aee6bf50a569580595ad7e3fa to your computer and use it in GitHub Desktop.
Save kefo/62526e8aee6bf50a569580595ad7e3fa to your computer and use it in GitHub Desktop.
XQuery: Split large XML into parts; Extract LCC Letter
xquery version "3.0";
(:~
: Takes a file of LCC MARC records and extracts all the records per Class,
: as indicated by letter.
:
: Example invocation:
: zorba -i --serialize-text -q file:///Users/kefo/Work/mlenv/lcc/extract_schedule.xqy -e sourceuri:="/Users/kefo/Work/mlenv/lcc/source/d140612.records.xml" -e savedir:="/Users/kefo/Work/mlenv/lcc/schedules/" -e letter:="a"
:
: @author Kevin Ford (kefo@3windmills.com)
: @since May 6, 2017
: @version 1.0
:)
(: IMPORTED MODULES :)
import module namespace file = "http://expath.org/ns/file";
import module namespace parsexml = "http://zorba.io/modules/xml";
import schema namespace parseoptions = "http://zorba.io/modules/xml-options";
(: NAMESPACES :)
declare namespace marcxml = "http://www.loc.gov/MARC21/slim";
declare namespace output = "http://www.w3.org/2010/xslt-xquery-serialization";
declare namespace err = "http://www.w3.org/2005/xqt-errors";
(:~
: This variable is to indicate which file should be split.
:)
declare variable $sourceuri external;
(:~
: This variable is to indicate where the resulting files will be saved.
:)
declare variable $savedir external;
(:~
: This variable is the single LCC class letter (case-insensitive) whose records should be extracted.
:)
declare variable $letter external;
(:
 : Main expression.
 :
 : Reads the MARCXML file named by $sourceuri, keeps every marcxml:record
 : whose field 153 class code starts with the requested letter, wraps the
 : matches in a new marcxml:collection element and writes the serialized
 : result to $savedir as "<letter>.xml".
 :
 : $sourceuri - path of the MARCXML file to read.
 : $savedir   - output location; it is concatenated directly with the file
 :              name, so it must already end with a path separator.
 : $letter    - exactly one character; compared case-insensitively.
 :
 : Raises ERROR ("Invalid letter.") when $letter is not one character long.
 : Returns a two-item sequence: a message and the path that was written.
 :)
let $LETTER :=
  if (fn:string-length($letter) eq 1) then
    fn:lower-case($letter)
  else
    fn:error(xs:QName("ERROR"), "Invalid letter.")
let $xmlstr := file:read-text($sourceuri)
let $marcxml := parsexml:parse($xmlstr, <parseoptions:options/>)
let $marccollection :=
  element marcxml:collection {
    for $mrecord in $marcxml//marcxml:record
    let $field153 := $mrecord/marcxml:datafield[@tag eq "153"]
    (: Use subfield z when it is the leading subfield of 153, otherwise
       subfield a. The parenthesized [1] picks a single node even if more
       than one 153 field is present, avoiding a cardinality error. :)
    let $code :=
      if ($field153/marcxml:subfield[1][@code eq "z"]) then
        fn:string(($field153/marcxml:subfield[@code eq "z"])[1])
      else
        fn:string(($field153/marcxml:subfield[@code eq "a"])[1])
    let $code_letter := fn:lower-case(fn:substring($code, 1, 1))
    (: Plain equality only. The previous fn:contains($LETTER, $code_letter)
       test was true for an empty $code_letter, so records lacking a 153
       code were copied into every letter file. :)
    where $code_letter eq $LETTER
    return $mrecord
  }
let $target := fn:concat($savedir, fn:concat($LETTER, ".xml"))
let $xml := serialize($marccollection,
  <output:serialization-parameters>
    <output:indent value="yes"/>
    <output:method value="xml"/>
    <output:omit-xml-declaration value="no"/>
  </output:serialization-parameters>
)
let $_ := file:write-text($target, $xml)
return ("File written to: ", $target)
xquery version "3.0";
(:~
: Takes a generic XML file and splits it into parts.
:
: Example invocation:
: zorba -i --serialize-text -q file:///Users/kefo/Work/mlenv/lcc/xml_split.xqy -e savedir:="/Users/kefo/Work/mlenv/lcc/schedules/" -e sourceuri:="/Users/kefo/Work/mlenv/lcc/schedules/a.xml" -e number_parts:="4"
:
: @author Kevin Ford (kefo@3windmills.com)
: @since May 8, 2017
: @version 1.0
:)
(: IMPORTED MODULES :)
import module namespace file = "http://expath.org/ns/file";
import module namespace parsexml = "http://zorba.io/modules/xml";
import schema namespace parseoptions = "http://zorba.io/modules/xml-options";
(: NAMESPACES :)
declare namespace marcxml = "http://www.loc.gov/MARC21/slim";
declare namespace output = "http://www.w3.org/2010/xslt-xquery-serialization";
declare namespace err = "http://www.w3.org/2005/xqt-errors";
(:~
: This variable is to indicate which file should be split.
:)
declare variable $sourceuri external;
(:~
: This variable is to indicate where the resulting files will be saved.
:)
declare variable $savedir external;
(:~
: This variable is to indicate how many parts the file should be split into.
:)
declare variable $number_parts external;
(:
 : Main expression.
 :
 : Reads the XML file named by $sourceuri and distributes the child nodes of
 : its root element, in document order, over $number_parts new documents.
 : Each new document repeats the root element (same qualified name and
 : attributes) and is written to $savedir as "<basename>-<n>.xml".
 :
 : $sourceuri    - path of the XML file to split; the base name is the text
 :                 after the last "/" with a trailing ".xml" removed.
 : $savedir      - output location; it is concatenated directly with the
 :                 file name, so it must already end with a path separator.
 : $number_parts - number of parts; must be castable to an integer >= 1.
 :
 : Raises ERROR ("Invalid number of parts.") for an unusable $number_parts.
 : Returns, per chunk, a message followed by the path that was written.
 : When the root has fewer child nodes than parts, the trailing chunks
 : contain only the empty root element.
 :)
let $parts :=
  if ($number_parts castable as xs:integer) then
    xs:integer($number_parts)
  else
    fn:error(xs:QName("ERROR"), "Invalid number of parts.")
(: Checked separately from the cast because zero would otherwise surface as
   a division-by-zero error below. :)
let $parts :=
  if ($parts ge 1) then
    $parts
  else
    fn:error(xs:QName("ERROR"), "Invalid number of parts.")
let $xmlstr := file:read-text($sourceuri)
let $xml := parsexml:parse($xmlstr, <parseoptions:options/>)
(: Select the first element child; a name test on node() would also match a
   processing instruction that precedes the root element. :)
let $root := $xml/*[1]
let $children := $root/child::node()
let $size := fn:ceiling(fn:count($children) div $parts)
let $xml_chunks :=
  for $i in (1 to $parts)
  return
    (: fn:node-name keeps the root's namespace URI; rebuilding the name from
       the fn:name string drops a default namespace and fails for prefixes
       that are not declared in this query. :)
    element {fn:node-name($root)} {
      $root/@*,
      fn:subsequence($children, ($i - 1) * $size + 1, $size)
    }
let $basefilename := fn:tokenize($sourceuri, "/")[fn:last()]
(: The pattern is a regular expression: escape the dot and anchor at the end
   so only a trailing ".xml" extension is removed. :)
let $basefilename := fn:replace($basefilename, '\.xml$', '')
return
  for $chunk at $pos in $xml_chunks
  let $target := fn:concat($savedir, $basefilename, "-", xs:string($pos), ".xml")
  let $content := serialize($chunk,
    <output:serialization-parameters>
      <output:indent value="yes"/>
      <output:method value="xml"/>
      <output:omit-xml-declaration value="no"/>
    </output:serialization-parameters>
  )
  let $_ := file:write-text($target, $content)
  return ("Chunk written to: ", $target)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment