Last active
November 4, 2016 21:06
-
-
Save CliffordAnderson/159899389fb4978f8461df525f652145 to your computer and use it in GitHub Desktop.
Sample XQuery code for extracting, transforming, and loading Open Library and Internet Archive (IA) metadata into BaseX
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "3.1"; | |
(:~
 : Wraps the child elements of one parsed search result in a <book> element
 : and writes it to a database with db:replace.
 :
 : The target path is the text content of the <key> element(s) found anywhere
 : inside the new <book>.
 :
 : @param $db  name of the database passed to db:replace
 : @param $doc element whose child elements become the content of <book>
 : @return empty sequence (the function is updating)
 :)
declare %updating function local:persist($db as xs:string, $doc as element()) as empty-sequence()
{
  let $record := element book { $doc/child::* }
  return db:replace($db, $record/descendant::key/text(), $record)
};
(: Fetches one search response from openlibrary.org for $search-term, parses
   the JSON with json:parse, and stores every entry under json/docs via
   local:persist in the $db database. :)
let $db := "books"
let $search-term := "tree"
(: Percent-encode the term so that a search term containing spaces, "&", "#"
   or non-ASCII characters cannot corrupt the query string. :)
let $uri := "http://openlibrary.org/search.json?title=" || fn:encode-for-uri($search-term)
let $docs := fetch:text($uri) => json:parse()
for $doc in $docs/json/docs/_
return local:persist($db, $doc)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "3.1"; | |
(:~
 : Builds a <book> element from a search-result entry as produced by
 : json:parse (element names contain doubled underscores, array members are
 : <_> children).
 :
 : Output layout:
 :   book/@key, book/@full-text,
 :   title (with @title), author* (with @key), date (with @first, @year),
 :   type, place, publisher, language, lccn, key, isbn*, texts/text*,
 :   seeds/seed*, subjects/subject*.
 :
 : NOTE(review): the input schema is not visible in this file. The paths
 : below that read text() directly (isbn, publish__date) and the path that
 : reads first__publish__year through <_> differ from their neighbours;
 : verify them against an actual response before relying on those fields.
 :
 : @param $doc parsed result entry (the signature allows a sequence, but a
 :             single <book> is always constructed)
 : @return the constructed <book> element
 :)
declare function local:convert($doc as element()*) as element()*
{
  element book {
    attribute key {$doc/edition__key/_/text()},
    attribute full-text {$doc/has__fulltext/text()},
    (: One <title> per title__suggest text node; @title carries the value of
       the plain title field. The node test must be text() - a bare "text"
       selects a child element named <text> and always left @title empty. :)
    for $title in $doc/title__suggest/text()
    return element title { attribute title {$doc/title/text()}, $title },
    (: Authors and author keys are paired by position. :)
    for $author at $x in $doc/author__name/_/text()
    return element author { attribute key {$doc/author__key/_[$x]/text()}, $author },
    element date {
      (: NOTE(review): @first is filled from publish__year and @year from
         first__publish__year - the names look swapped; confirm intent. :)
      attribute first {$doc/publish__year/_/text()},
      attribute year {$doc/first__publish__year/_/text()},
      $doc/publish__date/text()
    },
    element type { $doc/type/text() },
    element place { $doc/publish__place/_/text() },
    element publisher { $doc/publisher/_/text() },
    element language { $doc/language/_/text() },
    element lccn { $doc/lccn/_/text() },
    (: <key> is read by local:add as the database path of the book. :)
    element key { $doc/key/text() },
    for $isbn in $doc/isbn/text() return element isbn { $isbn },
    element texts { for $text in $doc/text/_/text() return element text { $text } },
    element seeds { for $seed in $doc/seed/_/text() return element seed { $seed } },
    element subjects { for $subject in $doc/subject/_/text() return element subject { $subject } }
  }
};
(:~
 : Stores a <book> element in a database with db:replace, using the value of
 : its <key> child as the target path.
 :
 : @param $book the <book> element to store
 : @param $db   name of the database passed to db:replace
 : @return empty sequence (the function is updating)
 :)
declare %updating function local:add( $book as element(book), $db as xs:string ) as empty-sequence()
{
  db:replace($db, $book/child::key, $book)
};
(: Fetches one search response from openlibrary.org for $search-term,
   converts every entry under json/docs with local:convert, and stores the
   resulting <book> elements in the $db database via local:add. :)
let $search-term := "literature"
let $db := "books"
(: Percent-encode the term so that a search term containing spaces, "&", "#"
   or non-ASCII characters cannot corrupt the query string. :)
let $uri := "http://openlibrary.org/search.json?title=" || fn:encode-for-uri($search-term)
let $json := fetch:text($uri)
let $xml := json:parse($json)
for $doc in $xml/json/docs/_
return local:convert($doc) => local:add($db)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "3.1"; | |
(:~
 : Builds a <book> element from a search-result entry as produced by
 : json:parse (element names contain doubled underscores, array members are
 : <_> children).
 :
 : Output layout:
 :   book/@key, book/@full-text,
 :   title (with @title), author* (with @key), date (with @first, @year),
 :   type, place, publisher, language, lccn, key, isbn*, texts/text*,
 :   seeds/seed*, subjects/subject*.
 :
 : NOTE(review): the input schema is not visible in this file. The paths
 : below that read text() directly (isbn, publish__date) and the path that
 : reads first__publish__year through <_> differ from their neighbours;
 : verify them against an actual response before relying on those fields.
 :
 : @param $doc parsed result entry (the signature allows a sequence, but a
 :             single <book> is always constructed)
 : @return the constructed <book> element
 :)
declare function local:convert($doc as element()*) as element()*
{
  element book {
    attribute key {$doc/edition__key/_/text()},
    attribute full-text {$doc/has__fulltext/text()},
    (: One <title> per title__suggest text node; @title carries the value of
       the plain title field. The node test must be text() - a bare "text"
       selects a child element named <text> and always left @title empty. :)
    for $title in $doc/title__suggest/text()
    return element title { attribute title {$doc/title/text()}, $title },
    (: Authors and author keys are paired by position. :)
    for $author at $x in $doc/author__name/_/text()
    return element author { attribute key {$doc/author__key/_[$x]/text()}, $author },
    element date {
      (: NOTE(review): @first is filled from publish__year and @year from
         first__publish__year - the names look swapped; confirm intent. :)
      attribute first {$doc/publish__year/_/text()},
      attribute year {$doc/first__publish__year/_/text()},
      $doc/publish__date/text()
    },
    element type { $doc/type/text() },
    element place { $doc/publish__place/_/text() },
    element publisher { $doc/publisher/_/text() },
    element language { $doc/language/_/text() },
    element lccn { $doc/lccn/_/text() },
    (: <key> is read by local:add as the database path of the book. :)
    element key { $doc/key/text() },
    for $isbn in $doc/isbn/text() return element isbn { $isbn },
    element texts { for $text in $doc/text/_/text() return element text { $text } },
    element seeds { for $seed in $doc/seed/_/text() return element seed { $seed } },
    element subjects { for $subject in $doc/subject/_/text() return element subject { $subject } }
  }
};
(:~
 : Stores a <book> element in a database with db:replace, using the value of
 : its <key> child as the target path.
 :
 : @param $book the <book> element to store
 : @param $db   name of the database passed to db:replace
 : @return empty sequence (the function is updating)
 :)
declare %updating function local:add( $book as element(book), $db as xs:string ) as empty-sequence()
{
  db:replace($db, $book/child::key, $book)
};
(: Pages through the openlibrary.org search results for $search-term,
   converts every entry under json/docs with local:convert, and stores the
   resulting <book> elements in the $db database via local:add. :)
let $search-term := "barth"
let $db := "books"
(: Percent-encode the term so that a search term containing spaces, "&", "#"
   or non-ASCII characters cannot corrupt the query string. :)
let $uri := "http://openlibrary.org/search.json?title=" || fn:encode-for-uri($search-term)
(: The first, un-paged request is used only to read the total hit count. :)
let $total := json:parse(fetch:text($uri))/json/numFound/text()
(: Page count derived from 100 hits per page - assumes that is the page size
   of the service; TODO confirm. :)
let $pages := fn:ceiling($total div 100)
for $page in (1 to xs:int($pages))
return
  let $result := json:parse(fetch:text($uri || "&page=" || $page))
  for $doc in $result/json/docs/_
  return local:convert($doc) => local:add($db)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "3.1"; | |
(:~
 : Stores a <full-text> element in a database with db:replace under the path
 : "full-text/" followed by $key from its 7th character onwards.
 :
 : NOTE(review): if keys look like "/works/...", character 7 is the second
 : slash, so the resulting path contains "full-text//" - confirm whether the
 : offset should be 8.
 :
 : @param $full-text the <full-text> element to store
 : @param $db        name of the database passed to db:replace
 : @param $key       key string the target path is derived from
 : @return empty sequence (the function is updating)
 :)
declare %updating function local:add($full-text as element(full-text), $db as xs:string, $key as xs:string ) as empty-sequence()
{
  db:replace($db, fn:concat("full-text/", fn:substring($key, 7)), $full-text)
};
(: For every stored book flagged as having full text (and not carrying the
   subject "Protected DAISY"), looks up its record on openlibrary.org, reads
   the ocaid value(s) from it, downloads the matching *_djvu.txt file from
   archive.org and stores it via local:add.

   Remote requests are wrapped so that one failing download (missing file,
   timeout, ...) is traced and skipped instead of aborting the whole run;
   without this a single error discards every pending update. :)
let $fetch := function($url as xs:string) as xs:string? {
  try { fetch:text($url) }
  catch * { fn:trace((), "skipped " || $url || ": " || $err:description) }
}
for $doc in fn:collection("books")//book[@full-text eq "true"]
where fn:not($doc//subjects/subject = "Protected DAISY")
for $key in $doc/key/text()
let $data := $fetch("https://openlibrary.org" || $key || ".json") ! json:parse(.)
let $ocaid := $data//ocaid/text()
for $book in $ocaid
(: NOTE(review): the "/5/items/" path segment is kept as found; confirm it
   is still a valid archive.org download location. :)
let $full-text := $fetch("https://archive.org/5/items/" || $book || "/" || $book || "_djvu.txt")
where fn:exists($full-text)
return local:add(element full-text {$full-text}, "books", $key)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment