Skip to content

Instantly share code, notes, and snippets.

@CliffordAnderson
Last active November 4, 2016 21:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save CliffordAnderson/159899389fb4978f8461df525f652145 to your computer and use it in GitHub Desktop.
Save CliffordAnderson/159899389fb4978f8461df525f652145 to your computer and use it in GitHub Desktop.
Sample XQuery code for extracting, transforming, and loading Open Library / Internet Archive (IA) metadata into BaseX
xquery version "3.1";
(:~
 : Wraps the children of one parsed search result in a <book> element and
 : writes it to the database with db:replace.
 :
 : @param $db   name of the target database
 : @param $doc  element whose child elements become the content of <book>
 :)
declare %updating function local:persist($db as xs:string, $doc as element()) as empty-sequence()
{
  let $wrapped := <book>{ $doc/* }</book>
  (: the text of the <key> descendant is used as the resource path :)
  return db:replace($db, $wrapped//key/text(), $wrapped)
};
(: Fetches one page of title-search results as JSON, parses it, and hands
   every entry under json/docs/_ to local:persist for storage in $db. :)
let $db := "books"
let $search-term := "tree"
(: Percent-encode the term so that spaces, '&', '#', etc. in a search term
   cannot break or alter the query string. :)
let $uri := "http://openlibrary.org/search.json?title=" || fn:encode-for-uri($search-term)
let $docs := fetch:text($uri) => json:parse()
for $doc in $docs/json/docs/_
return local:persist($db, $doc)
xquery version "3.1";
(:~
 : Converts one parsed search-result entry into a <book> element.
 :
 : Fields read through `_` children are copied from each `_` child; when
 : several text nodes land in one attribute, attribute construction joins
 : their values with single spaces.
 :
 : @param $doc  the result entry (children such as title, key, author__name)
 : @return a single <book> element
 :)
declare function local:convert($doc as element()*) as element()*
{
  element book {
    attribute key { $doc/edition__key/_/text() },
    attribute full-text { $doc/has__fulltext/text() },
    for $title in $doc/title__suggest/text()
    return element title {
      (: Fix: this was `$doc/title/text`, which selects a child ELEMENT named
         "text" and therefore always produced an empty attribute. :)
      attribute title { $doc/title/text() },
      $title
    },
    (: authors and author keys are paired by position :)
    for $author at $x in $doc/author__name/_/text()
    return element author {
      attribute key { $doc/author__key/_[$x]/text() },
      $author
    },
    element date {
      (: NOTE(review): @first is fed from publish__year and @year from
         first__publish__year; the names look swapped, confirm intent. :)
      attribute first { $doc/publish__year/_/text() },
      attribute year { $doc/first__publish__year/_/text() },
      (: NOTE(review): reads direct text, unlike the `_`-based fields; confirm
         that publish__date is not a list. :)
      $doc/publish__date/text()
    },
    element type { $doc/type/text() },
    element place { $doc/publish__place/_/text() },
    element publisher { $doc/publisher/_/text() },
    element language { $doc/language/_/text() },
    element lccn { $doc/lccn/_/text() },
    element key { $doc/key/text() },
    (: NOTE(review): reads direct text of isbn, not its `_` children; confirm. :)
    for $isbn in $doc/isbn/text() return element isbn { $isbn },
    element texts { for $text in $doc/text/_/text() return element text { $text } },
    element seeds { for $seed in $doc/seed/_/text() return element seed { $seed } },
    element subjects { for $subject in $doc/subject/_/text() return element subject { $subject } }
  }
};
(:~
 : Writes a <book> element to the database with db:replace, using the
 : book's <key> child as the resource path.
 :
 : @param $book  the book element to store
 : @param $db    name of the target database
 :)
declare %updating function local:add( $book as element(book), $db as xs:string ) as empty-sequence()
{
  db:replace($db, $book/key, $book)
};
(: Fetches one page of title-search results, converts every entry under
   json/docs/_ with local:convert, and stores the result via local:add. :)
let $search-term := "literature"
let $db := "books"
(: Percent-encode the term so that spaces, '&', '#', etc. in a search term
   cannot break or alter the query string. :)
let $uri := "http://openlibrary.org/search.json?title=" || fn:encode-for-uri($search-term)
let $json := fetch:text($uri)
let $xml := json:parse($json)
for $doc in $xml/json/docs/_
return local:convert($doc) => local:add($db)
xquery version "3.1";
(:~
 : Converts one parsed search-result entry into a <book> element.
 :
 : Fields read through `_` children are copied from each `_` child; when
 : several text nodes land in one attribute, attribute construction joins
 : their values with single spaces.
 :
 : @param $doc  the result entry (children such as title, key, author__name)
 : @return a single <book> element
 :)
declare function local:convert($doc as element()*) as element()*
{
  element book {
    attribute key { $doc/edition__key/_/text() },
    attribute full-text { $doc/has__fulltext/text() },
    for $title in $doc/title__suggest/text()
    return element title {
      (: Fix: this was `$doc/title/text`, which selects a child ELEMENT named
         "text" and therefore always produced an empty attribute. :)
      attribute title { $doc/title/text() },
      $title
    },
    (: authors and author keys are paired by position :)
    for $author at $x in $doc/author__name/_/text()
    return element author {
      attribute key { $doc/author__key/_[$x]/text() },
      $author
    },
    element date {
      (: NOTE(review): @first is fed from publish__year and @year from
         first__publish__year; the names look swapped, confirm intent. :)
      attribute first { $doc/publish__year/_/text() },
      attribute year { $doc/first__publish__year/_/text() },
      (: NOTE(review): reads direct text, unlike the `_`-based fields; confirm
         that publish__date is not a list. :)
      $doc/publish__date/text()
    },
    element type { $doc/type/text() },
    element place { $doc/publish__place/_/text() },
    element publisher { $doc/publisher/_/text() },
    element language { $doc/language/_/text() },
    element lccn { $doc/lccn/_/text() },
    element key { $doc/key/text() },
    (: NOTE(review): reads direct text of isbn, not its `_` children; confirm. :)
    for $isbn in $doc/isbn/text() return element isbn { $isbn },
    element texts { for $text in $doc/text/_/text() return element text { $text } },
    element seeds { for $seed in $doc/seed/_/text() return element seed { $seed } },
    element subjects { for $subject in $doc/subject/_/text() return element subject { $subject } }
  }
};
(:~
 : Writes a <book> element to the database with db:replace, using the
 : book's <key> child as the resource path.
 :
 : @param $book  the book element to store
 : @param $db    name of the target database
 :)
declare %updating function local:add( $book as element(book), $db as xs:string ) as empty-sequence()
{
  db:replace($db, $book/key, $book)
};
(: Runs a title search, derives the number of result pages from numFound,
   then fetches each page, converts every entry with local:convert and
   stores it via local:add. :)
let $search-term := "barth"
let $db := "books"
(: Percent-encode the term so that spaces, '&', '#', etc. in a search term
   cannot break or alter the query string. :)
let $uri := "http://openlibrary.org/search.json?title=" || fn:encode-for-uri($search-term)
let $first-page := json:parse(fetch:text($uri))
(: 100 is taken to be the number of results per page; TODO confirm against
   the search API. :)
let $pages := fn:ceiling($first-page/json/numFound/text() div 100)
for $page in (1 to xs:int($pages))
return
  (: Fix: a bare '&' is not allowed in an XQuery string literal; it has to be
     written as the entity reference &amp;amp;, which still yields "&page=". :)
  let $page-json := fetch:text($uri || "&amp;page=" || $page)
  let $page-xml := json:parse($page-json)
  for $doc in $page-xml/json/docs/_
  return local:convert($doc) => local:add($db)
xquery version "3.1";
(:~
 : Writes a <full-text> element to the database with db:replace under the
 : path "full-text/" followed by $key minus its first six characters.
 :
 : @param $full-text  element holding the fetched text
 : @param $db         name of the target database
 : @param $key        key of the book; everything from character 7 on is
 :                    used in the path. NOTE(review): presumably this drops
 :                    a fixed six-character prefix; confirm the key format.
 :)
declare %updating function local:add($full-text as element(full-text), $db as xs:string, $key as xs:string ) as empty-sequence()
{
  db:replace($db, "full-text/" || fn:substring($key, 7), $full-text)
};
(: For every stored book flagged full-text="true" whose subjects do not
   include "Protected DAISY": fetch the book's JSON record, read the ocaid
   values from it, download the matching *_djvu.txt file for each, and store
   the text via local:add.
   NOTE(review): every ocaid of one book is written under the same $key, so
   several ocaids would target the same path; confirm this is intended.
   NOTE(review): confirm the archive.org URL shape is still served. :)
for $record in fn:collection("books")//book[@full-text eq "true"]
where fn:not($record//subjects/subject = "Protected DAISY")
for $key in $record/key/text()
let $details := json:parse(fetch:text("https://openlibrary.org" || $key || ".json"))
for $ia-id in $details//ocaid/text()
let $ocr-text := fetch:text("https://archive.org/5/items/" || $ia-id || "/" || $ia-id || "_djvu.txt")
return local:add(element full-text { $ocr-text }, "books", $key)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment