Skip to content

Instantly share code, notes, and snippets.

@emchateau
Forked from CliffordAnderson/0-data-source.md
Created May 20, 2020 23:54
Show Gist options
  • Save emchateau/c17de15f2d11b135116bce8f60efb6ef to your computer and use it in GitHub Desktop.
Save emchateau/c17de15f2d11b135116bce8f60efb6ef to your computer and use it in GitHub Desktop.

XQuery Working Group

Text Mining at Scale

In this session, we will extract poems from the Victorian Women Writers Project. The electronic editions of these documents are maintained in TEI P5 format on GitHub. You can also download a zip file of the entire corpus.

The following exercises assume that you have loaded the documents from the Victorian Women Writers Project into a BaseX database. It is also assumed that you have named that database vwwp_tei.

(: List the base URI of every document in vwwp_tei whose MLA keyword list
   contains the term "poetry". :)
xquery version "3.1";
declare default element namespace "http://www.tei-c.org/ns/1.0";
fn:collection("vwwp_tei")[
  .//textClass/keywords[@scheme="#mla"]/list/item/term/text() = "poetry"
] ! fn:base-uri(.)
(: Retrieve the single document VAB7015.xml from the vwwp_tei database. :)
xquery version "3.1";
declare default element namespace "http://www.tei-c.org/ns/1.0";
let $edition := fn:doc("/vwwp_tei/VAB7015.xml")
return $edition
(: Return the text nodes that are direct children of the head of each
   poem division in VAB7015.xml. :)
xquery version "3.1";
declare default element namespace "http://www.tei-c.org/ns/1.0";
fn:doc("/vwwp_tei/VAB7015.xml")//div[@type="poem"]/head/text()
(: For each poem division in VAB7015.xml, emit a poem element holding the
   whitespace-normalized string value of that division. :)
xquery version "3.1";
declare default element namespace "http://www.tei-c.org/ns/1.0";
fn:doc("/vwwp_tei/VAB7015.xml")//div[@type="poem"] !
  <poem>{ fn:normalize-space(fn:string(.)) }</poem>
(: Collect, document by document, every poem division in vwwp_tei that has
   at least one head child. :)
xquery version "3.1";
declare default element namespace "http://www.tei-c.org/ns/1.0";
fn:collection("vwwp_tei") ! .//div[@type="poem"][head]
(: First pass at a tabular view: one record per titled poem found in the
   documents tagged with the MLA term "poetry". Title and poem text nodes
   are copied into the record as they are, without any clean-up. :)
xquery version "3.1";
declare default element namespace "http://www.tei-c.org/ns/1.0";
<csv>{
  (: keep only the documents whose MLA keyword list names "poetry" :)
  for $volume in fn:collection("vwwp_tei")[
    .//textClass/keywords[@scheme="#mla"]/list/item/term/text() = "poetry"
  ]
  (: keep only the poem divisions that carry a head :)
  for $entry in $volume//div[@type="poem"][head]
  return
    <record>
      <bookTitle>{ $volume/TEI/teiHeader/fileDesc/titleStmt/title/text() }</bookTitle>
      <poemTitle>{ $entry/head/text() }</poemTitle>
      <poemText>{ $entry//text() }</poemText>
    </record>
}</csv>
(: Build one record per titled poem from the documents tagged with the MLA
   term "poetry", with every field cleaned up for text mining:
     bookTitle - first text node of the header title, whitespace-normalized
     poemTitle - direct text nodes of the poem head(s), joined with a space
                 and whitespace-normalized; text inside child elements of
                 head is not included
     poemText  - all text nodes of the poem, joined with a space,
                 whitespace-normalized and lower-cased :)
xquery version "3.1";
declare default element namespace "http://www.tei-c.org/ns/1.0";
<csv>
{
  for $doc in fn:collection("vwwp_tei")
  (: genre is a property of the document, so test it once per document
     instead of once per poem :)
  where $doc//textClass/keywords[@scheme="#mla"]/list/item/term/text() = "poetry"
  let $bookTitle := fn:normalize-space(($doc/TEI/teiHeader/fileDesc/titleStmt/title/text())[1])
  for $poem in $doc//div[@type="poem"]
  where $poem/head
  (: join before inserting: adjacent text nodes placed in element content
     are merged without any separator, which would glue title parts together :)
  let $poemTitle := fn:normalize-space(fn:string-join($poem/head/text(), " "))
  let $poemText := fn:lower-case(fn:normalize-space(fn:string-join($poem//text(), " ")))
  return
    <record>
      <bookTitle>{$bookTitle}</bookTitle>
      <poemTitle>{$poemTitle}</poemTitle>
      <poemText>{$poemText}</poemText>
    </record>
}
</csv>
(: Export one CSV row per titled poem from the documents tagged with the MLA
   term "poetry". The output options request CSV serialization with a header
   line and comma-separated fields. Columns:
     bookTitle - first text node of the header title, whitespace-normalized
     poemTitle - direct text nodes of the poem head(s), joined with a space
                 and whitespace-normalized; text inside child elements of
                 head is not included
     poemText  - all text nodes of the poem, joined with a space,
                 whitespace-normalized and lower-cased :)
xquery version "3.1";
declare default element namespace "http://www.tei-c.org/ns/1.0";
declare namespace output = "http://www.w3.org/2010/xslt-xquery-serialization";
declare option output:method "csv";
declare option output:csv "header=yes, separator=comma";
<csv>
{
  for $doc in fn:collection("vwwp_tei")
  (: genre is a property of the document, so test it once per document
     instead of once per poem :)
  where $doc//textClass/keywords[@scheme="#mla"]/list/item/term/text() = "poetry"
  let $bookTitle := fn:normalize-space(($doc/TEI/teiHeader/fileDesc/titleStmt/title/text())[1])
  for $poem in $doc//div[@type="poem"]
  where $poem/head
  (: join before inserting: adjacent text nodes placed in element content
     are merged without any separator, which would glue title parts together
     inside a single CSV cell :)
  let $poemTitle := fn:normalize-space(fn:string-join($poem/head/text(), " "))
  let $poemText := fn:lower-case(fn:normalize-space(fn:string-join($poem//text(), " ")))
  return
    <record>
      <bookTitle>{$bookTitle}</bookTitle>
      <poemTitle>{$poemTitle}</poemTitle>
      <poemText>{$poemText}</poemText>
    </record>
}
</csv>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment