Skip to content

Instantly share code, notes, and snippets.

@CliffordAnderson
Last active May 20, 2020 23:54
Show Gist options
  • Save CliffordAnderson/b1b81d193b038c882480f1302580b7ec to your computer and use it in GitHub Desktop.
Save CliffordAnderson/b1b81d193b038c882480f1302580b7ec to your computer and use it in GitHub Desktop.
Code snippets for XQuery Working Group (Text Mining at Scale)

XQuery Working Group

XQuery and XPath Full Text 1.0

In this session, we will be exploring the XQuery and XPath Full Text 1.0 standard. Our goal is to take the records that we created during our prior class from the Victorian Women Writers Project and persist them to another database where we will analyze their contents for textual patterns.

The following exercises assume that you have loaded the documents from the Victorian Women Writers Project into a BaseX database named `vwwp_tei`.

xquery version "3.1";

(:~
 : Builds one <record> element per titled poem found in the "vwwp_tei"
 : collection and returns the resulting sequence of records.
 :
 : Only documents whose MLA keyword list contains the term "poetry" are
 : considered, and only tei:div[@type = "poem"] elements that have a tei:head.
 :
 : Each record carries the document URI, book-level metadata (author, title,
 : date), the poem title, the XPath of the poem within its document, and the
 : poem text with punctuation stripped and whitespace normalized.
 :)
declare namespace tei = "http://www.tei-c.org/ns/1.0";

let $records :=
  for $doc in fn:collection("vwwp_tei")
  (: The genre test depends only on the document, so apply it once per
     document instead of once per poem. :)
  where $doc//tei:textClass/tei:keywords[@scheme = "#mla"]/tei:list/tei:item/tei:term/text() = "poetry"
  let $docId := fn:base-uri($doc)
  let $titleStmt := $doc/tei:TEI/tei:teiHeader/tei:fileDesc/tei:titleStmt
  let $bookTitle := ($titleStmt/tei:title/text())[1]
  (: A title statement may list several authors; normalize each one and
     separate them explicitly rather than letting their text nodes run
     together. Authors with no direct text are skipped. :)
  let $bookAuthor :=
    fn:string-join(
      ($titleStmt/tei:author ! fn:normalize-space(fn:string-join(text(), " ")))[.],
      "; "
    )
  (: First date found, preferring imprint, then bibl, then publicationStmt. :)
  let $bookDate := (
    $doc//tei:imprint/tei:date/text(),
    $doc//tei:bibl/tei:date/text(),
    $doc//tei:publicationStmt/tei:date/text()
  )[1]
  (: Poems without a heading are skipped. :)
  for $poem in $doc//tei:div[@type = "poem"][tei:head]
  (: A heading can consist of several text nodes (e.g. split by child
     elements); join them with a space so words do not fuse. :)
  let $poemTitle := fn:normalize-space(fn:string-join($poem/tei:head/text(), " "))
  let $poemPath := fn:path($poem)
  let $poemText :=
    fn:string-join($poem//text(), " ")
      (: The first two characters (em dash, hyphen) map to the two spaces in
         the third argument, so "sea—and" becomes "sea and" instead of
         "seaand"; the remaining punctuation has no counterpart and is
         deleted. :)
      => fn:translate("—-'*‘’.,?!:;", "  ")
      => fn:normalize-space()
  return
    <record>
      <docId>{$docId}</docId>
      <bookAuthor>{$bookAuthor}</bookAuthor>
      <bookTitle>{$bookTitle}</bookTitle>
      <bookDate>{$bookDate}</bookDate>
      <poemTitle>{$poemTitle}</poemTitle>
      <poemPath>{$poemPath}</poemPath>
      <poemText>{$poemText}</poemText>
    </record>
return $records
xquery version "3.1";
(: Writes every item of $records to the "text-mining-records" database as its
   own resource, named after the item's 1-based position in the sequence
   ("1.xml", "2.xml", ...).
   NOTE: $records is not declared in this snippet; it has to be bound by the
   query that builds the records before this expression can run. :)
for $record at $file-id in $records
return db:replace("text-mining-records", $file-id || ".xml", $record)
xquery version "3.1";
(: Returns every document in the "text-mining-records" database, unfiltered. :)
fn:collection("text-mining-records")
xquery version "3.1";
(: Returns the documents in "text-mining-records" whose text matches the
   full-text search term "windy". :)
fn:collection("text-mining-records")[. contains text "windy"]
xquery version "3.1";
(: Returns the documents in "text-mining-records" that match both search
   terms; the "all" option requires every listed term to be present. :)
fn:collection("text-mining-records")[. contains text {"windy", "city"} all]
xquery version "3.1";
(: For each stored record matching "husbands", emits a <hit> element that
   carries the full-text score as an attribute, copies every child of the
   record except poemText, and appends a poemText in which the matches are
   highlighted by ft:mark. :)
for $record score $score in fn:collection("text-mining-records")[. contains text {"husbands"}]
(: All record fields other than the poem text are copied through unchanged. :)
let $fields := $record/record/*[fn:not(self::poemText)]
let $marked := ft:mark($record/record/poemText/text()[. contains text {"husbands"}])
return
  <hit score="{$score}">{
    $fields,
    <poemText>{ $marked }</poemText>
  }</hit>
xquery version "3.1";
(: Same shape as a single-term search, but a record only qualifies when it
   contains both "husbands" and "wives" ("all"). Each <hit> carries the score,
   the record's fields other than poemText, and the poem text with matches
   highlighted by ft:mark. :)
for $record score $score in fn:collection("text-mining-records")[. contains text {"husbands", "wives"} all]
let $fields := $record/record/*[fn:not(self::poemText)]
let $marked := ft:mark($record/record/poemText/text()[. contains text {"husbands", "wives"} all])
return
  <hit score="{$score}">{
    $fields,
    <poemText>{ $marked }</poemText>
  }</hit>
xquery version "3.1";
(: Finds records containing both "jury" and "court", with stemming enabled for
   the match. Each <hit> carries the score, the record's fields other than
   poemText, and a poemText built with ft:extract instead of the whole
   marked-up poem. :)
for $record score $score in fn:collection("text-mining-records")[. contains text {"jury", "court"} all using stemming]
let $fields := $record/record/*[fn:not(self::poemText)]
let $excerpt := ft:extract($record/record/poemText/text()[. contains text {"jury", "court"} all using stemming])
return
  <hit score="{$score}">{
    $fields,
    <poemText>{ $excerpt }</poemText>
  }</hit>
xquery version "3.1";
(: Two independent full-text examples, evaluated as a sequence of two booleans.
   The comma between them is required: two expressions placed one after the
   other without a separator do not form a valid query body.

   1. Thesaurus lookup: tests whether the phrase matches "task" once the
      thesaurus at the given URL is applied.
   2. Stop words: tests whether the two phrases match once the words listed in
      the stop-word file at the given URL are applied as stop words.

   Both resources are fetched from remote URLs, so the query needs network
   access. :)
(
  "My duty is to teach XQuery" contains text "task"
    using thesaurus at "https://dev.w3.org/cvsweb/~checkout~/2007/xpath-full-text-10-test-suite/TestSuiteStagingArea/TestSources/usability.xml?rev=1.8;content-type=application%2Fxml",
  "propagating few errors" contains text "propagating of errors"
    using stop words at "http://files.basex.org/etc/stopwords.txt"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment