Skip to content

Instantly share code, notes, and snippets.

@mathias-goebel
Created December 12, 2017 10:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mathias-goebel/76132794c4a242fd0016506bb12c62a7 to your computer and use it in GitHub Desktop.
Save mathias-goebel/76132794c4a242fd0016506bb12c62a7 to your computer and use it in GitHub Desktop.
TCF to sentence in plain text
xquery version "3.1";
declare namespace dta="http://www.dspin.de/data/textcorpus";
(:~
 : Imports files from a file-system directory into a database collection.
 :
 : Every file matching "**/*" below $source is stored with the MIME type
 : "application/xml"; the directory layout is preserved in the collection.
 :
 : @param $source file-system directory to read from
 : @param $target URI of the collection the files are stored into
 : @return the result of xmldb:store-files-from-pattern
 :)
declare function local:load-dta($source, $target) {
    xmldb:store-files-from-pattern(
        $target,            (: collection URI :)
        $source,            (: directory :)
        "**/*",             (: pattern :)
        "application/xml",  (: MIME type :)
        true()              (: preserve structure :)
    )
};
(:
 : Main query: for every TextCorpus element found in the collection, write one
 : plain-text file holding one sentence per line. Each sentence is rendered as
 : the space-separated lemmas looked up through the sentence's token IDs.
 :)
let $col := "/db/data/full"
(: Output directory on the server's file system; constant, so bound once. :)
let $path := "/home/mathias/development/word2vec/texts/dta/"
for $corpus in collection($col)//dta:TextCorpus
(: Last path segment of the document URI, i.e. the resource name. :)
let $name := ($corpus/base-uri() => tokenize("/"))[last()]
(: Lookup table: value of lemma/@tokenIDs -> lemma element.
 : map:merge is the standard XQuery 3.1 replacement for the deprecated map:new.
 : NOTE(review): assumes each @tokenIDs value occurs on at most one lemma;
 : verify against the data if duplicates are possible. :)
let $lemma-by-token-id :=
    map:merge(
        for $lemma in $corpus//dta:lemma
        return map:entry($lemma/string(@tokenIDs), $lemma)
    )
let $lines :=
    for $sentence in $corpus//dta:sentence
    (: Normalise whitespace first so IDs separated by tabs, line breaks or
     : repeated blanks are still split into individual keys. :)
    let $ids := tokenize(normalize-space($sentence/@tokenIDs), " ")
    (: IDs without a map entry yield the empty sequence and are skipped. :)
    return string-join($ids ! $lemma-by-token-id(.), " ")
let $data :=
    string-join($lines, "&#10;")
    (: Keep only letters (incl. German umlauts and ß), blanks and newlines. :)
    => replace("[^a-zA-ZüöäßÜÖÄ \n]", "")
let $binarydata := util:string-to-binary($data)
return
    (: Anchor the pattern so only a trailing ".tcf.xml" suffix is replaced. :)
    file:serialize-binary($binarydata, $path || replace($name, "\.tcf\.xml$", ".sentences.txt"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment