Lord of the Rings txt to XML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "1.0-ml";

(: Wrap every mention of a Fellowship member inside the stored LINE
   elements in a <character> element. Each name is processed by its own
   xdmp:eval call; the name is handed to the evaluated module through
   the external variable $character. :)
let $fellowship := ("Frodo", "Sam", "Merry", "Pippin", "Boromir", "Gimli", "Legolas", "Aragorn", "Gandalf")

(: Module text run once per name: finds the line documents matching the
   name and replaces their LINE element with a highlighted copy. :)
let $highlight-module := '
  xquery version"1.0-ml";
  declare variable $character as xs:string external;
  for $line in cts:search(/line-doc, $character)
  return
    xdmp:node-replace(
      $line/line-content/LINE,
      cts:highlight( $line/line-content/LINE, cts:word-query($character), <character>{$cts:text}</character>)
    )
'

for $name in $fellowship
return
  xdmp:eval($highlight-module, (xs:QName("character"), $name))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# regular expressions | |
import re | |
## xml libs | |
import lxml.etree as etree | |
from lxml.builder import E | |
from lxml.builder import ElementMaker | |
# Element factory; this rebinds the E imported from lxml.builder above.
E = ElementMaker()

# One tag factory per element name used in the chapter documents.
CHAPTER, TITLE, LINES = E.CHAPTER, E.TITLE, E.LINES
VOLUMETITLE, VOLUMENUM = E.VOLUMETITLE, E.VOLUMENUM
BOOKTITLE, BOOKNUM = E.BOOKTITLE, E.BOOKNUM
CHAPTERTITLE, CHAPTERNUM = E.CHAPTERTITLE, E.CHAPTERNUM
def saveChapter(lines, volNum, volText, bookNum, bookText, chapNum, chapText): | |
encoding = "mac-roman" | |
print "\tLines", volNum, volText, bookNum, bookText, chapNum, chapText | |
linesStr = "<LINES>" | |
for line in lines: | |
linesStr += '<LINE>'+line.decode(encoding)+'</LINE>' | |
linesStr += "</LINES>" | |
my_doc = CHAPTER( | |
TITLE("Lord of The Rings"), | |
VOLUMETITLE(volText.decode(encoding)), | |
VOLUMENUM(str(volNum)), | |
BOOKTITLE(bookText.decode(encoding)), | |
BOOKNUM(str(bookNum)), | |
CHAPTERTITLE(chapText.decode(encoding)), | |
CHAPTERNUM(str(chapNum)), | |
etree.XML(linesStr) | |
) | |
wf = file("lotr-"+str(volNum)+"-"+str(bookNum)+"-"+str(chapNum)+".xml", "w") | |
wf.write( etree.tostring(my_doc, pretty_print=False) ) | |
wf.close() | |
# filenames of LOTR books | |
books = ["Fellowship of The Ring.txt", "The Two Towers.txt", "The Return of TheKing.txt"] | |
# read the fellowship book into a variable | |
fh = file( books[0] , 'r') | |
fellowship = fh.read() | |
fh.close() | |
lines = fellowship.split('\r') | |
tocLine = -1 | |
endLine = -1 | |
# Identify line boundaries | |
count = -1 | |
for line in lines: | |
count += 1 | |
if tocLine == -1 and re.search("CONTENTS",line) : | |
tocLine = count | |
elif endLine == -1 and re.search("------------",line) : | |
endLine = count | |
print "tocLine:", tocLine | |
print "endLine:", endLine | |
#TOC lists | |
tocVolumes = [] | |
volumeIter = -1 | |
tocBooks = [] | |
bookIter = -1 | |
tocChapters = [] | |
# full the TOC lists | |
count = -1 | |
for line in lines[tocLine+1:endLine]: | |
if re.match("\t",line): | |
#new volume | |
print "Detected Volume:", line.strip() | |
tocVolumes.append(line.strip()) | |
volumeIter += 1 | |
bookIter = -1 | |
tocBooks.append([]) | |
tocChapters.append([]) | |
elif re.match(' Book',line): | |
#new book | |
print "Detected Book:", line.strip() | |
tocBooks[volumeIter].append(line.strip()) | |
bookIter += 1 | |
tocChapters[volumeIter].append([]) | |
elif re.match(' Chapter [\d]* ',line): | |
#new chapter | |
chap = re.sub("Chapter [\d]* ","", line.strip()) | |
print "Detected Chapter:", chap | |
tocChapters[volumeIter][bookIter].append(chap) | |
# array representation of the TOC | |
print tocVolumes | |
print tocBooks | |
print tocChapters | |
# nested text representation of the TOC | |
for v, vol in enumerate(tocVolumes): | |
print v, vol | |
for b, book in enumerate(tocBooks[v]): | |
print ".".join([str(v), str(b)]), book | |
for c, chap in enumerate(tocChapters[v][b]): | |
print ".".join([str(v), str(b), str(c)]), chap | |
tocVolIter = -1 | |
tocBookIter = -1 | |
tocChapIter = -1 | |
# Load all of LOTR | |
fh = file( "lotr.txt" , 'r') | |
lotr = fh.read() | |
fh.close() | |
lines = lotr.split('\r') | |
#identify the start of all chapters | |
chapterStartLines = [] | |
i = iter(lines[endLine+1:]) | |
lineCount = endLine | |
try: | |
# Loop through all volumes identifying their start lines | |
for volumeNum, volumeText in enumerate(tocVolumes): | |
# find the volume head | |
while True: | |
line = i.next() | |
lineCount += 1 | |
if re.match("[\ ]{4}[\s]*"+volumeText,line): | |
print lineCount, line | |
break | |
# book heads are not really mentioned | |
for bookNum, book in enumerate(tocBooks[volumeNum]): | |
print lineCount, book | |
isReadingChap = False | |
for chapNum, chapter in enumerate(tocChapters[volumeNum][bookNum]): | |
# find the chapter head | |
while True: | |
line = i.next() | |
lineCount += 1 | |
if re.match('[\ ]{4}[\s]*"Chapter '+str(chapNum+1)+'"',line): | |
#found chapter head | |
print lineCount, chapter | |
print "Identified: ","Volume",volumeNum+1,"Book",bookNum+1,"Chapter",chapNum+1,tocChapters[volumeNum][bookNum][chapNum] | |
chapterStartLines.append(lineCount) | |
isReadingChap = True | |
break | |
elif isReadingChap: | |
pass | |
except StopIteration: | |
pass | |
# Using identified line starts now split and save the lines | |
chapCounter = -1 | |
for volumeNum, volumeText in enumerate(tocVolumes): | |
for bookNum, book in enumerate(tocBooks[volumeNum]): | |
for chapNum, chapter in enumerate(tocChapters[volumeNum][bookNum]): | |
chapCounter += 1 | |
startLine = chapterStartLines[chapCounter] | |
endLine = "" | |
if chapCounter+1 < len(chapterStartLines): | |
endLine = chapterStartLines[chapCounter+1] -1 | |
else: | |
endLine = "END" | |
print chapCounter, tocChapters[volumeNum][bookNum][chapNum], startLine, endLine | |
print "Saving: ","Volume",volumeNum+1,"Book",bookNum+1,"Chapter",chapNum-1,tocChapters[volumeNum][bookNum][chapNum-2] | |
saveLines = [] | |
if endLine == "END": | |
saveLines = lines[startLine:] | |
else: | |
saveLines = lines[startLine:endLine] | |
saveChapter(saveLines, volumeNum+1, tocVolumes[volumeNum], bookNum+1, tocBooks[volumeNum][bookNum], chapNum+1, tocChapters[volumeNum][bookNum][chapNum]) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "1.0-ml";

declare variable $fellowship as xs:string* := ("Frodo", "Sam", "Merry", "Pippin", "Boromir", "Gimli", "Legolas", "Aragorn", "Gandalf");

(: HTML table with one row per volume/book/chapter combination and one
   column per name in $fellowship. Each cell is the xdmp:estimate of the
   line-doc search restricted to that chapter and to a <character>
   element holding the name. :)
<table border="1">
  <thead>
    <tr>
      <th>Chapter</th>
      {
        for $name in $fellowship
        return <th>{$name}</th>
      }
    </tr>
  </thead>
  <tbody>
    {
      for $volume in cts:element-values(xs:QName("VOLUMENUM"))
      let $in-volume := cts:element-value-query(xs:QName("VOLUMENUM"), fn:string($volume))
      order by $volume ascending
      return
        for $book in cts:element-values(xs:QName("BOOKNUM"), (), (), $in-volume)
        let $in-book := cts:element-value-query(xs:QName("BOOKNUM"), fn:string($book))
        order by $book ascending
        return
          for $chapter in cts:element-values(xs:QName("CHAPTERNUM"), (), (), cts:and-query(($in-volume, $in-book)))
          let $in-chapter := cts:element-value-query(xs:QName("CHAPTERNUM"), fn:string($chapter))
          (: Row label, e.g. volume-book-chapter as two-digit numbers :)
          let $label :=
            fn:string-join(
              (fn:format-number($volume, "00"), fn:format-number($book, "00"), fn:format-number($chapter, "00")),
              "-")
          order by $chapter ascending
          return
            <tr>
              <td>{$label}</td>
              {
                for $name in $fellowship
                let $mentions :=
                  xdmp:estimate(
                    cts:search(/line-doc, cts:and-query((
                      $in-volume,
                      $in-book,
                      $in-chapter,
                      cts:element-value-query(xs:QName("character"), $name)
                    ))))
                return <td>{$mentions}</td>
              }
            </tr>
    }
  </tbody>
</table>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "1.0-ml";

(: For every CHAPTER document in the "staging" collection, insert one
   line-doc document per LINE element. Each line-doc carries the chapter's
   non-LINES children plus the line number and plain text as metadata,
   and the LINE element itself as content. Returns each URI followed by
   the result of its insert. :)
for $chap in fn:collection("staging")/CHAPTER
let $title := $chap/TITLE/fn:string()
(: Everything in the chapter except the LINES container :)
let $chapter-meta := $chap/node() except $chap/LINES
let $folder-uri :=
  fn:string-join(
    ("/books",
     fn:encode-for-uri($title),
     $chap/VOLUMENUM/fn:string(),
     $chap/BOOKNUM/fn:string(),
     $chap/CHAPTERNUM/fn:string()),
    "/")
return
  for $line at $position in $chap/LINES/LINE
  let $uri := fn:concat($folder-uri, "/", $position, ".xml")
  let $doc :=
    <line-doc>
      <line-meta>{
        $chapter-meta,
        <line-number>{$position}</line-number>,
        <original-text>{$line/fn:string()}</original-text>
      }</line-meta>
      <line-content>{$line}</line-content>
    </line-doc>
  return
    ($uri, xdmp:document-insert($uri, $doc, (), ("book", $title)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment