Skip to content

Instantly share code, notes, and snippets.

@derickson
Created May 20, 2012 04:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save derickson/2743783 to your computer and use it in GitHub Desktop.
Save derickson/2743783 to your computer and use it in GitHub Desktop.
Lord of the Rings txt to XML
xquery version "1.0-ml";

(:
 : Wraps every mention of a Fellowship member inside a line's LINE element
 : in a <character> element.
 :
 : One xdmp:eval call is issued per name; the name is handed to the
 : evaluated query through its external $character variable.
 : NOTE(review): presumably each eval runs as its own transaction, which is
 : what lets later names re-replace a LINE already updated for an earlier
 : name - confirm against the xdmp:eval defaults.
 :)
declare variable $FELLOWSHIP as xs:string* :=
  ("Frodo", "Sam", "Merry", "Pippin", "Boromir", "Gimli", "Legolas", "Aragorn", "Gandalf");

(: Query text run once per name. It searches /line-doc using the name as the
 : query and replaces each hit's line-content/LINE with a highlighted copy. :)
declare variable $HIGHLIGHT-QUERY as xs:string := '
xquery version"1.0-ml";
declare variable $character as xs:string external;
for $line in cts:search(/line-doc, $character)
return
xdmp:node-replace(
$line/line-content/LINE,
cts:highlight( $line/line-content/LINE, cts:word-query($character), <character>{$cts:text}</character>)
)
';

for $name in $FELLOWSHIP
return
  xdmp:eval($HIGHLIGHT-QUERY, (xs:QName("character"), $name))
#!/usr/bin/python
# regular expressions
import re
## xml libs
import lxml.etree as etree
from lxml.builder import E
from lxml.builder import ElementMaker
# Element factory; this rebinding replaces the `E` imported from lxml.builder.
E = ElementMaker()

# One tag-specific factory per element used in a chapter document.
# Attribute access on the maker (E.CHAPTER) and getattr(E, "CHAPTER") are the
# same lookup, so all factories are created in a single unpacking assignment.
(CHAPTER, TITLE, VOLUMETITLE, VOLUMENUM, BOOKTITLE, BOOKNUM,
 CHAPTERTITLE, CHAPTERNUM, LINES) = tuple(
    getattr(E, tagName)
    for tagName in (
        "CHAPTER", "TITLE", "VOLUMETITLE", "VOLUMENUM", "BOOKTITLE",
        "BOOKNUM", "CHAPTERTITLE", "CHAPTERNUM", "LINES",
    )
)
def saveChapter(lines, volNum, volText, bookNum, bookText, chapNum, chapText):
encoding = "mac-roman"
print "\tLines", volNum, volText, bookNum, bookText, chapNum, chapText
linesStr = "<LINES>"
for line in lines:
linesStr += '<LINE>'+line.decode(encoding)+'</LINE>'
linesStr += "</LINES>"
my_doc = CHAPTER(
TITLE("Lord of The Rings"),
VOLUMETITLE(volText.decode(encoding)),
VOLUMENUM(str(volNum)),
BOOKTITLE(bookText.decode(encoding)),
BOOKNUM(str(bookNum)),
CHAPTERTITLE(chapText.decode(encoding)),
CHAPTERNUM(str(chapNum)),
etree.XML(linesStr)
)
wf = file("lotr-"+str(volNum)+"-"+str(bookNum)+"-"+str(chapNum)+".xml", "w")
wf.write( etree.tostring(my_doc, pretty_print=False) )
wf.close()
# filenames of LOTR books
books = ["Fellowship of The Ring.txt", "The Two Towers.txt", "The Return of TheKing.txt"]
# read the fellowship book into a variable
fh = file( books[0] , 'r')
fellowship = fh.read()
fh.close()
lines = fellowship.split('\r')
tocLine = -1
endLine = -1
# Identify line boundaries
count = -1
for line in lines:
count += 1
if tocLine == -1 and re.search("CONTENTS",line) :
tocLine = count
elif endLine == -1 and re.search("------------",line) :
endLine = count
print "tocLine:", tocLine
print "endLine:", endLine
#TOC lists
tocVolumes = []
volumeIter = -1
tocBooks = []
bookIter = -1
tocChapters = []
# full the TOC lists
count = -1
for line in lines[tocLine+1:endLine]:
if re.match("\t",line):
#new volume
print "Detected Volume:", line.strip()
tocVolumes.append(line.strip())
volumeIter += 1
bookIter = -1
tocBooks.append([])
tocChapters.append([])
elif re.match(' Book',line):
#new book
print "Detected Book:", line.strip()
tocBooks[volumeIter].append(line.strip())
bookIter += 1
tocChapters[volumeIter].append([])
elif re.match(' Chapter [\d]* ',line):
#new chapter
chap = re.sub("Chapter [\d]* ","", line.strip())
print "Detected Chapter:", chap
tocChapters[volumeIter][bookIter].append(chap)
# array representation of the TOC
print tocVolumes
print tocBooks
print tocChapters
# nested text representation of the TOC
for v, vol in enumerate(tocVolumes):
print v, vol
for b, book in enumerate(tocBooks[v]):
print ".".join([str(v), str(b)]), book
for c, chap in enumerate(tocChapters[v][b]):
print ".".join([str(v), str(b), str(c)]), chap
tocVolIter = -1
tocBookIter = -1
tocChapIter = -1
# Load all of LOTR
fh = file( "lotr.txt" , 'r')
lotr = fh.read()
fh.close()
lines = lotr.split('\r')
#identify the start of all chapters
chapterStartLines = []
i = iter(lines[endLine+1:])
lineCount = endLine
try:
# Loop through all volumes identifying their start lines
for volumeNum, volumeText in enumerate(tocVolumes):
# find the volume head
while True:
line = i.next()
lineCount += 1
if re.match("[\ ]{4}[\s]*"+volumeText,line):
print lineCount, line
break
# book heads are not really mentioned
for bookNum, book in enumerate(tocBooks[volumeNum]):
print lineCount, book
isReadingChap = False
for chapNum, chapter in enumerate(tocChapters[volumeNum][bookNum]):
# find the chapter head
while True:
line = i.next()
lineCount += 1
if re.match('[\ ]{4}[\s]*"Chapter '+str(chapNum+1)+'"',line):
#found chapter head
print lineCount, chapter
print "Identified: ","Volume",volumeNum+1,"Book",bookNum+1,"Chapter",chapNum+1,tocChapters[volumeNum][bookNum][chapNum]
chapterStartLines.append(lineCount)
isReadingChap = True
break
elif isReadingChap:
pass
except StopIteration:
pass
# Using identified line starts now split and save the lines
chapCounter = -1
for volumeNum, volumeText in enumerate(tocVolumes):
for bookNum, book in enumerate(tocBooks[volumeNum]):
for chapNum, chapter in enumerate(tocChapters[volumeNum][bookNum]):
chapCounter += 1
startLine = chapterStartLines[chapCounter]
endLine = ""
if chapCounter+1 < len(chapterStartLines):
endLine = chapterStartLines[chapCounter+1] -1
else:
endLine = "END"
print chapCounter, tocChapters[volumeNum][bookNum][chapNum], startLine, endLine
print "Saving: ","Volume",volumeNum+1,"Book",bookNum+1,"Chapter",chapNum-1,tocChapters[volumeNum][bookNum][chapNum-2]
saveLines = []
if endLine == "END":
saveLines = lines[startLine:]
else:
saveLines = lines[startLine:endLine]
saveChapter(saveLines, volumeNum+1, tocVolumes[volumeNum], bookNum+1, tocBooks[volumeNum][bookNum], chapNum+1, tocChapters[volumeNum][bookNum][chapNum])
xquery version "1.0-ml";

(:
 : Renders an HTML table with one row per chapter and one column per
 : Fellowship member. Each cell is the xdmp:estimate of /line-doc search
 : results matching that volume, book and chapter and having a <character>
 : element whose value is the member's name.
 : NOTE(review): cts:element-values presumably needs range indexes on
 : VOLUMENUM, BOOKNUM and CHAPTERNUM - confirm the database configuration.
 :)
declare variable $fellowship as xs:string* := ("Frodo", "Sam", "Merry", "Pippin", "Boromir", "Gimli", "Legolas", "Aragorn", "Gandalf");

(: Element-value query on the element named $name, matching the string form of $value. :)
declare function local:value-query($name as xs:string, $value as xs:anyAtomicType) as cts:query
{
  cts:element-value-query(xs:QName($name), fn:string($value))
};

<table border="1">
  <thead>
    <tr>
      <th>Chapter</th>
      {
        for $character in $fellowship
        return <th>{$character}</th>
      }
    </tr>
  </thead>
  <tbody>
  {
    for $volume in cts:element-values(xs:QName("VOLUMENUM"))
    let $in-volume := local:value-query("VOLUMENUM", $volume)
    order by $volume ascending
    return
      (: books are limited to those occurring in the current volume :)
      for $book in cts:element-values(xs:QName("BOOKNUM"), (), (), $in-volume)
      let $in-book := local:value-query("BOOKNUM", $book)
      order by $book ascending
      return
        (: chapters are limited to the current volume and book :)
        for $chapter in cts:element-values(xs:QName("CHAPTERNUM"), (), (),
                          cts:and-query(($in-volume, $in-book)))
        let $in-chapter := local:value-query("CHAPTERNUM", $chapter)
        (: row label of the form VV-BB-CC :)
        let $chapter-text :=
          fn:string-join(
            for $part in ($volume, $book, $chapter)
            return fn:format-number($part, "00"),
            "-")
        order by $chapter ascending
        return
          <tr>
            <td>{$chapter-text}</td>
            {
              for $character in $fellowship
              let $mentions :=
                cts:and-query((
                  $in-volume,
                  $in-book,
                  $in-chapter,
                  local:value-query("character", $character)
                ))
              let $count := xdmp:estimate(cts:search(/line-doc, $mentions))
              return <td>{$count}</td>
            }
          </tr>
  }
  </tbody>
</table>
xquery version "1.0-ml";

(:
 : Splits every CHAPTER document in the "staging" collection into one
 : line-doc document per LINE and inserts it at
 :   /books/<encoded title>/<volume>/<book>/<chapter>/<line position>.xml
 : in the collections "book" and the chapter's title. The result sequence
 : contains the URI of each document inserted.
 :)

(:
 : Builds the line-doc for one line: line-meta carries every child node of
 : the chapter except LINES, plus the 1-based line number and the line's
 : string value; line-content carries the LINE element itself.
 :)
declare function local:line-doc(
  $chap as element(),
  $line as element(),
  $position as xs:integer
) as element(line-doc)
{
  element line-doc {
    element line-meta {
      $chap/node() except $chap/LINES,
      element line-number { $position },
      element original-text { $line/fn:string() }
    },
    element line-content { $line }
  }
};

for $chap in fn:collection("staging")/CHAPTER
let $title := $chap/TITLE/fn:string()
let $path-steps := (
  "/books",
  fn:encode-for-uri($title),
  $chap/VOLUMENUM/fn:string(),
  $chap/BOOKNUM/fn:string(),
  $chap/CHAPTERNUM/fn:string()
)
let $folder-uri := fn:string-join($path-steps, "/")
return
  for $line at $l in $chap/LINES/LINE
  let $uri := fn:concat($folder-uri, "/", $l, ".xml")
  return
    ($uri, xdmp:document-insert($uri, local:line-doc($chap, $line, $l), (), ("book", $title)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment