derickson/enrich.xqy

## enrich.xqy
xquery version "1.0-ml";

(:
  Wraps every occurrence of each listed name inside a stored LINE element
  in a <character> element.

  The module text below is handed to xdmp:eval once per name, with the name
  bound to the external variable $character. For every /line-doc matching
  the name, the evaluated module replaces line-content/LINE with the
  cts:highlight result for a word query on that name.
:)
let $highlight-module := '
xquery version"1.0-ml";
declare variable $character as xs:string external;
for $line in cts:search(/line-doc, $character)
return
  xdmp:node-replace(
    $line/line-content/LINE,
    cts:highlight( $line/line-content/LINE, cts:word-query($character), <character>{$cts:text}</character>)
  )
 '
for $member in ("Frodo", "Sam", "Merry", "Pippin", "Boromir", "Gimli", "Legolas", "Aragorn", "Gandalf")
return
  xdmp:eval($highlight-module, (xs:QName("character"), $member))

## parse.py
#!/usr/bin/python

# regular expressions
import re

## xml libs
import lxml.etree as etree
from lxml.builder import E
from lxml.builder import ElementMaker

E = ElementMaker()
CHAPTER = E.CHAPTER
TITLE = E.TITLE
VOLUMETITLE = E.VOLUMETITLE
VOLUMENUM = E.VOLUMENUM
BOOKTITLE = E.BOOKTITLE
BOOKNUM = E.BOOKNUM
CHAPTERTITLE = E.CHAPTERTITLE
CHAPTERNUM = E.CHAPTERNUM
LINES = E.LINES


def saveChapter(lines, volNum, volText, bookNum, bookText, chapNum, chapText):
	encoding = "mac-roman"
	print "\tLines", volNum, volText, bookNum, bookText, chapNum, chapText

	linesStr = "<LINES>"
	for line in lines:
		linesStr += '<LINE>'+line.decode(encoding)+'</LINE>'
	linesStr += "</LINES>"

	my_doc = CHAPTER(
		TITLE("Lord of The Rings"),
		VOLUMETITLE(volText.decode(encoding)),
		VOLUMENUM(str(volNum)),
		BOOKTITLE(bookText.decode(encoding)),
		BOOKNUM(str(bookNum)),
		CHAPTERTITLE(chapText.decode(encoding)),
		CHAPTERNUM(str(chapNum)),
		etree.XML(linesStr)
	)
	wf = file("lotr-"+str(volNum)+"-"+str(bookNum)+"-"+str(chapNum)+".xml", "w")
	wf.write( etree.tostring(my_doc, pretty_print=False) )
	wf.close()

# filenames of LOTR books
books = ["Fellowship of The Ring.txt", "The Two Towers.txt", "The Return of TheKing.txt"]

# read the fellowship book into a variable
fh = file( books[0] , 'r')
fellowship = fh.read()
fh.close()


lines = fellowship.split('\r')

tocLine = -1
endLine = -1

# Identify line boundaries
count = -1
for line in lines:
	count += 1
	if tocLine == -1 and re.search("CONTENTS",line) :
		tocLine = count
	elif endLine == -1 and re.search("------------",line) :
		endLine = count

print "tocLine:", tocLine
print "endLine:", endLine

#TOC lists
tocVolumes = []
volumeIter = -1
tocBooks = []
bookIter = -1
tocChapters = []

# full the TOC lists
count = -1
for line in lines[tocLine+1:endLine]:
	if re.match("\t",line):
		#new volume
		print "Detected Volume:", line.strip()
		tocVolumes.append(line.strip())
		volumeIter += 1
		bookIter = -1
		tocBooks.append([])
		tocChapters.append([])
	elif re.match(' Book',line):
		#new book
		print "Detected Book:", line.strip()
		tocBooks[volumeIter].append(line.strip())
		bookIter += 1
		tocChapters[volumeIter].append([])
	elif re.match(' Chapter [\d]* ',line):
		#new chapter
		chap = re.sub("Chapter [\d]* ","", line.strip())
		print "Detected Chapter:", chap
		tocChapters[volumeIter][bookIter].append(chap)

# array representation of the TOC
print tocVolumes
print tocBooks
print tocChapters

# nested text representation of the TOC
for v, vol in enumerate(tocVolumes):
	print v, vol
	for b, book in enumerate(tocBooks[v]):
		print ".".join([str(v), str(b)]), book
		for c, chap in enumerate(tocChapters[v][b]):
			print ".".join([str(v), str(b), str(c)]), chap


tocVolIter = -1
tocBookIter = -1
tocChapIter = -1

# Load all of LOTR
fh = file( "lotr.txt" , 'r')
lotr = fh.read()
fh.close()
lines = lotr.split('\r')


#identify the start of all chapters
chapterStartLines = []
i = iter(lines[endLine+1:])
lineCount = endLine
try:
	# Loop through all volumes identifying their start lines
	for volumeNum, volumeText in enumerate(tocVolumes):
		# find the volume head
		while True:
			line = i.next()
			lineCount += 1

			if re.match("[\ ]{4}[\s]*"+volumeText,line):
				print lineCount, line
				break

		# book heads are not really mentioned
		for bookNum, book in enumerate(tocBooks[volumeNum]):
			print lineCount, book

			isReadingChap = False

			for chapNum, chapter in enumerate(tocChapters[volumeNum][bookNum]):

				# find the chapter head
				while True:
					line = i.next()
					lineCount += 1

					if re.match('[\ ]{4}[\s]*"Chapter '+str(chapNum+1)+'"',line):
						#found chapter head
						print lineCount, chapter
						print "Identified: ","Volume",volumeNum+1,"Book",bookNum+1,"Chapter",chapNum+1,tocChapters[volumeNum][bookNum][chapNum]
						chapterStartLines.append(lineCount)

						isReadingChap = True
						break
					elif isReadingChap:
						pass


except StopIteration:
	pass


# Using identified line starts now split and save the lines
chapCounter = -1
for volumeNum, volumeText in enumerate(tocVolumes):
	for bookNum, book in enumerate(tocBooks[volumeNum]):
		for chapNum, chapter in enumerate(tocChapters[volumeNum][bookNum]):
			chapCounter += 1
			startLine = chapterStartLines[chapCounter]
			endLine = ""
			if  chapCounter+1 < len(chapterStartLines):
				endLine = chapterStartLines[chapCounter+1] -1
			else:
				endLine = "END"
			print chapCounter, tocChapters[volumeNum][bookNum][chapNum], startLine, endLine
			print "Saving: ","Volume",volumeNum+1,"Book",bookNum+1,"Chapter",chapNum-1,tocChapters[volumeNum][bookNum][chapNum-2]
			saveLines = []
			if endLine == "END":
				saveLines = lines[startLine:]
			else:
				saveLines = lines[startLine:endLine]
			saveChapter(saveLines, volumeNum+1, tocVolumes[volumeNum], bookNum+1, tocBooks[volumeNum][bookNum], chapNum+1, tocChapters[volumeNum][bookNum][chapNum])


## report.xqy
xquery version "1.0-ml";

(:
  Renders an HTML table with one row per volume/book/chapter combination and
  one column per name in $fellowship. Each cell holds the xdmp:estimate of
  /line-doc documents in that chapter that have a <character> element whose
  value is the name.
:)
declare variable $fellowship as xs:string* := ("Frodo", "Sam", "Merry", "Pippin", "Boromir", "Gimli", "Legolas", "Aragorn", "Gandalf");

<table border="1">
  <thead>
    <tr>
      <th>Chapter</th>
      {
        for $heading in $fellowship
        return <th>{$heading}</th>
      }
    </tr>
  </thead>
  <tbody>
  {
    for $volume in cts:element-values(xs:QName("VOLUMENUM"))
    let $in-volume := cts:element-value-query(xs:QName("VOLUMENUM"), fn:string($volume))
    order by $volume ascending
    return
      for $book in cts:element-values(xs:QName("BOOKNUM"), (), (), $in-volume)
      let $in-book := cts:element-value-query(xs:QName("BOOKNUM"), fn:string($book))
      order by $book ascending
      return
        for $chapter in cts:element-values(xs:QName("CHAPTERNUM"), (), (), cts:and-query(($in-volume, $in-book)))
        let $in-chapter := cts:element-value-query(xs:QName("CHAPTERNUM"), fn:string($chapter))
        (: row label: zero-padded volume-book-chapter :)
        let $chapter-text := fn:string-join((fn:format-number($volume, "00"), fn:format-number($book, "00"), fn:format-number($chapter, "00")), "-")
        order by $chapter ascending
        return
          <tr>
            <td>{$chapter-text}</td>
            {
              for $member in $fellowship
              let $mentions := cts:element-value-query(xs:QName("character"), $member)
              let $count := xdmp:estimate(
                cts:search(/line-doc, cts:and-query(($in-volume, $in-book, $in-chapter, $mentions)))
              )
              return <td>{$count}</td>
            }
          </tr>
  }
  </tbody>
</table>

## split-book-lines.xqy
xquery version "1.0-ml";

(:
  Splits every CHAPTER document in the "staging" collection into one
  line-doc document per LINE.

  Each line-doc is inserted at
    /books/<encoded title>/<volume>/<book>/<chapter>/<line position>.xml
  in the collections "book" and the chapter's title. It carries the
  chapter's children other than LINES, the line position and the line's
  string value under line-meta, and the LINE element itself under
  line-content. The expression returns the URI of every inserted document.
:)
for $chap in fn:collection("staging")/CHAPTER
let $title := $chap/TITLE/fn:string()
let $folder-uri :=
  fn:string-join(
    (
      "/books",
      fn:encode-for-uri($title),
      $chap/VOLUMENUM/fn:string(),
      $chap/BOOKNUM/fn:string(),
      $chap/CHAPTERNUM/fn:string()
    ),
    "/"
  )
let $chapter-meta := $chap/node() except $chap/LINES
for $line at $l in $chap/LINES/LINE
let $uri := fn:concat($folder-uri, "/", $l, ".xml")
let $doc :=
  element line-doc {
    element line-meta {
      $chapter-meta,
      element line-number { $l },
      element original-text { fn:string($line) }
    },
    element line-content { $line }
  }
return
  ($uri, xdmp:document-insert($uri, $doc, (), ("book", $title)))
	xquery version "1.0-ml";

	(:
	  Wraps every occurrence of each listed name inside a stored LINE element
	  in a <character> element.

	  The module text below is handed to xdmp:eval once per name, with the name
	  bound to the external variable $character. For every /line-doc matching
	  the name, the evaluated module replaces line-content/LINE with the
	  cts:highlight result for a word query on that name.
	:)
	let $highlight-module := '
	xquery version"1.0-ml";
	declare variable $character as xs:string external;
	for $line in cts:search(/line-doc, $character)
	return
	xdmp:node-replace(
	$line/line-content/LINE,
	cts:highlight( $line/line-content/LINE, cts:word-query($character), <character>{$cts:text}</character>)
	)
	'
	for $member in ("Frodo", "Sam", "Merry", "Pippin", "Boromir", "Gimli", "Legolas", "Aragorn", "Gandalf")
	return
	  xdmp:eval($highlight-module, (xs:QName("character"), $member))
	#!/usr/bin/python

	# regular expressions
	import re

	## xml libs
	import lxml.etree as etree
	from lxml.builder import E
	from lxml.builder import ElementMaker

	E = ElementMaker()
	CHAPTER = E.CHAPTER
	TITLE = E.TITLE
	VOLUMETITLE = E.VOLUMETITLE
	VOLUMENUM = E.VOLUMENUM
	BOOKTITLE = E.BOOKTITLE
	BOOKNUM = E.BOOKNUM
	CHAPTERTITLE = E.CHAPTERTITLE
	CHAPTERNUM = E.CHAPTERNUM
	LINES = E.LINES



	def saveChapter(lines, volNum, volText, bookNum, bookText, chapNum, chapText):
	encoding = "mac-roman"
	print "\tLines", volNum, volText, bookNum, bookText, chapNum, chapText

	linesStr = "<LINES>"
	for line in lines:
	linesStr += '<LINE>'+line.decode(encoding)+'</LINE>'
	linesStr += "</LINES>"

	my_doc = CHAPTER(
	TITLE("Lord of The Rings"),
	VOLUMETITLE(volText.decode(encoding)),
	VOLUMENUM(str(volNum)),
	BOOKTITLE(bookText.decode(encoding)),
	BOOKNUM(str(bookNum)),
	CHAPTERTITLE(chapText.decode(encoding)),
	CHAPTERNUM(str(chapNum)),
	etree.XML(linesStr)
	)
	wf = file("lotr-"+str(volNum)+"-"+str(bookNum)+"-"+str(chapNum)+".xml", "w")
	wf.write( etree.tostring(my_doc, pretty_print=False) )
	wf.close()

	# filenames of LOTR books
	books = ["Fellowship of The Ring.txt", "The Two Towers.txt", "The Return of TheKing.txt"]

	# read the fellowship book into a variable
	fh = file( books[0] , 'r')
	fellowship = fh.read()
	fh.close()


	lines = fellowship.split('\r')

	tocLine = -1
	endLine = -1

	# Identify line boundaries
	count = -1
	for line in lines:
	count += 1
	if tocLine == -1 and re.search("CONTENTS",line) :
	tocLine = count
	elif endLine == -1 and re.search("------------",line) :
	endLine = count

	print "tocLine:", tocLine
	print "endLine:", endLine

	#TOC lists
	tocVolumes = []
	volumeIter = -1
	tocBooks = []
	bookIter = -1
	tocChapters = []

	# full the TOC lists
	count = -1
	for line in lines[tocLine+1:endLine]:
	if re.match("\t",line):
	#new volume
	print "Detected Volume:", line.strip()
	tocVolumes.append(line.strip())
	volumeIter += 1
	bookIter = -1
	tocBooks.append([])
	tocChapters.append([])
	elif re.match(' Book',line):
	#new book
	print "Detected Book:", line.strip()
	tocBooks[volumeIter].append(line.strip())
	bookIter += 1
	tocChapters[volumeIter].append([])
	elif re.match(' Chapter [\d]* ',line):
	#new chapter
	chap = re.sub("Chapter [\d]* ","", line.strip())
	print "Detected Chapter:", chap
	tocChapters[volumeIter][bookIter].append(chap)

	# array representation of the TOC
	print tocVolumes
	print tocBooks
	print tocChapters

	# nested text representation of the TOC
	for v, vol in enumerate(tocVolumes):
	print v, vol
	for b, book in enumerate(tocBooks[v]):
	print ".".join([str(v), str(b)]), book
	for c, chap in enumerate(tocChapters[v][b]):
	print ".".join([str(v), str(b), str(c)]), chap


	tocVolIter = -1
	tocBookIter = -1
	tocChapIter = -1

	# Load all of LOTR
	fh = file( "lotr.txt" , 'r')
	lotr = fh.read()
	fh.close()
	lines = lotr.split('\r')


	#identify the start of all chapters
	chapterStartLines = []
	i = iter(lines[endLine+1:])
	lineCount = endLine
	try:
	# Loop through all volumes identifying their start lines
	for volumeNum, volumeText in enumerate(tocVolumes):
	# find the volume head
	while True:
	line = i.next()
	lineCount += 1

	if re.match("[\ ]{4}[\s]*"+volumeText,line):
	print lineCount, line
	break

	# book heads are not really mentioned
	for bookNum, book in enumerate(tocBooks[volumeNum]):
	print lineCount, book

	isReadingChap = False

	for chapNum, chapter in enumerate(tocChapters[volumeNum][bookNum]):

	# find the chapter head
	while True:
	line = i.next()
	lineCount += 1

	if re.match('[\ ]{4}[\s]*"Chapter '+str(chapNum+1)+'"',line):
	#found chapter head
	print lineCount, chapter
	print "Identified: ","Volume",volumeNum+1,"Book",bookNum+1,"Chapter",chapNum+1,tocChapters[volumeNum][bookNum][chapNum]
	chapterStartLines.append(lineCount)

	isReadingChap = True
	break
	elif isReadingChap:
	pass




	except StopIteration:
	pass


	# Using identified line starts now split and save the lines
	chapCounter = -1
	for volumeNum, volumeText in enumerate(tocVolumes):
	for bookNum, book in enumerate(tocBooks[volumeNum]):
	for chapNum, chapter in enumerate(tocChapters[volumeNum][bookNum]):
	chapCounter += 1
	startLine = chapterStartLines[chapCounter]
	endLine = ""
	if chapCounter+1 < len(chapterStartLines):
	endLine = chapterStartLines[chapCounter+1] -1
	else:
	endLine = "END"
	print chapCounter, tocChapters[volumeNum][bookNum][chapNum], startLine, endLine
	print "Saving: ","Volume",volumeNum+1,"Book",bookNum+1,"Chapter",chapNum-1,tocChapters[volumeNum][bookNum][chapNum-2]
	saveLines = []
	if endLine == "END":
	saveLines = lines[startLine:]
	else:
	saveLines = lines[startLine:endLine]
	saveChapter(saveLines, volumeNum+1, tocVolumes[volumeNum], bookNum+1, tocBooks[volumeNum][bookNum], chapNum+1, tocChapters[volumeNum][bookNum][chapNum])
	xquery version "1.0-ml";

	(:
	  Renders an HTML table with one row per volume/book/chapter combination and
	  one column per name in $fellowship. Each cell holds the xdmp:estimate of
	  /line-doc documents in that chapter that have a <character> element whose
	  value is the name.
	:)
	declare variable $fellowship as xs:string* := ("Frodo", "Sam", "Merry", "Pippin", "Boromir", "Gimli", "Legolas", "Aragorn", "Gandalf");

	<table border="1">
	  <thead>
	    <tr>
	      <th>Chapter</th>
	      {
	        for $heading in $fellowship
	        return <th>{$heading}</th>
	      }
	    </tr>
	  </thead>
	  <tbody>
	  {
	    for $volume in cts:element-values(xs:QName("VOLUMENUM"))
	    let $in-volume := cts:element-value-query(xs:QName("VOLUMENUM"), fn:string($volume))
	    order by $volume ascending
	    return
	      for $book in cts:element-values(xs:QName("BOOKNUM"), (), (), $in-volume)
	      let $in-book := cts:element-value-query(xs:QName("BOOKNUM"), fn:string($book))
	      order by $book ascending
	      return
	        for $chapter in cts:element-values(xs:QName("CHAPTERNUM"), (), (), cts:and-query(($in-volume, $in-book)))
	        let $in-chapter := cts:element-value-query(xs:QName("CHAPTERNUM"), fn:string($chapter))
	        (: row label: zero-padded volume-book-chapter :)
	        let $chapter-text := fn:string-join((fn:format-number($volume, "00"), fn:format-number($book, "00"), fn:format-number($chapter, "00")), "-")
	        order by $chapter ascending
	        return
	          <tr>
	            <td>{$chapter-text}</td>
	            {
	              for $member in $fellowship
	              let $mentions := cts:element-value-query(xs:QName("character"), $member)
	              let $count := xdmp:estimate(
	                cts:search(/line-doc, cts:and-query(($in-volume, $in-book, $in-chapter, $mentions)))
	              )
	              return <td>{$count}</td>
	            }
	          </tr>
	  }
	  </tbody>
	</table>
	xquery version "1.0-ml";

	(:
	  Splits every CHAPTER document in the "staging" collection into one
	  line-doc document per LINE.

	  Each line-doc is inserted at
	    /books/<encoded title>/<volume>/<book>/<chapter>/<line position>.xml
	  in the collections "book" and the chapter's title. It carries the
	  chapter's children other than LINES, the line position and the line's
	  string value under line-meta, and the LINE element itself under
	  line-content. The expression returns the URI of every inserted document.
	:)
	for $chap in fn:collection("staging")/CHAPTER
	let $title := $chap/TITLE/fn:string()
	let $folder-uri :=
	  fn:string-join(
	    (
	      "/books",
	      fn:encode-for-uri($title),
	      $chap/VOLUMENUM/fn:string(),
	      $chap/BOOKNUM/fn:string(),
	      $chap/CHAPTERNUM/fn:string()
	    ),
	    "/"
	  )
	let $chapter-meta := $chap/node() except $chap/LINES
	for $line at $l in $chap/LINES/LINE
	let $uri := fn:concat($folder-uri, "/", $l, ".xml")
	let $doc :=
	  element line-doc {
	    element line-meta {
	      $chapter-meta,
	      element line-number { $l },
	      element original-text { fn:string($line) }
	    },
	    element line-content { $line }
	  }
	return
	  ($uri, xdmp:document-insert($uri, $doc, (), ("book", $title)))