Skip to content

Instantly share code, notes, and snippets.

@rsimon
Created December 4, 2013 09:38
Show Gist options
  • Save rsimon/7784870 to your computer and use it in GitHub Desktop.
Save rsimon/7784870 to your computer and use it in GitHub Desktop.
Scala transformation script to extract plaintext from TEI - Pliny Natural History
import scala.xml.XML
import java.io.FileWriter
import scala.xml.transform.RewriteRule
import scala.xml.Node
import scala.xml.NodeSeq
import scala.xml.Elem
import scala.xml.transform.RuleTransformer
import scala.xml.Text
/**
 * Extracts plain text from a TEI XML file, one output file per chapter.
 *
 * Every `div2` element (chapter) found below a `div1` element (book) whose `n`
 * attribute is a number from 3 to 6 is written to a file named
 * `book<N>_chapter<M>.txt` in the current working directory, where `<M>` is the
 * chapter's own `n` attribute.
 *
 * The input file defaults to the path in `DefaultInput`; a different file can be
 * supplied as the first command-line argument.
 */
object TEI extends App {

  /** TEI file that is read when no path is passed on the command line. */
  private val DefaultInput: String =
    "/home/simonr/Workspaces/pelagios/pelagios3-scripts/tei/Perseus_text_1999.02.0137.xml"

  /** We're only interested in books 3 - 6 (the geographical ones). */
  private val GeographicalBooks: Range = 3 to 6

  /**
   * Rewrite rule that turns TEI markup into plain text:
   *  - `note` elements are dropped (replaced by empty text),
   *  - `head` elements become their text followed by a blank line,
   *  - line breaks inside text nodes become single spaces.
   * Everything else is passed through unchanged.
   */
  private val plainTextRule: RewriteRule = new RewriteRule {
    override def transform(n: Node): NodeSeq = n match {
      case e: Elem if e.label == "note" => Text("")
      case e: Elem if e.label == "head" => Text(e.text + "\n\n")
      case t: Text => Text(t.text.replace("\n", " "))
      case other => other
    }
  }

  // The transformer holds no per-chapter state, so one instance serves every chapter.
  private val plainTextTransformer: RuleTransformer = new RuleTransformer(plainTextRule)

  /**
   * Reads the `n` attribute of a book element as an integer.
   *
   * @return `None` when the attribute is missing or not an integer, so that such
   *         elements can be skipped instead of aborting the whole run.
   */
  private def bookNumber(book: Node): Option[Int] =
    try Some((book \ "@n").text.trim.toInt)
    catch { case _: NumberFormatException => None }

  /**
   * Applies `plainTextRule` to a chapter and normalises whitespace: runs of
   * spaces collapse to one, a space directly after a line break is removed,
   * and the result is trimmed.
   */
  private def toPlainText(chapter: Node): String =
    plainTextTransformer.transform(chapter).text
      .replaceAll(" +", " ")
      .replace("\n ", "\n")
      .trim()

  /**
   * Writes `content` to `fileName` as UTF-8.
   *
   * The encoding is explicit so the output does not depend on the platform
   * default charset, and the writer is closed even when writing fails.
   */
  private def writeUtf8(fileName: String, content: String): Unit = {
    val out = new java.io.OutputStreamWriter(new java.io.FileOutputStream(fileName), "UTF-8")
    try out.write(content)
    finally out.close() // close() also flushes
  }

  val xml = XML.loadFile(args.headOption.getOrElse(DefaultInput))
  val books = xml \\ "div1"

  books.foreach { book =>
    bookNumber(book).filter(number => GeographicalBooks.contains(number)).foreach { number =>
      (book \\ "div2").foreach { chapter =>
        val chapterId = (chapter \ "@n").text
        writeUtf8("book" + number + "_chapter" + chapterId + ".txt", toPlainText(chapter))
      }
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment