Created February 7, 2012 20:06
Some quick and dirty code to scrape two different wikis about Skyrim books and compile them into one list
import groovy.json.JsonBuilder
import groovy.json.JsonSlurper
// Lower-case titles that are not listed on either wiki and must be treated
// as non-books. printRecords tags a row "NB" when either column is in here.
def knownNonBooks = [
    "adonato's book",
    "alchemist's note",
    "bandit's journal",
    "decree of monument",
    "imperial missive (battle-born)",
    "love poem",
    "master illusion text",
    "oghma infinium", // Skill Book
    "power of the elements", // Diff versions
    "adventurer's journal",
    "butcher journal",
    "cicero's journal - final volume",
    "cicero's journal - volume 1",
    "cicero's journal - volume 2",
    "cicero's journal - volume 3",
    "cicero's journal - volume 4",
    "lymdrenn tenvanni's journal",
    "nepos's journal",
    "tolfdir's book",
    "uncommon taste - signed" // special
]
// Title normalisation table: lower-cased, trimmed title as scraped -> the
// spelling used in the merged list. Applied by both scrapers so that the same
// book ends up under the same key (the author got tired of updating the wikis).
def renames = [
    ("ahzirr trajijazaeri"): "ahzirr traajijazeri",
    ("biography of barenziah, vol 3"): "biography of barenziah, v3",
    ("feyfolken, book i"): "feyfolken i",
    ("forge hammer and anvil"): "forge, hammer and anvil",
    ("incident in necrom"): "incident at necrom",
    ("kolb and the dragon"): "kolb & the dragon",
    // NOTE(review): key and value are identical, so this entry renames nothing.
    ("mystery of talara, part 4"): "mystery of talara, part 4",
    ("mystery of talara, v1"): "mystery of talara, v 1",
    ("palla, book i"): "palla, volume 1",
    ("palla, book ii"): "palla, volume 2",
    ("song of skyrim"): "songs of skyrim",
    ("sovngarde, a reexamination"): "sovngarde: a reexamination",
    ("the alduin-akatosh dichotomy"): "the alduin/akatosh dichotomy",
    ("the lusty argonian maid vol 1"): "the lusty argonian maid, v1",
    ("the nirnoot missive"): "the nirnroot missive",
    ("the real barenziah, book i"): "the real barenziah, v1",
    ("the real barenziah, book ii"): "the real barenziah, v2",
    ("the real barenziah, book iii"): "the real barenziah, v3",
    ("the real barenziah, book iv"): "the real barenziah, v4",
    ("the real barenziah, book v"): "the real barenziah, v5",
    ("the song of pelinal, book i"): "the song of pelinal, v1",
    ("the song of pelinal, book ii"): "the song of pelinal, v2",
    ("the song of pelinal, book iii"): "the song of pelinal, v3",
    ("the song of pelinal, book iv"): "the song of pelinal, v4",
    ("the song of pelinal, book v"): "the song of pelinal, v5",
    ("the song of pelinal, book vi"): "the song of pelinal, v6",
    ("the song of pelinal, book vii"): "the song of pelinal, v7",
    ("the song of pelinal, book viii"): "the song of pelinal, v8",
    ("the tale of dro'zina"): "the tale of dro'zira",
    ("the wolf queen, book i"): "the wolf queen, v1",
    ("the wolf queen, book ii"): "the wolf queen, v2",
    ("the wolf queen, book iii"): "the wolf queen, v3",
    ("the wolf queen, book iv"): "the wolf queen, v4",
    ("the wolf queen, book v"): "the wolf queen, v5",
    ("the wolf queen, book vi"): "the wolf queen, v6",
    ("the wolf queen, book vii"): "the wolf queen, v7",
    ("the wolf queen, book viii"): "the wolf queen, v8",
    ("wabbajack ()"): "wabbajack"
]
// Builds the API query-string suffix that asks for the revision content
// (prop=revisions, rvprop=content) of the page called `title`.
// The title is inserted verbatim; no escaping is done here.
def getPage = {title ->
    "?action=query&titles=$title&prop=revisions&rvprop=content"
}
// Scrapes the Wikia "Books_(Skyrim)" index page, fetches every linked book
// page and writes SkyrimWikiaBooks.json in the form
//   { "books": { "<title>": { "id": ..., "skill": ..., "questItem": ... } } }
//
// NOTE(review): wikiaApi is an empty string in this copy of the script, so
// toURL() cannot succeed until the real API base URL is filled in.
def buildWikiaBookFile = {
    def wikiaApi = ''
    def wikiaBooksJson = new JsonBuilder()
    wikiaBooksJson.books {
        def wikiaBooksPage = wikiaApi.concat('?action=query&titles=Books_(Skyrim)&prop=revisions&rvprop=content').toURL().getContent().text
        // Per match: it[2] is the optional "PageName|" link target, it[3] the displayed title.
        (wikiaBooksPage =~ /(-\n\|\[\[([\d\s\w,.'()-:!&;]*\|)?)([\d\s\w,.'()-:!&;]*)/).each {
            def bookTitle = it[3]
            // Drop the trailing '|' from the link target; encode explicitly as UTF-8
            // instead of relying on the platform default charset.
            def pageTitle = URLEncoder.encode(it[2] ? it[2][0..-2] : it[3], 'UTF-8')
            def bookPage = wikiaApi.concat(getPage(pageTitle)).toURL().getContent().text

            // Prefer the "|title = ..." field of the book page over the link text.
            def titleMatch = bookPage =~ /(\|title\s=\s)([\d\s\w,.'()-:!&;]*)/
            bookTitle = (titleMatch.getCount() ? titleMatch[0][2] : bookTitle).trim()

            // Only the "|skyrim = {{Book/game ..." part of the page is searched for id/skill.
            def skyrimPartMatch = bookPage =~ /\|skyrim\s=\s\{\{Book\/game[\s\n\|\w=\[\]\(\)]*/
            def skyrimPart = skyrimPartMatch.getCount() ? skyrimPartMatch[0] : ""
            def idMatch = skyrimPart =~ /(id\s=\s)([0-9A-F]*)/
            def id = idMatch.getCount() ? idMatch[0][2] : null
            def skillMatch = skyrimPart =~ /(skill\s=\s\[\[)/
            def skill = skillMatch.getCount() > 0
            def questItem = bookPage.contains('Related Quest')

            // Normalise known spelling differences between the wikis.
            bookTitle = renames.get(bookTitle.toLowerCase().trim()) ?: bookTitle

            // Dynamic method call on the builder: adds one entry keyed by the title.
            "$bookTitle"(id: id, skill: skill, questItem: questItem)
        }
    }
    new File("SkyrimWikiaBooks.json").write(wikiaBooksJson.toPrettyString())
}
//buildWikiaBookFile(); print ".";
// Sanity-checks a scraped book file: every non-empty id must be unique and
// every book must have a name. `json` is a parsed document with a `books`
// map of title -> properties. Fails with an AssertionError on a violation.
//
// NOTE(review): the `props.id` operands were missing in this copy of the
// script; they are restored to match assertUniqueIds further down.
def qualityAssertions = {json ->
    Set ids = [] as Set
    json.books.each {book, props ->
        // Books without an id are allowed; only real ids are checked for clashes.
        if (props.id) {
            assert !ids.contains(props.id), "Book '$book' has duplicate id. ${props.id}"
            ids << props.id
        }
        assert book, "Book doesn't have a name. ${props}"
    }
}
// Collects the journal and note titles from sections 3 and 4 of the Wikia
// "Books_(Skyrim)" page and writes them to SkyrimWikiaNonBooks.json.
//
// NOTE(review): in this copy of the script the builder was never given any
// content (the file would contain "null"). The output is written as a flat
// JSON array because the file is later read back as a List - confirm.
// NOTE(review): wikiaApi is an empty string in this copy; fill in the real
// API base URL before running.
def buildWikiaJournalAndNotes = {
    def wikiaApi = ''

    // Extracts link titles from "*[[Page|Label]]" bullets; the label wins when present.
    def linkTitles = {wikiText ->
        (wikiText =~ /(\*\[\[)([\d\s\w,.'()-:!&;]*)(\|([\d\s\w,.'()-:!&;#]*))?/).collect { it[4] ?: it[2] }
    }

    def wikiaJournals = wikiaApi.concat('?action=query&titles=Books_(Skyrim)&prop=revisions&rvprop=content&rvsection=3').toURL().getContent().text
    def wikiaNotes = wikiaApi.concat('?action=query&titles=Books_(Skyrim)&prop=revisions&rvprop=content&rvsection=4').toURL().getContent().text

    def nonBooks = []
    nonBooks.addAll(linkTitles(wikiaJournals))
    nonBooks.addAll(linkTitles(wikiaNotes))

    def wikiaBooksJson = new JsonBuilder(nonBooks)
    new File("SkyrimWikiaNonBooks.json").write(wikiaBooksJson.toPrettyString())
}
//buildWikiaJournalAndNotes(); print ".";
// Parses SkyrimWikiaBooks.json from the working directory and returns the result.
def getWikiaBookJson = {
    new JsonSlurper().parseText(new File("SkyrimWikiaBooks.json").text)
}
// Parses SkyrimWikiaNonBooks.json from the working directory and returns the result.
def getWikiaNonBooks = {
    new JsonSlurper().parseText(new File("SkyrimWikiaNonBooks.json").text)
}
//qualityAssertions(getWikiaBookJson()); print ".";
// Scrapes the UESP "Skyrim:Books" index page, fetches every listed book page
// and writes SkyrimUespBooks.json in the form
//   { "books": { "<title>": { "id": ..., "skill": ... } } }
//
// NOTE(review): uespApi is an empty string in this copy of the script, so
// toURL() cannot succeed until the real API base URL is filled in.
def buildUespBookFile = {
    def uespApi = ''
    def uespBooksPage = uespApi.concat('?action=query&titles=Skyrim:Books&prop=revisions&rvprop=content').toURL().getContent().text
    // Per "{{Book Normal | title=... | alttitle=...": the alttitle wins when present.
    def uespBookTitles = (uespBooksPage =~ /(\{\{Book\sNormal\s\|\stitle=)([\d\s\w,.'()-:!&;]*)(\s?\|\s?alttitle=([\d\s\w,.'()-:!&;]*))?/).collect { it[4] ?: it[2] }
    def uespBooksJson = new JsonBuilder()
    uespBooksJson.books {
        uespBookTitles.each {title ->
            // NOTE(review): only spaces are replaced here; a title containing
            // '&' or '#' would still break the query string.
            def bookPage = uespApi.concat(getPage('Skyrim:' + title.replaceAll(" ", "_"))).toURL().getContent().text

            // id and skill are read from the "{{Book Summary ..." template only.
            def summaryMatch = bookPage =~ /(\{\{Book\sSummary)([\n\w\d\s\|='\[:,\]\&;#*\(\)-\@\~\!\$\%\^\_\+]*)/
            def summary = summaryMatch.getCount() ? summaryMatch[0][2] : ""
            def idMatch = summary =~ /(\|id\s?=\s?)([0-9A-F]*)/
            def id = idMatch.getCount() ? idMatch[0][2] : null
            def skillMatch = summary =~ /(skill\s?=\s?)([A-Z]*)/
            def skill = skillMatch.getCount() > 0

            // Strip the "(book)" disambiguation suffix. This must be a literal
            // replace: replaceAll("(book)", "") treats the argument as a regex
            // group, removes only the word "book" and leaves "()" behind.
            // Trimming matches what the Wikia scraper does with its titles.
            def bookTitle = title.replace("(book)", "").trim()

            // Normalise known spelling differences between the wikis.
            bookTitle = renames.get(bookTitle.toLowerCase()) ?: bookTitle

            // Dynamic method call on the builder: adds one entry keyed by the title.
            "$bookTitle"(id: id, skill: skill)
        }
    }
    new File("SkyrimUespBooks.json").write(uespBooksJson.toPrettyString())
}
//buildUespBookFile(); print ".";
// Parses SkyrimUespBooks.json from the working directory and returns the result.
def getUespBookJson = {
    new JsonSlurper().parseText(new File("SkyrimUespBooks.json").text)
}
//qualityAssertions(getUespBookJson()); print ".";
// Prints the title of every book in `json.books` that has no id and returns
// those entries.
//
// NOTE(review): the filter operand was missing in this copy of the script;
// `props.id` is restored from the closure's name and parameters.
def listUnknownIds = {json ->
    json.books.findAll {name, props -> !props.id }.each { println it.key }
}
//println "Wikia Books (journals not included) Count ${getWikiaBookJson().books.size()}"
//println "Uesp Books Count ${getUespBookJson().books.size()}"
// Prints a two-column, alphabetically merged comparison of the book titles in
// two parsed book files so that titles present in only one source stand out.
//
//   json1, json2  parsed documents with a `books` map (title -> properties)
//   title1/title2 column headings
//   nonBooks      titles to tag as "NB" (in addition to knownNonBooks)
//   showNonBooks  when false (default) rows tagged "NB" are not printed
//
// NOTE(review): the `b1Iter.next()` / `b2Iter.next()` operands were missing in
// this copy of the script and are restored here. The merge loop is also
// rewritten: looping on hasNext() dropped the last pending titles and could
// spin forever once the first list ran out.
def printRecords = {json1, title1, json2, title2, nonBooks, showNonBooks = false ->
    // Journals are skipped; titles are compared trimmed and lower-cased.
    def sortedTitles = {books ->
        books.findAll {k, v -> !v.journal}.keySet()*.trim()*.toLowerCase().sort()
    }
    def books1 = sortedTitles(json1.books)
    def books2 = sortedTitles(json2.books)
    def lowerNonBooks = nonBooks*.toLowerCase()*.trim().sort()

    def printRow = { c1, c2 ->
        def nonBook = (lowerNonBooks.contains(c1) || lowerNonBooks.contains(c2) ||
                knownNonBooks.contains(c1) || knownNonBooks.contains(c2)) ? "NB" : ""
        if (!showNonBooks && nonBook) return
        println "\t| ${nonBook.padRight(2)} | ${c1.padRight(45)} | ${c2.padRight(45)} |"
    }

    printRow(title1, title2)

    def b1Iter = books1.iterator()
    def b2Iter = books2.iterator()
    // null marks an exhausted side, so an empty list is handled as well.
    def advance = {iter -> iter.hasNext() ? iter.next() : null }
    String b1 = advance(b1Iter)
    String b2 = advance(b2Iter)

    // Classic sorted merge: print the smaller title alone, or both when equal.
    while (b1 != null || b2 != null) {
        if (b2 == null || (b1 != null && b1 < b2)) {
            printRow(b1, "")
            b1 = advance(b1Iter)
        } else if (b1 == null || b2 < b1) {
            printRow("", b2)
            b2 = advance(b2Iter)
        } else {
            printRow(b1, b2)
            b1 = advance(b1Iter)
            b2 = advance(b2Iter)
        }
    }
}
//printRecords(getWikiaBookJson(), "WIKIA", getUespBookJson(), "UESP", getWikiaNonBooks())
// Rewrites a book file in place with every title (key of `books`) trimmed and
// lower-cased; the per-book properties are kept as they are.
//
// NOTE(review): in this copy of the script the builder was never given any
// content. The result is written back under a top-level `books` key because
// that is the shape the rest of the script reads - confirm.
def lowerCaseTitleJson = {fileName ->
    def json = new JsonSlurper().parseText(new File(fileName).text)
    def newBooks = json.books.collectEntries {k, v ->
        [(k.toLowerCase().trim()): v]
    }
    def newJson = new JsonBuilder([books: newBooks])
    new File(fileName).write(newJson.toPrettyString())
}
//lowerCaseTitleJson("SkyrimUespBooks.json"); print ".";
//lowerCaseTitleJson("SkyrimWikiaBooks.json"); print ".";
// Rewrites a non-book file (a JSON array of titles) in place with every title
// trimmed and lower-cased.
//
// NOTE(review): the body of the collect closure and the builder content were
// missing in this copy of the script; they are reconstructed from the
// closure's name and from lowerCaseTitleJson - confirm.
def lowerNonBooks = {fileName ->
    def json = new JsonSlurper().parseText(new File(fileName).text)
    def newBooks = json.collect {k ->
        k.toLowerCase().trim()
    }
    def newJson = new JsonBuilder(newBooks)
    new File(fileName).write(newJson.toPrettyString())
}
//lowerNonBooks("SkyrimWikiaNonBooks.json"); print ".";
// Builds the final title -> id map and writes it to SkyrimMerged.json.
//
//   booksComplete  map of title -> properties; every title not in nonBooks is kept
//   books2         second map of title -> properties, used only to cross-check ids
//   nonBooks       titles to leave out
//
// Fails with an AssertionError when both sources know a book but disagree on
// its id.
//
// NOTE(review): the `props.id` / `otherProps.id` operands and the builder
// content were missing in this copy of the script; they are reconstructed
// from the parameter names and from how SkyrimMerged.json is read - confirm.
def mergeOnlyBooks = {booksComplete, books2, nonBooks ->
    def merged = booksComplete.findAll {name, p -> !nonBooks.contains(name)}.collectEntries {name, props ->
        def otherProps = books2.get(name)
        def id = props.id
        // Only compare when the second source actually has an id for this title.
        if (otherProps?.id) {
            assert id == otherProps.id, "Book $name mismatch ids"
        }
        [(name): id]
    }
    def mergedJson = new JsonBuilder(merged)
    new File("SkyrimMerged.json").write(mergedJson.toPrettyString())
}
//List nb = getWikiaNonBooks()
//assert nb.addAll(knownNonBooks)
//mergeOnlyBooks(getUespBookJson().books, getWikiaBookJson().books, nb); print ".";
// Interactively fills in missing ids: reads SkyrimMerged.json (title -> id),
// asks on the console for the id of every book that has none, and writes the
// completed map back to the same file.
//
// NOTE(review): in this copy of the script the builder was never given any
// content; writing the completed map back is reconstructed from the way
// SkyrimMerged.json is read elsewhere - confirm.
def manuallyFixMissingId = {
    def mergedFile = new File("SkyrimMerged.json")
    def books = new JsonSlurper().parseText(mergedFile.text)

    // System.console() is null when no interactive console is attached
    // (e.g. inside an IDE or with redirected input); fail with a clear message.
    def console = System.console()
    assert console != null, "manuallyFixMissingId needs an interactive console"

    def fixedBooks = books.collectEntries {name, id ->
        if (!id) {
            // Pass the title as an argument so a '%' in it is not read as a format directive.
            id = console.readLine('%s :', name)
        }
        [(name): id]
    }

    def fixedJson = new JsonBuilder(fixedBooks)
    mergedFile.write(fixedJson.toPrettyString())
}
// Checks that no id occurs twice in a title -> id map. Entries without an id
// are skipped. Fails with an AssertionError naming the second book that uses
// an already-seen id.
def assertUniqueIds = {books->
    Set ids = [] as Set
    books.each {book, id ->
        if (id) {
            assert !ids.contains(id), "Book '$book' has duplicate id. ${id}"
            ids << id
        }
    }
}
//assertUniqueIds(new JsonSlurper().parseText(new File("SkyrimMerged.json").text)); print ".";
// Writes SkyrimBooks.txt with one line per book, ordered by title:
//   player.additem <id>; <title>
// `books` is a map of title -> id.
def buildAddAll = {books->
    String addBooks = books.sort().collect {name, id ->
        "player.additem ${id}; ${name} \n"
    }.join('')
    new File("SkyrimBooks.txt").write(addBooks)
}
// The only step left enabled: read the merged title -> id map from
// SkyrimMerged.json, generate SkyrimBooks.txt from it, then print a dot.
buildAddAll(new JsonSlurper().parseText(new File("SkyrimMerged.json").text)); print ".";
