Skip to content

Instantly share code, notes, and snippets.

@bendoerr
Created February 7, 2012 20:06
Show Gist options
  • Save bendoerr/1761644 to your computer and use it in GitHub Desktop.
Save bendoerr/1761644 to your computer and use it in GitHub Desktop.
Some quick and dirty code to scrape two different wikis about Skyrim books and compile them into one list
import groovy.json.JsonBuilder
import groovy.json.JsonSlurper
// Lower-case titles that are flagged as non-books ("NB") by printRecords even
// though they do not come from the scraped journal/note lists.
// Each title appears exactly once (the original list repeated "bandit's journal").
def knownNonBooks = [ // Not listed on any of the wikis
    "adonato's book",
    "alchemist's note",
    "bandit's journal",
    "decree of monument",
    "imperial missive (battle-born)",
    "love poem",
    "master illusion text",
    "oghma infinium", // Skill Book
    "power of the elements", // Diff versions
    "adventurer's journal",
    "butcher journal",
    "cicero's journal - final volume",
    "cicero's journal - volume 1",
    "cicero's journal - volume 2",
    "cicero's journal - volume 3",
    "cicero's journal - volume 4",
    "lymdrenn tenvanni's journal",
    "nepos's journal",
    "tolfdir's book",
    "uncommon taste - signed" // special
]
// Title normalisation table used while scraping.
// Keys are scraped titles after toLowerCase().trim(); values are the titles that
// replace them. The scrapers look a title up with renames.get(...) and, when an
// entry exists, use the mapped value as the JSON key instead of the scraped title.
def renames = [ // I got tired of updating the wikis
("ahzirr trajijazaeri"): "ahzirr traajijazeri",
("biography of barenziah, vol 3"): "biography of barenziah, v3",
("feyfolken, book i"): "feyfolken i",
("forge hammer and anvil"): "forge, hammer and anvil",
("incident in necrom"): "incident at necrom",
("kolb and the dragon"): "kolb & the dragon",
("mystery of talara, part 4"): "mystery of talara, part 4", // maps to itself; a hit still replaces the scraped title with this lower-case form
("mystery of talara, v1"): "mystery of talara, v 1",
("palla, book i"): "palla, volume 1",
("palla, book ii"): "palla, volume 2",
("song of skyrim"): "songs of skyrim",
("sovngarde, a reexamination"): "sovngarde: a reexamination",
("the alduin-akatosh dichotomy"): "the alduin/akatosh dichotomy",
("the lusty argonian maid vol 1"): "the lusty argonian maid, v1",
("the nirnoot missive"): "the nirnroot missive",
("the real barenziah, book i"): "the real barenziah, v1",
("the real barenziah, book ii"): "the real barenziah, v2",
("the real barenziah, book iii"): "the real barenziah, v3",
("the real barenziah, book iv"): "the real barenziah, v4",
("the real barenziah, book v"): "the real barenziah, v5",
("the song of pelinal, book i"): "the song of pelinal, v1",
("the song of pelinal, book ii"): "the song of pelinal, v2",
("the song of pelinal, book iii"): "the song of pelinal, v3",
("the song of pelinal, book iv"): "the song of pelinal, v4",
("the song of pelinal, book v"): "the song of pelinal, v5",
("the song of pelinal, book vi"): "the song of pelinal, v6",
("the song of pelinal, book vii"): "the song of pelinal, v7",
("the song of pelinal, book viii"): "the song of pelinal, v8",
("the tale of dro'zina"): "the tale of dro'zira",
("the wolf queen, book i"): "the wolf queen, v1",
("the wolf queen, book ii"): "the wolf queen, v2",
("the wolf queen, book iii"): "the wolf queen, v3",
("the wolf queen, book iv"): "the wolf queen, v4",
("the wolf queen, book v"): "the wolf queen, v5",
("the wolf queen, book vi"): "the wolf queen, v6",
("the wolf queen, book vii"): "the wolf queen, v7",
("the wolf queen, book viii"): "the wolf queen, v8",
("wabbajack ()"): "wabbajack"
]
// Builds the MediaWiki API query string that requests the latest revision
// content of the page named `title`. The title is inserted as-is, so callers
// are responsible for any URL encoding.
def getPage = { title ->
    return "?action=query&titles=${title}" + '&prop=revisions&rvprop=content'
}
// Scrapes the Wikia "Books_(Skyrim)" page, fetches every linked book page and
// writes one entry per book (id, skill flag, quest-item flag) under "books"
// in SkyrimWikiaBooks.json.
// Performs one HTTP request for the index page plus one per book.
def buildWikiaBookFile = {
    def wikiaApi = 'http://elderscrolls.wikia.com/api.php'
    def wikiaBooksJson = new JsonBuilder()
    wikiaBooksJson.books {
        def wikiaBooksPage = wikiaApi.concat('?action=query&titles=Books_(Skyrim)&prop=revisions&rvprop=content').toURL().getContent().text
        // Each match is a table cell holding a wiki link:
        //   group 2 = optional link target including its trailing '|'
        //   group 3 = the text after the optional target
        // NOTE(review): ")-:" inside the character classes is a range from ')' to ':',
        // not three literal characters - confirm that is what is wanted.
        (wikiaBooksPage =~ /(-\n\|\[\[([\d\s\w,.'()-:!&;]*\|)?)([\d\s\w,.'()-:!&;]*)/).each {
            def bookTitle = it[3]
            // Use the explicit link target when present (minus its '|'), else the link text.
            // The charset is given explicitly: the one-argument URLEncoder.encode is
            // deprecated and encodes with the platform default charset.
            def pageTitle = URLEncoder.encode(it[2] ? it[2][0..-2] : it[3], 'UTF-8')
            def bookPage = wikiaApi.concat(getPage(pageTitle)).toURL().getContent().text

            // Prefer the "|title = ..." field of the book page over the link text.
            def titleMatch = bookPage =~ /(\|title\s=\s)([\d\s\w,.'()-:!&;]*)/
            bookTitle = (titleMatch.getCount() ? titleMatch[0][2] : bookTitle).trim()

            // Only look for id/skill inside the "|skyrim = {{Book/game ..." part of the page.
            def skyrimPartMatch = bookPage =~ /\|skyrim\s=\s\{\{Book\/game[\s\n\|\w=\[\]\(\)]*/
            def skyrimPart = skyrimPartMatch.getCount() ? skyrimPartMatch[0] : ""
            def idMatch = skyrimPart =~ /(id\s=\s)([0-9A-F]*)/
            def id = idMatch.getCount() ? idMatch[0][2] : null
            def skillMatch = skyrimPart =~ /(skill\s=\s\[\[)/
            def skill = skillMatch.getCount() > 0
            def questItem = bookPage.contains('Related Quest')

            // Apply the manual rename table (keyed by lower-cased, trimmed title).
            def renamed = renames.get(bookTitle.toLowerCase().trim())
            if (renamed) {
                bookTitle = renamed
            }
            // Dynamic method call on the JSON builder: adds a key named after the book.
            "$bookTitle"(id: id, skill: skill, questItem: questItem)
        }
    }
    new File("SkyrimWikiaBooks.json").write(wikiaBooksJson.toPrettyString())
}
//buildWikiaBookFile(); print ".";
// Sanity-checks a parsed book file: every non-empty id must be unique across
// json.books and every book must have a non-empty name. Fails with an assertion
// error on the first violation; returns json.books otherwise.
def qualityAssertions = {json ->
    Set ids = new HashSet()
    def catalogue = json.books
    for (entry in catalogue) {
        def book = entry.key
        def props = entry.value
        if (props.id) {
            assert !ids.contains(props.id), "Book '$book' has duplicate id. ${props.id}"
            ids.add(props.id)
        }
        assert book, "Book doesn't have a name. ${props}"
    }
    catalogue
}
// Downloads sections 3 and 4 of the Wikia "Books_(Skyrim)" page (fetched here as
// the journal and note lists), collects the linked titles from both and writes
// them as one JSON array to SkyrimWikiaNonBooks.json.
def buildWikiaJournalAndNotes = {
    def wikiaApi = 'http://elderscrolls.wikia.com/api.php'
    // Bullet-list wiki link: group 2 = link target, group 4 = optional text after '|'.
    def linkPattern = /(\*\[\[)([\d\s\w,.'()-:!&;]*)(\|([\d\s\w,.'()-:!&;#]*))?/
    def sectionQuery = '?action=query&titles=Books_(Skyrim)&prop=revisions&rvprop=content&rvsection='

    // Fetches one page section and returns the titles of its links,
    // preferring the text after '|' over the link target.
    def titlesInSection = { int section ->
        def wikitext = wikiaApi.concat(sectionQuery + section).toURL().getContent().text
        (wikitext =~ linkPattern).collect { groups -> groups[4] ?: groups[2] }
    }

    def nonBooks = []
    nonBooks.addAll(titlesInSection(3)) // journals
    nonBooks.addAll(titlesInSection(4)) // notes

    def wikiaBooksJson = new JsonBuilder()
    wikiaBooksJson(nonBooks)
    new File("SkyrimWikiaNonBooks.json").write(wikiaBooksJson.toPrettyString())
}
//buildWikiaJournalAndNotes(); print ".";
// Loads and parses SkyrimWikiaBooks.json from the working directory.
def getWikiaBookJson = {
    def raw = new File("SkyrimWikiaBooks.json").getText()
    return new JsonSlurper().parseText(raw)
}
// Loads and parses SkyrimWikiaNonBooks.json from the working directory.
def getWikiaNonBooks = {
    def raw = new File("SkyrimWikiaNonBooks.json").getText()
    return new JsonSlurper().parseText(raw)
}
//qualityAssertions(getWikiaBookJson()); print ".";
// Scrapes the UESP "Skyrim:Books" page, fetches every listed book page and
// writes one entry per book (id, skill flag) under "books" in SkyrimUespBooks.json.
// Performs one HTTP request for the index page plus one per book.
def buildUespBookFile = {
    def uespApi = 'http://www.uesp.net/w/api.php'
    def uespBooksPage = uespApi.concat('?action=query&titles=Skyrim:Books&prop=revisions&rvprop=content').toURL().getContent().text
    // Group 2 = "title=" value, group 4 = optional "alttitle=" value (preferred when present).
    def uespBookTitles = (uespBooksPage =~ /(\{\{Book\sNormal\s\|\stitle=)([\d\s\w,.'()-:!&;]*)(\s?\|\s?alttitle=([\d\s\w,.'()-:!&;]*))?/).collect { it[4] ?: it[2] }
    def uespBooksJson = new JsonBuilder()
    uespBooksJson.books {
        uespBookTitles.each {title ->
            // The captured title may carry surrounding whitespace (the pattern allows \s).
            def pageName = title.trim()
            // URL-encode the page name so characters such as '&' cannot break the
            // query string (the Wikia scraper encodes its titles as well).
            def encodedPage = 'Skyrim:' + URLEncoder.encode(pageName.replace(' ', '_'), 'UTF-8')
            def bookPage = uespApi.concat(getPage(encodedPage)).toURL().getContent().text

            // id/skill are read from the "{{Book Summary ..." part of the page only.
            def summaryMatch = bookPage =~ /(\{\{Book\sSummary)([\n\w\d\s\|='\[:,\]\&;#*\(\)-\@\~\!\$\%\^\_\+]*)/
            def summary = summaryMatch.getCount() ? summaryMatch[0][2] : ""
            def idMatch = summary =~ /(\|id\s?=\s?)([0-9A-F]*)/
            def id = idMatch.getCount() ? idMatch[0][2] : null
            def skillMatch = summary =~ /(skill\s?=\s?)([A-Z]*)/
            def skill = skillMatch.getCount() > 0

            // Strip the literal "(book)" disambiguation suffix. String.replace is used
            // on purpose: replaceAll("(book)", "") treats the argument as a regex group
            // and removes only the word, leaving "()" behind.
            def bookTitle = pageName.replace("(book)", "").trim()

            // Apply the manual rename table (keyed by lower-cased, trimmed title).
            def renamed = renames.get(bookTitle.toLowerCase())
            if (renamed) {
                bookTitle = renamed
            }
            // Dynamic method call on the JSON builder: adds a key named after the book.
            "$bookTitle"(id: id, skill: skill)
        }
    }
    new File("SkyrimUespBooks.json").write(uespBooksJson.toPrettyString())
}
//buildUespBookFile(); print ".";
// Loads and parses SkyrimUespBooks.json from the working directory.
def getUespBookJson = {
    def raw = new File("SkyrimUespBooks.json").getText()
    return new JsonSlurper().parseText(raw)
}
//qualityAssertions(getUespBookJson()); print ".";
// Prints the name of every book in json.books that has no id and returns
// the sub-map of those books.
def listUnknownIds = {json ->
    def withoutId = json.books.findAll { entry -> !entry.value.id }
    withoutId.keySet().each { title -> println title }
    withoutId
}
//println "Wikia Books (journals not included) Count ${getWikiaBookJson().books.size()}"
//println "Uesp Books Count ${getUespBookJson().books.size()}"
// Prints a side-by-side comparison table of the book titles in two parsed book
// files. Titles are trimmed, lower-cased and sorted; equal titles share a row,
// a title found on one side only gets a row with the other column empty.
//
//   json1, json2  - parsed book files (maps with a "books" map)
//   title1,title2 - column headings printed as the first row
//   nonBooks      - titles to flag as non-books ("NB"), compared lower-cased/trimmed
//   showNonBooks  - when false (default) rows flagged "NB" are skipped
//
// The original hand-rolled iterator loop dropped the last titles, never
// terminated once one side ran out, and threw on empty input; this version
// is a plain index-based two-way merge that handles all of those.
def printRecords = {json1, title1, json2, title2, nonBooks, showNonBooks = false ->
    // Normalised, sorted titles of all entries not marked as journal.
    def sortedTitles = { json ->
        json.books.findAll {k, v -> !v.journal}.keySet()*.trim()*.toLowerCase().sort()
    }
    def books1 = sortedTitles(json1)
    def books2 = sortedTitles(json2)
    def lowerNonBooks = nonBooks*.toLowerCase()*.trim().sort()

    def printRow = { c1, c2 ->
        def nonBook = (lowerNonBooks.contains(c1) || lowerNonBooks.contains(c2) ||
                knownNonBooks.contains(c1) || knownNonBooks.contains(c2)) ? "NB" : ""
        if (!showNonBooks && nonBook) return
        println "\t| ${nonBook.padRight(2)} | ${c1.padRight(45)} | ${c2.padRight(45)} |"
    }

    printRow(title1, title2)

    int i = 0
    int j = 0
    while (i < books1.size() || j < books2.size()) {
        String b1 = i < books1.size() ? books1[i] : null
        String b2 = j < books2.size() ? books2[j] : null
        // An exhausted side sorts after everything that is left on the other side.
        int compare = (b1 == null) ? 1 : ((b2 == null) ? -1 : (b1 <=> b2))
        if (compare == 0) {
            printRow(b1, b2)
            i++
            j++
        } else if (compare < 0) {
            printRow(b1, "")
            i++
        } else {
            printRow("", b2)
            j++
        }
    }
}
//printRecords(getWikiaBookJson(), "WIKIA", getUespBookJson(), "UESP", getWikiaNonBooks())
// Rewrites the given book file in place so that every key under "books" is
// lower-cased and trimmed; the per-book properties are left untouched.
def lowerCaseTitleJson = {fileName ->
    def target = new File(fileName)
    def parsed = new JsonSlurper().parseText(target.text)
    def normalized = parsed.books.collectEntries { entry ->
        [(entry.key.toLowerCase().trim()): entry.value]
    }
    def builder = new JsonBuilder()
    builder.books(normalized)
    target.write(builder.toPrettyString())
}
//lowerCaseTitleJson("SkyrimUespBooks.json"); print ".";
//lowerCaseTitleJson("SkyrimWikiaBooks.json"); print ".";
// Rewrites the given JSON array file in place with every entry
// lower-cased and trimmed.
def lowerNonBooks = {fileName ->
    def target = new File(fileName)
    def entries = new JsonSlurper().parseText(target.text)
    def normalized = entries.collect { it.toLowerCase().trim() }
    def builder = new JsonBuilder()
    builder.call(normalized)
    target.write(builder.toPrettyString())
}
//lowerNonBooks("SkyrimWikiaNonBooks.json"); print ".";
// Merges two book maps (name -> properties with an "id") into a flat
// name -> id map and writes it to SkyrimMerged.json.
//
//   booksComplete - the map whose names define the result
//   books2        - second source, consulted by the same name
//   nonBooks      - names to leave out of the result
//
// The id of booksComplete wins. When it is missing or blank, the id from books2
// is used instead (the original asserted equality in that case and failed even
// though the sources did not actually disagree). Two different non-blank ids for
// the same name still fail the assertion.
def mergeOnlyBooks = {booksComplete, books2, nonBooks ->
    // Set lookup instead of scanning the list once per book.
    def excluded = nonBooks as Set
    def merged = booksComplete.findAll {name, p -> !excluded.contains(name)}.collectEntries {name, props ->
        def id = props.id
        def otherId = books2.get(name)?.id
        if (id && otherId) {
            assert id == otherId, "Book $name mismatch ids"
        }
        [(name): (!id && otherId) ? otherId : id]
    }
    def mergedJson = new JsonBuilder()
    mergedJson(merged)
    new File("SkyrimMerged.json").write(mergedJson.toPrettyString())
}
//List nb = getWikiaNonBooks()
//assert nb.addAll(knownNonBooks)
//mergeOnlyBooks(getUespBookJson().books, getWikiaBookJson().books, nb); print ".";
// Reads SkyrimMerged.json (name -> id), asks the user for the id of every book
// that has none, and writes the completed map back to the same file.
def manuallyFixMissingId = {
    def mergedFile = new File("SkyrimMerged.json")
    def books = new JsonSlurper().parseText(mergedFile.text)

    // System.console() returns null when no interactive console is attached
    // (e.g. redirected input); fall back to plain stdin instead of failing with an NPE.
    def console = System.console()
    def stdin = console == null ? new BufferedReader(new InputStreamReader(System.in)) : null

    def promptForId = { name ->
        if (console != null) {
            // The name is passed as a format argument, not as the format string,
            // so a '%' in a title cannot be misread as a format specifier.
            return console.readLine('%s :', name)
        }
        print "$name :"
        System.out.flush()
        stdin.readLine()
    }

    def fixedBooks = books.collectEntries {name, id ->
        [(name): id ?: promptForId(name)]
    }
    def fixedJson = new JsonBuilder()
    fixedJson(fixedBooks)
    mergedFile.write(fixedJson.toPrettyString())
}
//manuallyFixMissingId()
// Verifies that no two books in a name -> id map share the same non-empty id.
// Fails with an assertion error on the first duplicate; returns the map otherwise.
def assertUniqueIds = {books->
    Set ids = new HashSet()
    for (entry in books) {
        def book = entry.key
        def id = entry.value
        if (!id) {
            continue
        }
        assert !ids.contains(id), "Book '$book' has duplicate id. ${id}"
        ids.add(id)
    }
    books
}
//assertUniqueIds(new JsonSlurper().parseText(new File("SkyrimMerged.json").text)); print ".";
// Writes SkyrimBooks.txt with one "player.additem <id>; <name>" line per book,
// in the order produced by sorting the name -> id map.
def buildAddAll = {books->
    def commands = books.sort().collect { entry ->
        "player.additem ${entry.value}; ${entry.key} \n"
    }
    new File("SkyrimBooks.txt").write(commands.join(""))
}
// Final step of the pipeline: turn the merged name -> id file into the
// console command list, then print a progress dot.
def mergedBooks = new JsonSlurper().parseText(new File("SkyrimMerged.json").text)
buildAddAll(mergedBooks)
print "."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment