Created
February 7, 2012 20:06
-
-
Save bendoerr/1761644 to your computer and use it in GitHub Desktop.
Quick and dirty code to scrape two different wikis about Skyrim books and compile them into one list.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import groovy.json.JsonBuilder | |
import groovy.json.JsonSlurper | |
// Titles that are treated as "not a book" even though neither wiki lists
// them as journals or notes. Entries are lowercase because they are matched
// against lowercased, trimmed titles (see printRecords).
def knownNonBooks = [ // Not listed on any wiki's
        "adonato's book",
        "alchemist's note",
        "bandit's journal",
        "decree of monument",
        "imperial missive (battle-born)",
        "love poem",
        "master illusion text",
        "oghma infinium", // Skill Book
        "power of the elements", // Diff versions
        "adventurer's journal",
        // "bandit's journal" used to be repeated here; it is already listed above.
        "butcher journal",
        "cicero's journal - final volume",
        "cicero's journal - volume 1",
        "cicero's journal - volume 2",
        "cicero's journal - volume 3",
        "cicero's journal - volume 4",
        "lymdrenn tenvanni's journal",
        "nepos's journal",
        "tolfdir's book",
        "uncommon taste - signed" // special
]
// Title normalisation table: maps a lowercased, trimmed title as scraped
// from one wiki to the spelling used for the merged list, so that the same
// book ends up under one key. (Original author's note: "I got tired of
// updating the wikis".)
def renames = [
        "ahzirr trajijazaeri": "ahzirr traajijazeri",
        "biography of barenziah, vol 3": "biography of barenziah, v3",
        "feyfolken, book i": "feyfolken i",
        "forge hammer and anvil": "forge, hammer and anvil",
        "incident in necrom": "incident at necrom",
        "kolb and the dragon": "kolb & the dragon",
        // NOTE(review): key and value are identical, so this entry only
        // lowercases/trims the title — confirm whether a different target was meant.
        "mystery of talara, part 4": "mystery of talara, part 4",
        "mystery of talara, v1": "mystery of talara, v 1",
        "palla, book i": "palla, volume 1",
        "palla, book ii": "palla, volume 2",
        "song of skyrim": "songs of skyrim",
        "sovngarde, a reexamination": "sovngarde: a reexamination",
        "the alduin-akatosh dichotomy": "the alduin/akatosh dichotomy",
        "the lusty argonian maid vol 1": "the lusty argonian maid, v1",
        "the nirnoot missive": "the nirnroot missive",

        // The Real Barenziah: "book <roman>" -> "v<n>"
        "the real barenziah, book i": "the real barenziah, v1",
        "the real barenziah, book ii": "the real barenziah, v2",
        "the real barenziah, book iii": "the real barenziah, v3",
        "the real barenziah, book iv": "the real barenziah, v4",
        "the real barenziah, book v": "the real barenziah, v5",

        // The Song of Pelinal: "book <roman>" -> "v<n>"
        "the song of pelinal, book i": "the song of pelinal, v1",
        "the song of pelinal, book ii": "the song of pelinal, v2",
        "the song of pelinal, book iii": "the song of pelinal, v3",
        "the song of pelinal, book iv": "the song of pelinal, v4",
        "the song of pelinal, book v": "the song of pelinal, v5",
        "the song of pelinal, book vi": "the song of pelinal, v6",
        "the song of pelinal, book vii": "the song of pelinal, v7",
        "the song of pelinal, book viii": "the song of pelinal, v8",

        "the tale of dro'zina": "the tale of dro'zira",

        // The Wolf Queen: "book <roman>" -> "v<n>"
        "the wolf queen, book i": "the wolf queen, v1",
        "the wolf queen, book ii": "the wolf queen, v2",
        "the wolf queen, book iii": "the wolf queen, v3",
        "the wolf queen, book iv": "the wolf queen, v4",
        "the wolf queen, book v": "the wolf queen, v5",
        "the wolf queen, book vi": "the wolf queen, v6",
        "the wolf queen, book vii": "the wolf queen, v7",
        "the wolf queen, book viii": "the wolf queen, v8",

        // Presumably what is left of a title after a "(book)" suffix was stripped — verify.
        "wabbajack ()": "wabbajack"
]
/**
 * Builds the MediaWiki API query string that requests the current revision
 * content of a single page.
 *
 * The title is inserted verbatim; callers are responsible for any escaping.
 *
 * @param pageTitle page title to put into the "titles" parameter
 * @return the query string, starting with '?', to append to an api.php URL
 */
def getPage = { pageTitle ->
    "?action=query&titles=${pageTitle}&prop=revisions&rvprop=content"
}
/**
 * Scrapes the Wikia "Books (Skyrim)" listing, downloads every linked book
 * page and writes one JSON object per book to SkyrimWikiaBooks.json:
 *
 *   { "books": { "<title>": { "id": ..., "skill": ..., "questItem": ... } } }
 *
 * Performs one HTTP request for the listing plus one per book.
 */
def buildWikiaBookFile = {
    def wikiaApi = 'http://elderscrolls.wikia.com/api.php'
    def wikiaBooksJson = new JsonBuilder()

    wikiaBooksJson.books {
        def listingUrl = wikiaApi.concat('?action=query&titles=Books_(Skyrim)&prop=revisions&rvprop=content')
        def wikiaBooksPage = listingUrl.toURL().getContent().text

        // Group 2 is the optional "Target|" part of a wiki link (including the
        // trailing pipe), group 3 is the text that follows it.
        def linkMatcher = wikiaBooksPage =~ /(-\n\|\[\[([\d\s\w,.'()-:!&;]*\|)?)([\d\s\w,.'()-:!&;]*)/
        linkMatcher.each { link ->
            // Prefer the explicit link target (minus its trailing '|') as the page name.
            def linkTarget = link[2]
            def pageTitle = URLEncoder.encode(linkTarget ? linkTarget[0..-2] : link[3])
            def bookPage = wikiaApi.concat(getPage(pageTitle)).toURL().getContent().text

            // The "|title = ..." field of the book page wins over the link text.
            def titleMatch = bookPage =~ /(\|title\s=\s)([\d\s\w,.'()-:!&;]*)/
            def bookTitle = link[3]
            if (titleMatch.getCount()) {
                bookTitle = titleMatch[0][2]
            }
            bookTitle = bookTitle.trim()

            // Only the "|skyrim = {{Book/game ..." part of the page is searched
            // for the id and skill fields.
            def skyrimPartMatch = bookPage =~ /\|skyrim\s=\s\{\{Book\/game[\s\n\|\w=\[\]\(\)]*/
            def skyrimPart = ""
            if (skyrimPartMatch.getCount()) {
                skyrimPart = skyrimPartMatch[0]
            }

            def idMatch = skyrimPart =~ /(id\s=\s)([0-9A-F]*)/
            def id = null
            if (idMatch.getCount()) {
                id = idMatch[0][2]
            }

            def skillMatch = skyrimPart =~ /(skill\s=\s\[\[)/
            def skill = skillMatch.getCount() > 0

            def questItem = bookPage.contains('Related Quest')

            // Apply the manual title normalisation, if one exists.
            def renamed = renames.get(bookTitle.toLowerCase().trim())
            if (renamed) {
                bookTitle = renamed
            }

            // Dynamic method name: adds a property called <bookTitle> to the "books" object.
            "$bookTitle"(id: id, skill: skill, questItem: questItem)
        }
    }

    new File("SkyrimWikiaBooks.json").write(wikiaBooksJson.toPrettyString())
}
//buildWikiaBookFile(); print "."; | |
/**
 * Sanity-checks a scraped book file: no two books may share an id, and
 * every book must have a non-empty name. Books without an id are allowed.
 *
 * @param json parsed book file; json.books maps title -> properties (with "id")
 * @throws AssertionError on the first duplicate id or empty name
 */
def qualityAssertions = { json ->
    def ids = [] as Set
    json.books.each { entry ->
        def book = entry.key
        def props = entry.value

        // Only books that actually carry an id take part in the uniqueness check.
        if (props.id) {
            assert !ids.contains(props.id), "Book '$book' has duplicate id. ${props.id}"
            ids.add(props.id)
        }

        assert book, "Book doesn't have a name. ${props}"
    }
}
/**
 * Collects the link texts from sections 3 and 4 of the Wikia "Books (Skyrim)"
 * page (journals and notes, going by the original variable names) and writes
 * them as a JSON array to SkyrimWikiaNonBooks.json.
 */
def buildWikiaJournalAndNotes = {
    def wikiaApi = 'http://elderscrolls.wikia.com/api.php'

    // Downloads one API result and returns, for every "*[[Target|Label]]"
    // bullet in it, the label when there is a non-empty one, else the target.
    def linkedTitles = { String query ->
        def sectionText = wikiaApi.concat(query).toURL().getContent().text
        def bullets = sectionText =~ /(\*\[\[)([\d\s\w,.'()-:!&;]*)(\|([\d\s\w,.'()-:!&;#]*))?/
        bullets.collect { groups -> groups[4] ?: groups[2] }
    }

    def nonBooks = []
    // Journals section.
    nonBooks.addAll(linkedTitles('?action=query&titles=Books_(Skyrim)&prop=revisions&rvprop=content&rvsection=3'))
    // Notes section.
    nonBooks.addAll(linkedTitles('?action=query&titles=Books_(Skyrim)&prop=revisions&rvprop=content&rvsection=4'))

    def wikiaBooksJson = new JsonBuilder()
    wikiaBooksJson(nonBooks)
    new File("SkyrimWikiaNonBooks.json").write(wikiaBooksJson.toPrettyString())
}
//buildWikiaJournalAndNotes(); print "."; | |
/**
 * Loads the previously scraped Wikia book data.
 *
 * @return the parsed content of SkyrimWikiaBooks.json (read from the working directory)
 */
def getWikiaBookJson = {
    def rawJson = new File("SkyrimWikiaBooks.json").getText()
    return new JsonSlurper().parseText(rawJson)
}
/**
 * Loads the previously scraped list of Wikia journal/note titles.
 *
 * @return the parsed content of SkyrimWikiaNonBooks.json (read from the working directory)
 */
def getWikiaNonBooks = {
    def rawJson = new File("SkyrimWikiaNonBooks.json").getText()
    return new JsonSlurper().parseText(rawJson)
}
//qualityAssertions(getWikiaBookJson()); print "."; | |
/**
 * Scrapes the UESP "Skyrim:Books" listing, downloads every listed book page
 * and writes one JSON object per book to SkyrimUespBooks.json:
 *
 *   { "books": { "<title>": { "id": ..., "skill": ... } } }
 *
 * Performs one HTTP request for the listing plus one per book.
 */
def buildUespBookFile = {
    def uespApi = 'http://www.uesp.net/w/api.php'
    def uespBooksPage = uespApi.concat('?action=query&titles=Skyrim:Books&prop=revisions&rvprop=content').toURL().getContent().text
    // Group 2 is the "title=" value, group 4 the optional "alttitle=" value;
    // the alternative title is preferred when it is present and non-empty.
    def uespBookTitles = (uespBooksPage =~ /(\{\{Book\sNormal\s\|\stitle=)([\d\s\w,.'()-:!&;]*)(\s?\|\s?alttitle=([\d\s\w,.'()-:!&;]*))?/).collect { it[4] ?: it[2] }
    def uespBooksJson = new JsonBuilder()
    uespBooksJson.books {
        uespBookTitles.each {title ->
            // URL-encode the page name (as buildWikiaBookFile does) so that
            // characters such as '&' in a title cannot cut the "titles"
            // query parameter short.
            def pageTitle = URLEncoder.encode('Skyrim:' + title.replaceAll(" ", "_"), "UTF-8")
            def bookPage = uespApi.concat(getPage(pageTitle)).toURL().getContent().text
            def summaryMatch = bookPage =~ /(\{\{Book\sSummary)([\n\w\d\s\|='\[:,\]\&;#*\(\)-\@\~\!\$\%\^\_\+]*)/
            def summary = summaryMatch.getCount() ? summaryMatch[0][2] : ""
            def idMatch = summary =~ /(\|id\s?=\s?)([0-9A-F]*)/
            def id = idMatch.getCount() ? idMatch[0][2] : null
            def skillMatch = summary =~ /(skill\s?=\s?)([A-Z]*)/
            def skill = skillMatch.getCount() > 0
            if (title.contains("(book)")) {
                // String.replace is a literal replacement. The previous
                // replaceAll("(book)", "") treated the argument as a regex
                // group and only removed the word, leaving "()" behind.
                title = title.replace("(book)", "").trim()
            }
            // Apply the manual title normalisation, if one exists.
            def renamed = renames.get(title.toLowerCase().trim())
            if (renamed) {
                title = renamed
            }
            // Dynamic method name: adds a property called <title> to the "books" object.
            "$title"(id: id, skill: skill)
        }
    }
    new File("SkyrimUespBooks.json").write(uespBooksJson.toPrettyString())
}
//buildUespBookFile(); print "."; | |
def getUespBookJson = { | |
new JsonSlurper().parseText(new File("SkyrimUespBooks.json").text) | |
} | |
//qualityAssertions(getUespBookJson()); print "."; | |
/**
 * Prints the title of every book that has no id, one per line.
 *
 * @param json parsed book file; json.books maps title -> properties (with "id")
 * @return the sub-map of json.books containing only the books without an id
 */
def listUnknownIds = { json ->
    def withoutId = json.books.findAll { entry -> !entry.value.id }
    for (entry in withoutId) {
        println entry.key
    }
    return withoutId
}
//println "Wikia Books (journals not included) Count ${getWikiaBookJson().books.size()}" | |
//println "Uesp Books Count ${getUespBookJson().books.size()}" | |
/**
 * Prints a two-column, side-by-side comparison of the (lowercased, trimmed,
 * sorted) book titles of two scraped book files. Titles present in both
 * files share a row; a title found in only one file gets an empty cell in
 * the other column.
 *
 * A row is flagged "NB" when either cell is in nonBooks or in the script's
 * knownNonBooks list; flagged rows are skipped unless showNonBooks is true.
 *
 * @param json1        first parsed book file (json1.books: title -> properties)
 * @param title1       column header for the first file
 * @param json2        second parsed book file
 * @param title2       column header for the second file
 * @param nonBooks     titles to flag as non-books
 * @param showNonBooks whether flagged rows are printed (default false)
 */
def printRecords = {json1, title1, json2, title2, nonBooks, showNonBooks = false ->
    def books1 = json1.books.findAll {k, v -> !v.journal}.keySet()*.trim()*.toLowerCase().sort()
    def books2 = json2.books.findAll {k, v -> !v.journal}.keySet()*.trim()*.toLowerCase().sort()
    def nonBookTitles = nonBooks*.toLowerCase()*.trim().sort()

    def printRow = { c1, c2 ->
        def nonBook = (nonBookTitles.contains(c1) || nonBookTitles.contains(c2) ||
                knownNonBooks.contains(c1) || knownNonBooks.contains(c2)) ? "NB" : ""
        if (!showNonBooks && nonBook) return
        println "\t| ${nonBook.padRight(2)} | ${c1.padRight(45)} | ${c2.padRight(45)} |"
    }

    printRow(title1, title2)

    // Index-based merge of the two sorted lists. The earlier iterator-based
    // loop never printed the last pending titles, looped forever as soon as
    // one list ran out before the other, and failed on an empty list.
    int i = 0
    int j = 0
    while (i < books1.size() || j < books2.size()) {
        if (j >= books2.size()) {
            // Second list exhausted: the rest exists only in the first file.
            printRow(books1[i++], "")
        } else if (i >= books1.size()) {
            // First list exhausted: the rest exists only in the second file.
            printRow("", books2[j++])
        } else {
            def compare = books1[i] <=> books2[j]
            if (compare == 0) {
                printRow(books1[i++], books2[j++])
            } else if (compare < 0) {
                printRow(books1[i++], "")
            } else {
                printRow("", books2[j++])
            }
        }
    }
}
//printRecords(getWikiaBookJson(), "WIKIA", getUespBookJson(), "UESP", getWikiaNonBooks()) | |
/**
 * Rewrites a book file in place so that every title under "books" is
 * lowercased and trimmed; the per-book properties are kept as they are.
 *
 * @param fileName path of a JSON file shaped like { "books": { "<title>": {...} } }
 */
def lowerCaseTitleJson = { fileName ->
    def target = new File(fileName)
    def parsed = new JsonSlurper().parseText(target.text)

    def normalized = [:]
    parsed.books.each { title, props ->
        normalized.put(title.toLowerCase().trim(), props)
    }

    def rewritten = new JsonBuilder()
    rewritten.books(normalized)
    target.write(rewritten.toPrettyString())
}
//lowerCaseTitleJson("SkyrimUespBooks.json"); print "."; | |
//lowerCaseTitleJson("SkyrimWikiaBooks.json"); print "."; | |
/**
 * Rewrites a non-book file in place so that every title in its JSON array
 * is lowercased and trimmed.
 *
 * @param fileName path of a JSON file containing an array of title strings
 */
def lowerNonBooks = { fileName ->
    def target = new File(fileName)
    def titles = new JsonSlurper().parseText(target.text)

    def normalized = []
    for (title in titles) {
        normalized.add(title.toLowerCase().trim())
    }

    def rewritten = new JsonBuilder()
    rewritten(normalized)
    target.write(rewritten.toPrettyString())
}
//lowerNonBooks("SkyrimWikiaNonBooks.json"); print "."; | |
/**
 * Writes SkyrimMerged.json, a flat title -> id object built from
 * booksComplete minus every title contained in nonBooks. Where books2 has
 * an id for the same title, it must equal the id from booksComplete.
 *
 * @param booksComplete title -> properties map that drives the output
 * @param books2        second title -> properties map, used only to cross-check ids
 * @param nonBooks      titles to leave out of the result
 * @throws AssertionError when the two sources disagree on a book's id
 */
def mergeOnlyBooks = {booksComplete, books2, nonBooks ->
    def merged = [:]
    booksComplete.each { name, props ->
        if (nonBooks.contains(name)) {
            return // excluded title, move on to the next entry
        }
        def id = props.id
        def otherProps = books2.get(name)
        // Only compare when the second source actually knows an id.
        if (otherProps?.id) {
            assert id == otherProps.id, "Book $name mismatch ids"
        }
        merged.put(name, id)
    }

    def mergedJson = new JsonBuilder()
    mergedJson(merged)
    new File("SkyrimMerged.json").write(mergedJson.toPrettyString())
}
//List nb = getWikiaNonBooks() | |
//assert nb.addAll(knownNonBooks) | |
//mergeOnlyBooks(getUespBookJson().books, getWikiaBookJson().books, nb); print "."; | |
/**
 * Interactively fills in missing ids in SkyrimMerged.json: for every title
 * whose id is empty, prompts with "<title> :" and stores the line that was
 * typed. The file is rewritten in place with the completed map.
 *
 * Input is read from the system console when one is attached; otherwise
 * (IDE, redirected streams) it is read from standard input, because
 * System.console() returns null there.
 */
def manuallyFixMissingId = {
    def mergedFile = new File("SkyrimMerged.json")
    def books = new JsonSlurper().parseText(mergedFile.text)
    def console = System.console()
    def stdin = null // created lazily, only when the stdin fallback is needed

    def fixedBooks = books.collectEntries {name, id ->
        if (!id) {
            if (console != null) {
                // Pass the title as an argument: readLine's first parameter is
                // a format string, so a '%' in a title must not be interpreted.
                id = console.readLine("%s :", name)
            } else {
                if (stdin == null) {
                    stdin = new BufferedReader(new InputStreamReader(System.in))
                }
                print "$name :"
                System.out.flush()
                id = stdin.readLine()
            }
        }
        [(name): id]
    }

    def fixedJson = new JsonBuilder()
    fixedJson(fixedBooks)
    mergedFile.write(fixedJson.toPrettyString())
}
//manuallyFixMissingId() | |
/**
 * Verifies that no id occurs twice in a flat title -> id map. Entries
 * without an id are ignored.
 *
 * @param books map of book title to id
 * @throws AssertionError on the first id that was already seen
 */
def assertUniqueIds = { books ->
    def ids = [] as Set
    books.each { entry ->
        def book = entry.key
        def id = entry.value
        if (!id) {
            return // nothing to check for this entry
        }
        assert !ids.contains(id), "Book '$book' has duplicate id. ${id}"
        ids.add(id)
    }
}
//assertUniqueIds(new JsonSlurper().parseText(new File("SkyrimMerged.json").text)); print "."; | |
/**
 * Writes SkyrimBooks.txt with one "player.additem <id>; <title> " line per
 * book, ordered by title.
 *
 * @param books map of book title to id
 */
def buildAddAll = { books ->
    def commands = new StringBuilder()
    // Map.sort() returns a copy ordered by key, i.e. by book title.
    books.sort().each { title, id ->
        commands.append("player.additem ${id}; ${title} \n")
    }
    new File("SkyrimBooks.txt").write(commands.toString())
}
// Active pipeline step: turn the merged title -> id file into the
// "player.additem" list, then print a progress dot.
def mergedBooks = new JsonSlurper().parseText(new File("SkyrimMerged.json").text)
buildAddAll(mergedBooks)
print "."
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment