Created
September 15, 2018 23:53
-
-
Save loxaxs/bfc23e193003e9f1fce9966ab2122b94 to your computer and use it in GitHub Desktop.
Derpibooru category tag crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// /\ CUSTOMIZABLE CODE
// Categories are rendered in this order; you can re-order them.
// A tag without a category is filed under "", and a tag whose category is not
// listed here is filed under "other".
var categories = [
  "other",
  "error",
  "rating",
  "content-official",
  "spoiler",
  "content-fanmade",
  "character",
  "oc",
  "origin",
  "",
];
// You can choose not to fill certain categories: a category mapped to `true`
// is skipped while crawling. "", "origin" and "oc" are the three biggest.
var categoriesHide = {
  "": true, // Contains all general tags
  "origin": true, // Contains all artist tags as well as `screencap`, `edit`, `edited screencap`, `derpibooru exclusive`, `alternate version`, `color edit` and that should be all of them.
  "oc": true, // `oc`, `oc only`, and `oc:*` most notably, `oc:anon`.
};
// Delay, in milliseconds, between two consecutive page requests.
var requestWaitMs = 200;
// \/ END OF CUSTOMIZABLE CODE
// Page number at which the next crawl starts; pause() moves it, stop() resets it to 1.
var startpage = 1;
// Loop counter of the running crawl. It is a global on purpose: pause() and
// stop() overwrite it to end the crawl.
var page;
// Element that receives the page-count heading and the per-category lists.
var tagList = document.querySelector(".tag-list");
// Maps a category name to the <div> holding its tags; rebuilt by go().
var tagDivSet;
/**
 * "Clean & Go" handler: empties the tag list, rebuilds the page-count heading
 * and one empty sub-list per entry of `categories`, then starts crawling.
 *
 * The crawl runs in the background; a failure (e.g. a rejected fetch) is
 * logged instead of being left as an unhandled promise rejection.
 *
 * NOTE(review): nothing prevents a second click while a crawl is still
 * running; both loops would then share the global `page` counter.
 */
function go() {
  // Reset the container and give it the overall "Page count" heading.
  tagList.innerHTML = "";
  tagList.appendChild(document.createElement("h2"));
  tagList.dataset.category = "Page count";
  tagList.dataset.count = 0;
  renderHeading(tagList);

  // One <div> per category, each starting with its own <h3> heading.
  tagDivSet = {};
  for (const category of categories) {
    const subList = document.createElement("div");
    subList.dataset.category = category;
    subList.dataset.count = 0;
    subList.appendChild(document.createElement("h3"));
    renderHeading(subList);
    tagList.appendChild(subList);
    tagDivSet[category] = subList;
  }

  Promise.resolve(crawl(callback)).catch((err) => {
    console.error(err);
  });
}
/**
 * "Pause" handler: remembers the page being crawled so the next crawl resumes
 * from it, then ends the running crawl.
 *
 * The crawl is ended by overwriting its global loop counter `page` with the
 * largest safe integer, which sends the loop far beyond any real page.
 */
function pause() {
  // Only remember a genuine page number. When no crawl is running, `page` is
  // undefined or still holds the sentinel from a previous pause/stop, and
  // storing that would make the next crawl start at a bogus page.
  if (Number.isSafeInteger(page) && page > 0 && page < Number.MAX_SAFE_INTEGER) {
    startpage = page;
  }
  // Biggest exact integer; above it, numbers lose precision
  // (9007199254740992 === 9007199254740993).
  page = Number.MAX_SAFE_INTEGER;
}
/**
 * "Stop" handler: ends the running crawl and makes the next crawl start over
 * from page 1.
 *
 * The crawl is ended by overwriting its global loop counter `page` with the
 * largest safe integer, which sends the loop far beyond any real page.
 */
function stop() {
  startpage = 1;
  // Biggest exact integer; above it, numbers lose precision
  // (9007199254740992 === 9007199254740993).
  page = Number.MAX_SAFE_INTEGER;
}
/**
 * Files the tags of one crawled page into their category lists.
 *
 * @param {Element|Document} data - Parsed page; every `.tag` element in it is processed.
 * @param {number} page - Page number the data came from. This is the sentinel
 *   (largest safe integer) when pause()/stop() fired while the page was loading.
 * @returns {Promise<void>}
 */
async function callback(data, page) {
  // Keep the last real page number on screen when the sentinel is passed in.
  if (page < Number.MAX_SAFE_INTEGER) {
    tagList.dataset.count = page;
  }
  renderHeading(tagList);

  for (const tag of data.querySelectorAll(".tag")) {
    const category = tag.dataset.tagCategory || "";
    if (categoriesHide[category]) {
      continue;
    }
    // Categories without a list of their own end up under "other".
    const subList = tagDivSet[category] || tagDivSet["other"];
    if (!subList) {
      // "other" was removed from `categories`: nowhere to put this tag.
      continue;
    }
    const deep = true;
    subList.appendChild(tag.cloneNode(deep));
    subList.dataset.count = 1 + Number.parseInt(subList.dataset.count, 10);
    renderHeading(subList);
  }
}
/**
 * Writes "<category> [<count>]" into the heading of a list.
 *
 * @param {Element} subList - List element whose first child is its heading and
 *   whose `data-category` / `data-count` attributes hold the values to show.
 */
function renderHeading(subList) {
  const heading = subList.firstChild;
  const { category, count } = subList.dataset;
  // Plain text only, so textContent is enough (no line-break processing needed).
  heading.textContent = `${category} [${count}]`;
}
/**
 * Crawls `/tags?page=N` starting at `startpage`, handing every parsed page to
 * `__callback`, until a page without any `.tag` element is reached or
 * pause()/stop() overwrite the global `page` counter with their sentinel.
 *
 * @param {function(Element, number): Promise<void>} __callback - Called with
 *   the parsed page and the current value of the global `page`.
 * @returns {Promise<void>} Resolves when the crawl has ended; rejects if a
 *   fetch or the callback fails.
 */
async function crawl(__callback) {
  const urlFor = (pageNumber) => `/tags?page=${pageNumber}`;
  const fetchPage = async (pageNumber) => {
    const response = await fetch(urlFor(pageNumber));
    return parseHTML(await response.text());
  };
  // A page without any tag marks the end of the listing.
  const isPastLastPage = (html) => html.querySelectorAll(".tag").length === 0;

  // `page` is the shared global counter: pause()/stop() set it to
  // Number.MAX_SAFE_INTEGER, which the loop condition detects directly
  // instead of requesting a nonsensical page number from the server.
  for (page = startpage; page < Number.MAX_SAFE_INTEGER; page++) {
    const data = await fetchPage(page);
    if (isPastLastPage(data)) {
      break;
    }
    // `page` is read again here on purpose: it may have been replaced by the
    // sentinel while the request was in flight.
    await __callback(data, page);
    // Throttle requests.
    await new Promise((resolve) => setTimeout(resolve, requestWaitMs));
  }
}
/**
 * Parses the text of an HTML page into a detached DOM tree.
 *
 * DOMParser is used rather than assigning `innerHTML` on an element of the
 * live document, so the fetched markup stays inert: its images are not
 * downloaded and its inline event handlers cannot run.
 *
 * @param {string} htmlText - Markup of a whole page.
 * @returns {Element} The `<html>` root element of the parsed page.
 */
function parseHTML(htmlText) {
  const parsed = new DOMParser().parseFromString(htmlText, "text/html");
  return parsed.documentElement;
}
/**
 * Inserts the "Clean & Go", "Pause" and "Stop" buttons right before the tag
 * list and wires them to go(), pause() and stop().
 *
 * The buttons are clones of the page's "Search" button so they pick up its
 * look; when that button cannot be found, plain buttons are created instead.
 */
function addButtons() {
  const searchButton = document.querySelector('input[value="Search"]');
  const deep = true;

  const makeButton = (label) => {
    let button;
    if (searchButton) {
      button = searchButton.cloneNode(deep);
      // NOTE(review): presumably the site disables buttons carrying
      // data-disable-with once clicked — confirm; the clones must stay usable.
      delete button.dataset.disableWith;
    } else {
      button = document.createElement("input");
      button.type = "button";
    }
    button.value = label;
    return button;
  };

  const buttonGo = makeButton("Clean & Go");
  const buttonPause = makeButton("Pause");
  const buttonStop = makeButton("Stop");

  const div = document.createElement("div");
  div.appendChild(buttonGo);
  div.appendChild(buttonPause);
  div.appendChild(buttonStop);

  // Insert through the tag list's actual parent: insertBefore throws when the
  // reference node is not a direct child of the node it is called on.
  const content = document.querySelector("#content");
  if (tagList && tagList.parentNode) {
    tagList.parentNode.insertBefore(div, tagList);
  } else if (content) {
    content.appendChild(div);
  } else {
    console.error(new Error("addButtons: found neither `.tag-list` nor `#content`."));
  }

  buttonStop.addEventListener("click", stop);
  buttonPause.addEventListener("click", pause);
  buttonGo.addEventListener("click", go);
}
/**
 * Entry point: adds the crawler buttons when the current URL contains
 * "/tags", and tells the user where to run the script otherwise.
 */
function main() {
  // A plain substring test; String.prototype.match would compile the string
  // into a regular expression first.
  if (!location.href.includes("/tags")) {
    alert("The code is supposed to be run on derpibooru, on the page `/tags`.");
    return;
  }
  addButtons();
}
// Run as soon as the script is evaluated (e.g. pasted into the browser console).
main();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I forgot to mention it in the script, but here is how it is used:
Note: You can re-invoke old console commands using the keyboard up arrow.