Skip to content

Instantly share code, notes, and snippets.

@loxaxs
Created September 15, 2018 23:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save loxaxs/bfc23e193003e9f1fce9966ab2122b94 to your computer and use it in GitHub Desktop.
Save loxaxs/bfc23e193003e9f1fce9966ab2122b94 to your computer and use it in GitHub Desktop.
Derpibooru category tag crawler
// NOTE: every top-level binding is declared with `var` (not `const`/`let`) so
// that the whole script can be pasted into the browser console more than once
// without "redeclaration" errors.

// /\ CUSTOMIZABLE CODE
// Order in which the category sections are created on the page.
// You can re-order the categories.
var categories = ["other", "error", "rating", "content-official", "spoiler", "content-fanmade", "character", "oc", "origin", ""];
// Categories flagged `true` are skipped while crawling (their section stays empty).
// You can choose not to fill certain categories. "", "origin" and "oc" are the three biggest.
var categoriesHide = {
  "": true, // Contains all general tags
  "origin": true, // Contains all artist tags as well as `screencap`, `edit`, `edited screencap`, `derpibooru exclusive`, `alternate version`, `color edit` and that should be all of them.
  "oc": true // `oc`, `oc only`, and `oc:*` most notably, `oc:anon`.
};
// Delay, in milliseconds, observed after each crawled page before requesting the next one.
var requestWaitMs = 200;
// \/ END OF CUSTOMIZABLE CODE

// Page number the next crawl starts from; `pause()` moves it, `stop()` resets it to 1.
var startpage = 1;
// Page number the crawl loop is currently on; `pause()`/`stop()` overwrite it to end the loop.
var page;
// Container that receives the crawled tags (`null` when the page has no `.tag-list`).
var tagList = document.querySelector(".tag-list");
// Maps a category name to its section element; (re)built by `go()`.
var tagDivSet;
/**
 * Click handler of the "Clean & Go" button.
 *
 * Empties the tag list, rebuilds one empty section per entry of `categories`
 * (stored in `tagDivSet`), then starts crawling with `callback` as the
 * per-page handler. Returns nothing: the crawl keeps running asynchronously.
 */
function go() {
  // Drop whatever the list currently shows (site content or a previous crawl).
  tagList.innerHTML = "";

  // `renderHeading` writes into the FIRST child, so the heading must be
  // appended before anything else. The list's own heading is the page counter.
  tagList.appendChild(document.createElement("h2"));
  tagList.dataset.category = "Page count";
  tagList.dataset.count = 0;
  renderHeading(tagList);

  tagDivSet = {};
  for (const category of categories) {
    const subList = document.createElement("div");
    subList.dataset.category = category;
    subList.dataset.count = 0;
    subList.appendChild(document.createElement("h3"));
    renderHeading(subList);
    tagList.appendChild(subList);
    tagDivSet[category] = subList;
  }

  // `crawl` is async and nobody awaits this handler: report a failed crawl
  // explicitly instead of leaving an unhandled promise rejection behind.
  crawl(callback).catch((err) => {
    console.error(err);
  });
}
/**
 * Click handler of the "Pause" button: remembers the page the crawl is on so
 * the next crawl resumes there, then ends the running crawl loop.
 *
 * The resume point is only recorded when `page` holds a real page number.
 * Before any crawl `page` is `undefined`, and after a previous pause/stop it
 * holds the stop sentinel (or sentinel + 1); copying either into `startpage`
 * would make the next crawl start on a nonsensical page.
 */
function pause() {
  if (Number.isInteger(page) && page < Number.MAX_SAFE_INTEGER) {
    startpage = page;
  }
  // Stop sentinel. Number.MAX_SAFE_INTEGER is 9007199254740991, the largest
  // integer before precision is lost (9007199254740992 === 9007199254740993).
  page = Number.MAX_SAFE_INTEGER;
}
/**
 * Click handler of the "Stop" button: the next crawl restarts from page 1 and
 * the running crawl loop is ended.
 */
function stop() {
  // Stop sentinel first, restart point second; the two writes are independent.
  // Number.MAX_SAFE_INTEGER is 9007199254740991, the largest integer before
  // precision is lost (9007199254740992 === 9007199254740993).
  page = Number.MAX_SAFE_INTEGER;
  startpage = 1;
}
/**
 * Per-page handler handed to `crawl`: copies every `.tag` element of a parsed
 * page into the section of its category and updates the counters.
 *
 * @param {object} data - Parsed page; only `querySelectorAll(".tag")` is used.
 * @param {number} page - Page number `data` belongs to (shadows the global
 *   `page`). When it equals the stop sentinel the page counter is left alone.
 * @returns {Promise<void>} Resolves once the page has been processed.
 */
async function callback(data, page) {
  // Own-property check: the dictionaries are plain objects, so a category
  // named e.g. "constructor" must not match something on Object.prototype.
  const hasOwn = (dict, key) => Object.prototype.hasOwnProperty.call(dict, key);

  // pause()/stop() replace the page number with Number.MAX_SAFE_INTEGER;
  // that sentinel is not a real page and must not be displayed.
  if (page < Number.MAX_SAFE_INTEGER) {
    tagList.dataset.count = page;
  }
  renderHeading(tagList);

  const touched = new Set();
  for (const tag of data.querySelectorAll(".tag")) {
    const category = tag.dataset.tagCategory || "";
    if (hasOwn(categoriesHide, category) && categoriesHide[category]) {
      continue;
    }
    // Categories without a section of their own end up in "other".
    const subList = hasOwn(tagDivSet, category) ? tagDivSet[category] : tagDivSet["other"];
    subList.appendChild(tag.cloneNode(true)); // deep clone
    subList.dataset.count = 1 + Number.parseInt(subList.dataset.count, 10);
    touched.add(subList);
  }

  // One heading refresh per modified section instead of one per tag.
  for (const subList of touched) {
    renderHeading(subList);
  }
}
/**
 * Refreshes the heading of a section: writes "<category> [<count>]", both
 * read from the section's dataset, into the section's first child.
 *
 * @param {object} subList - Section element whose first child is its heading.
 */
function renderHeading(subList) {
  const { category, count } = subList.dataset;
  subList.firstChild.innerText = `${category} [${count}]`;
}
/**
 * Fetches `/tags?page=N` pages one after the other, starting at `startpage`,
 * and hands each parsed page to `__callback`.
 *
 * The loop counter is the global `page`, which lets `pause()`/`stop()` end the
 * crawl by overwriting it with Number.MAX_SAFE_INTEGER. The crawl ends when:
 *   - `page` has reached that sentinel (checked before every request, so no
 *     request is wasted on a nonsensical page number),
 *   - a request is answered with a non-OK HTTP status (a warning is logged), or
 *   - a page contains no `.tag` element.
 *
 * @param {function(object, number): (Promise<void>|void)} __callback - Called
 *   (and awaited) with the parsed page and the value of `page` at that moment.
 * @returns {Promise<void>} Resolves when the crawl has ended; rejects when
 *   `fetch`, the parsing or the callback throws.
 */
async function crawl(__callback) {
  const urlOf = (pageNumber) => `/tags?page=${pageNumber}`;
  const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  for (page = startpage; page < Number.MAX_SAFE_INTEGER; page++) {
    // Build the URL up front: `page` may be overwritten while we are awaiting.
    const pageUrl = urlOf(page);
    const response = await fetch(pageUrl);
    if (!response.ok) {
      // An error page is not a tag page; do not mistake it for "no more tags".
      console.warn(`Crawl stopped: ${pageUrl} answered with HTTP status ${response.status}.`);
      break;
    }

    const data = parseHTML(await response.text());
    if (data.querySelectorAll(".tag").length === 0) {
      break; // Past the last page.
    }

    await __callback(data, page);
    // Throttle so the server is not hit with back-to-back requests.
    await wait(requestWaitMs);
  }
}
/**
 * Parses the HTML source of a fetched page.
 *
 * `DOMParser` is used rather than assigning `innerHTML` on an element created
 * by the live `document`: the markup ends up in a separate document of its
 * own instead of in nodes owned by the page the script runs on.
 *
 * @param {string} htmlText - HTML source to parse.
 * @returns {Element} The root `<html>` element of the parsed document, to be
 *   queried with `querySelectorAll`.
 */
function parseHTML(htmlText) {
  const parsed = new DOMParser().parseFromString(htmlText, "text/html");
  return parsed.documentElement;
}
/**
 * Inserts the "Clean & Go", "Pause" and "Stop" buttons right above the tag
 * list and wires them to `go`, `pause` and `stop`.
 *
 * The buttons are clones of the site's "Search" button so they pick up its
 * look; when that button cannot be found, plain `<input type="button">`
 * elements are created instead. Nothing is added (and an error is logged)
 * when the page has no attached `.tag-list` element, because none of the
 * buttons could do anything useful without it.
 */
function addButtons() {
  if (!tagList || !tagList.parentNode) {
    console.error("addButtons: no `.tag-list` element found on this page, the crawler buttons were not added.");
    return;
  }

  const buttonSearch = document.querySelector("input[value=\"Search\"]");

  // Builds one button labelled `label` that runs `onClick` when clicked.
  const createButton = (label, onClick) => {
    let button;
    if (buttonSearch) {
      button = buttonSearch.cloneNode(true);
      // Presumably the site disables buttons carrying `data-disable-with` once
      // clicked (TODO confirm); ours must stay clickable.
      delete button.dataset.disableWith;
    } else {
      button = document.createElement("input");
      button.type = "button";
    }
    button.value = label;
    button.addEventListener("click", onClick);
    return button;
  };

  const div = document.createElement("div");
  div.appendChild(createButton("Clean & Go", go));
  div.appendChild(createButton("Pause", pause));
  div.appendChild(createButton("Stop", stop));

  // Insert relative to the list's real parent: this works whether or not the
  // list is a direct child of `#content`.
  tagList.parentNode.insertBefore(div, tagList);
}
/**
 * Entry point: adds the crawler buttons when the current URL contains
 * "/tags"; otherwise tells the user where the script is meant to be run.
 */
function main() {
  const isOnTagsPage = location.href.includes("/tags");
  if (isOnTagsPage) {
    addButtons();
    return;
  }
  alert("The code is supposed to be run on derpibooru, on the page `/tags`.");
}
// Run immediately when the script is executed; the crawl itself only starts
// once the "Clean & Go" button is clicked.
main();
@loxaxs
Copy link
Author

loxaxs commented Sep 16, 2018

I forgot to mention it in the script, but here is how it is used:

  1. Go to "derpibooru.org/tags"
  2. Copy this script, open the console on derpibooru (press F12) and paste it in and execute it (press Enter)
  3. Click on the newly created "Clean & Go" button above the tags to start the crawler
    Note: You can re-invoke old console commands using the keyboard up arrow.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment