Skip to content

Instantly share code, notes, and snippets.

@depau
Last active March 2, 2024 05:33
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save depau/39dbf43d54d743e2f1781a1eec927128 to your computer and use it in GitHub Desktop.
Save depau/39dbf43d54d743e2f1781a1eec927128 to your computer and use it in GitHub Desktop.
VitalSource web book page scraper

VitalSource web book page scraper

This pair of userscripts (to be used with any userscript manager such as ViolentMonkey) allows scraping books from the VitalSource Bookshelf web reader.

This allows creating a PDF for offline reading with free-software readers such as Calibre.

The "inner" script hooks into the book page nested iframe; it detects when a new page image is loaded and it automatically starts a browser download.

The "outer" script hooks into the main reader page and adds a "Scrape" button which automatically goes to the next page when the inner script has successfully downloaded a page.

Usage

  1. Install both userscripts
  2. Open the book
  3. Wait for the first two downloads (the web reader will prefetch one page in the background while you read)
  4. Click "Scrape this motherfucker" to start auto-advancing
  5. Keep an eye on the reader as it may occasionally present a CAPTCHA; the scraping should resume once you solve it

Additional tools

// ==UserScript==
// @name Scrape VitalSource (inner)
// @namespace http://tampermonkey.net/
// @version 2024-02-16
// @description try to take over the world!
// @author You
// @match https://jigsaw.vitalsource.com/books/*
// @icon https://www.google.com/s2/favicons?sz=64&domain=vitalsource.com
// @sandbox JavaScript
// @grant unsafeWindow
// @grant GM_download
// ==/UserScript==
/**
 * Download a page image through the userscript manager and, once the download
 * has completed, notify the top-level window with a "pageImage" message.
 *
 * The message carries this frame's URL and the image URL. Failed downloads are
 * only logged; no message is posted for them.
 *
 * @param {string} url - URL of the page image to download.
 */
function download(url) {
  console.log("inner frame: downloading:", url);
  GM_download({
    url: url,
    // Every page is saved under the same base name.
    // NOTE(review): presumably "uniquify" makes the manager rename clashes
    // instead of overwriting -- confirm against the userscript manager docs.
    name: "page.jpg",
    saveAs: false,
    conflictAction: "uniquify",
    onerror: function (error) {
      console.error("inner frame: download failed:", url, error);
    },
    onload: function () {
      console.log("inner frame: downloaded:", url);
      window.top.postMessage(
        {
          type: "pageImage",
          frameUrl: window.location.href,
          url: url,
        },
        // targetOrigin is matched by origin only; paths and "*" globs inside
        // a URL are not interpreted, so pass the bare reader origin.
        "https://bookshelf.vitalsource.com"
      );
      console.log("inner frame: message sent");
    },
  });
}
(function () {
  // Number of times the image URL was seen ending in "/800" (treated as the
  // low-resolution variant).
  let lowResAttempts = 0;

  /**
   * Poll the frame for the page image element and hand its URL to download().
   *
   * While the element is missing, retry every 100 ms. While the URL still ends
   * in "/800", retry every 500 ms; from the 25th such sighting on, download the
   * low-resolution image anyway.
   */
  function poll() {
    const pageImage = document.querySelector('img#pbk-page');
    if (pageImage == null) {
      console.log("inner frame: image not found");
      setTimeout(poll, 100);
      return;
    }

    const imageUrl = pageImage.src;
    if (imageUrl.endsWith("/800")) {
      lowResAttempts += 1;
      if (lowResAttempts < 25) {
        console.log("inner frame: low res image, retrying:", imageUrl);
        setTimeout(poll, 500);
        return;
      }
      console.log("inner frame: low res image, download anyway:", imageUrl);
    }

    console.log("inner frame: found image:", imageUrl);
    download(imageUrl);
  }

  poll();
})();
// ==UserScript==
// @name Scrape VitalSource
// @namespace http://tampermonkey.net/
// @version 2024-02-16
// @description try to take over the world!
// @author You
// @match https://bookshelf.vitalsource.com/reader/books/*
// @icon https://www.google.com/s2/favicons?sz=64&domain=vitalsource.com
// @grant window.onurlchange
// @grant unsafeWindow
// ==/UserScript==
/**
 * Look up the element referenced by the first <label> whose trimmed text
 * equals `labelText`.
 *
 * Only the first label with matching text is considered: if it has no
 * (non-empty) "for" attribute the search ends there and null is returned,
 * even when a later label with the same text would resolve.
 *
 * @param {string} labelText - Exact label text to match (after trimming).
 * @returns {Element|null} The element whose id equals the label's "for"
 *   attribute, or null when there is no matching label, no "for" attribute,
 *   or no element with that id.
 */
function findElementByLabelText(labelText) {
  const firstMatch = Array.from(document.querySelectorAll('label')).find(
    (candidate) => candidate.textContent.trim() === labelText
  );
  if (firstMatch === undefined) {
    return null;
  }

  const targetId = firstMatch.getAttribute('for');
  return targetId ? document.getElementById(targetId) : null;
}
(function () {
  // Origin that "pageImage" notifications are accepted from. The inner
  // userscript only runs on https://jigsaw.vitalsource.com/books/*, so any
  // message from another origin is not ours.
  const PAGE_FRAME_ORIGIN = "https://jigsaw.vitalsource.com";
  const IDLE_LABEL = "Scrape this motherfucker";
  const RUNNING_LABEL = "Stop scraping";

  // Payload of the most recent "pageImage" message, or null if none yet.
  let lastUrl = null;
  // Invoked after every accepted "pageImage" message while scraping.
  let lastUrlChangedCallback = null;
  // Page number captured when scraping started; -1 means "not started".
  let currentPage = -1;
  let running = false;

  /** Reset the scraper state and restore the button's idle label. */
  function stop() {
    running = false;
    lastUrlChangedCallback = null;
    currentPage = -1;
    const button = document.querySelector("#scrapeButton");
    if (button != null) {
      button.textContent = IDLE_LABEL;
    }
  }

  window.addEventListener("message", (e) => {
    // Any window can post messages here; only trust the book page frame.
    if (e.origin !== PAGE_FRAME_ORIGIN) return;
    // Message data may be null or a primitive, so guard before reading .type.
    if (e.data == null || e.data.type !== "pageImage") return;
    console.log("Page image event:", e);
    // Exposed on the page window for debugging.
    unsafeWindow.msg = e;
    lastUrl = e.data;
    if (lastUrlChangedCallback) {
      lastUrlChangedCallback();
    }
  });

  /**
   * Advance the reader by one page. Called once when scraping starts and then
   * after every accepted "pageImage" message.
   */
  const worker = function () {
    const goToPageInput = findElementByLabelText('Go to Page');
    if (goToPageInput == null) {
      console.log("Go to page input not found!");
      return;
    }
    // Non-numeric values yield NaN; NaN never equals currentPage or -1, so
    // such pages fall through to the "next" click below.
    const pageNum = Number.parseInt(goToPageInput.value, 10);
    // NOTE(review): currentPage is only assigned on the first run after a
    // start, so this check only filters events while the input still shows
    // the starting page -- confirm whether it should track every page.
    if (pageNum === currentPage) {
      console.log("Skipping duplicate event");
      return;
    }
    if (currentPage === -1) {
      currentPage = pageNum;
      if (currentPage !== 1) {
        // Ask permission to start from a page other than the first
        if (!confirm("Do you want to start from page " + currentPage + "?")) {
          console.log("User cancelled");
          stop();
          return;
        }
      }
    }
    if (lastUrl == null) {
      console.log("No URL found");
      stop();
      return;
    }
    console.log("SCRAPER:", lastUrl.url);
    setTimeout(() => {
      const nextButton = document.querySelector('[aria-label="Next"]');
      if (nextButton == null) {
        console.log("Next button not found!");
        stop();
        return;
      }
      // A disabled "Next" button ends the run.
      if (nextButton.hasAttribute("disabled")) {
        console.log("Next button is disabled");
        stop();
        return;
      }
      nextButton.click();
    }, 10);
  };

  /** Toggle scraping: stop if running, otherwise start and run the worker. */
  const doStuff = function () {
    if (running) {
      console.log("Stopping scraper");
      stop();
      return;
    }
    console.log("Starting scraper, lastUrl:", lastUrl);
    const button = document.querySelector("#scrapeButton");
    running = true;
    lastUrlChangedCallback = worker;
    button.textContent = RUNNING_LABEL;
    worker();
  };

  /**
   * Insert the scrape toggle link right after the "Search across book"
   * control, retrying every second until that control exists.
   */
  const inject = function () {
    const anchorPoint = document.querySelector('[aria-label="Search across book"]');
    if (anchorPoint == null) {
      console.log("Button not found, retrying in 1s");
      setTimeout(inject, 1000);
      return;
    }
    const link = document.createElement("a");
    link.id = "scrapeButton";
    link.href = "#";
    link.textContent = IDLE_LABEL;
    link.addEventListener("click", (event) => {
      // Keep the "#" href from changing the reader's URL on every click.
      event.preventDefault();
      doStuff();
    });
    anchorPoint.after(link);
    console.log("Injected!");
  };

  inject();
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment