Skip to content

Instantly share code, notes, and snippets.

@rampadc
Created July 28, 2019 15:08
Show Gist options
  • Save rampadc/f3c92359d75e0d72bee496fc7f79b316 to your computer and use it in GitHub Desktop.
Save rampadc/f3c92359d75e0d72bee496fc7f79b316 to your computer and use it in GitHub Desktop.
Kindle Cloud Reader scraper
// Adapted from:
// - https://lowrey.me/scraping-a-book-from-kindle-read-amazon-com/
console.clear();
(function() {
// Fingerprints of page texts that were already captured; scrapeFrames()
// consults and updates this map so a page is only stored once.
const hashes = {};
// Running concatenation of every captured page's HTML; printed by done().
let all = "";
/**
 * Compute a signed 32-bit polynomial hash (base 31) of a string.
 *
 * scrapeFrames() uses the result as a fingerprint of a page's text so that a
 * page which was already captured is not stored twice. Every character of the
 * input contributes to the result, so two long pages that merely share the
 * same ending still get different fingerprints.
 *
 * @param {string} str - Text to fingerprint.
 * @returns {number} Signed 32-bit integer; 0 for the empty string.
 */
function hashString(str) {
  let hash = 0;
  for (let i = 0; i < str.length; i++) {
    // Math.imul keeps the multiplication in 32-bit integer space. A floating
    // point power/product would overflow to Infinity (or lose its low bits)
    // on page-sized strings and wipe out the contribution of early characters.
    hash = (Math.imul(hash, 31) + str.charCodeAt(i)) | 0;
  }
  return hash;
}
/**
 * Look up the reader iframe (`#KindleReaderIFrame`) in the top-level page and
 * hand back the document loaded inside it.
 *
 * @returns {Document} The iframe's `contentDocument`.
 */
function getKindleBookAppFrame() {
  const readerIframe = document.querySelector("#KindleReaderIFrame");
  return readerIframe.contentDocument;
}
/**
 * Advance the reader by one page: click the right-hand page-turn area inside
 * the reader document, then wait 200 ms before settling.
 *
 * @returns {Promise<void>} Resolves once the 200 ms timer fires.
 */
function turnPage() {
  const SETTLE_DELAY_MS = 200;
  return new Promise((settled) => {
    const readerDocument = getKindleBookAppFrame();
    const nextPageArea = readerDocument.getElementById(
      "kindleReader_pageTurnAreaRight"
    );
    nextPageArea.click();
    setTimeout(settled, SETTLE_DELAY_MS);
  });
}
/**
 * Decide whether a block is a heading.
 *
 * A block counts as a heading when it is itself an h1-h4 element, or when one
 * of its direct child elements is an h1-h4.
 *
 * @param {*} block - Anything jQuery can wrap (element, HTML string, ...).
 * @returns {boolean} True when the block is, or directly contains, an h1-h4.
 */
function isHeading(block) {
  const HEADING_SELECTOR = "h1, h2, h3, h4";
  const $block = $(block);
  if ($block.is(HEADING_SELECTOR)) {
    return true;
  }
  // Ask jQuery for matching children directly. Returning a value from inside
  // an .each() callback would only end that callback, not this function, and
  // an arrow callback would not receive the child element as `this`.
  return $block.children(HEADING_SELECTOR).length > 0;
}
/**
 * Collect the top-level block elements (div, h1-h6, ol, ul, li that are direct
 * children of <body>) from one of the given content documents.
 *
 * @param {Array<Document>} contentFrames - Documents to pick from.
 * @param {number} [frameIndex] - Index of the document to query. When omitted
 *   it falls back to a `currentContentFrameIndex` variable if one exists in
 *   scope, and to 0 otherwise. No such variable is declared in this script,
 *   so reading it unconditionally would throw a ReferenceError.
 * @returns {Array<Element>} Matching elements as a real array.
 */
function getContentFramesSubElements(
  contentFrames,
  frameIndex = typeof currentContentFrameIndex !== "undefined"
    ? currentContentFrameIndex
    : 0
) {
  const blockTags = ["div", "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "li"];
  const selector = blockTags.map((tag) => `body > ${tag}`).join(", ");
  return Array.from(contentFrames[frameIndex].querySelectorAll(selector));
}
/**
 * Capture the HTML of the currently displayed page, unless a page with the
 * same text was captured before.
 *
 * The page body is located through jQuery by descending two iframe levels
 * from the top document and taking the second matching <body>.
 * NOTE(review): index 1 presumably selects the frame holding the visible
 * page; confirm against the live reader markup.
 *
 * @returns {Promise<string[]>} Resolves with a one-element array holding the
 *   body's innerHTML when its text has not been seen yet, or an empty array
 *   when it is a duplicate. Rejects with an Error when no page body is found.
 */
function scrapeFrames() {
  return new Promise((resolve) => {
    // Declared locally: a bare `frames = []` would create or overwrite a
    // global (`window.frames` in a browser) and throws in strict mode.
    const frames = [];
    const frameBody = $("iframe")
      .contents()
      .find("iframe")
      .contents()
      .find("body")
      .get(1);
    if (!frameBody) {
      // Thrown inside the executor, so it surfaces as a rejection.
      throw new Error("Kindle page body not found; is the book page loaded?");
    }
    const hash = hashString(frameBody.innerText);
    if (hashes[hash] === undefined) {
      hashes[hash] = true;
      frames.push(frameBody.innerHTML);
    }
    resolve(frames);
  });
}
/**
 * Turn scraped frames into plain records describing their text.
 *
 * - A frame that is a <ul> or <ol> yields one record per non-blank child
 *   node, each flagged `is_list_item`.
 * - Any other frame yields a single record with its text, flagged
 *   `is_heading` according to isHeading(); frames with blank text are skipped.
 *
 * Every record is a fresh object, so list items keep their own text.
 *
 * @param {Array<*>} frames - Items jQuery can wrap (scrapeFrames() supplies
 *   HTML strings).
 * @returns {Promise<Array<{is_heading: boolean, is_list_item: boolean, text: string}>>}
 *   Resolves with the records in input order.
 */
function formatFrames(frames) {
  console.log("unformatted");
  console.log(frames);
  return new Promise((resolve) => {
    // Local on purpose: an undeclared assignment would leak a global.
    const formattedFrames = [];
    for (const frame of frames) {
      const $frame = $(frame);
      if ($frame.is("ul, ol")) {
        // Break the list up into its non-blank child nodes.
        Array.from($frame.contents())
          .map((el) => $(el).text())
          .filter((itemText) => itemText.trim().length !== 0)
          .forEach((itemText) => {
            formattedFrames.push({
              is_heading: false,
              is_list_item: true,
              text: itemText,
            });
          });
        continue;
      }
      // Otherwise, treat the frame as a paragraph (or heading).
      const text = $frame.text();
      if (text.trim().length === 0) {
        continue;
      }
      formattedFrames.push({
        is_heading: isHeading(frame),
        is_list_item: false,
        text,
      });
    }
    resolve(formattedFrames);
  });
}
/**
 * Scrape the current page and pass the result through formatFrames().
 *
 * @returns {Promise<Array>} Resolves with whatever formatFrames() produces
 *   for the frames delivered by scrapeFrames().
 */
function getFormattedFrames() {
  const scrapedFrames = scrapeFrames();
  return scrapedFrames.then(formatFrames);
}
/**
 * Report whether the element with id `kindle_sample_end_message` inside the
 * reader document is currently visible (per jQuery's `:visible`).
 *
 * @returns {boolean} True when that element is visible.
 */
function hasReachedEndSample() {
  const readerDocument = getKindleBookAppFrame();
  const sampleEndBanner = readerDocument.getElementById(
    "kindle_sample_end_message"
  );
  const $banner = $(sampleEndBanner);
  return $banner.is(":visible");
}
/**
 * Report whether the reader footer (`#kindleReader_footer`) currently shows
 * the text "100%".
 *
 * @returns {boolean} True when the footer's innerText contains "100%".
 */
function hasReachedEnd() {
  const readerDocument = getKindleBookAppFrame();
  const footer = readerDocument.getElementById("kindleReader_footer");
  const progressText = footer.innerText;
  return progressText.includes("100%");
}
/**
 * Finish the run by printing everything accumulated in `all` to the console.
 */
function done() {
  const collectedOutput = all;
  console.log(collectedOutput);
}
/**
 * Main loop: capture the current page, then either finish or turn the page
 * and repeat.
 *
 * Each round waits 800 ms before scraping, appends the captured HTML to
 * `all`, and only then checks for the end of the book/sample. Checking
 * before turning the page ensures the final page (the one on which the end
 * condition becomes true) is captured as well. If any step fails, the error
 * is reported and the text collected so far is still printed via done().
 */
function scrape() {
  const PAGE_RENDER_DELAY_MS = 800;
  setTimeout(() => {
    scrapeFrames()
      .then((frames) => {
        console.log("."); // progress indicator, one dot per round
        all += frames.join("\n");
      })
      .then(() => {
        const atEnd = hasReachedEndSample() || hasReachedEnd();
        // Returning the page-turn chain keeps its failures inside this
        // chain so the handler below sees them.
        return atEnd ? done() : turnPage().then(scrape);
      })
      .catch((err) => {
        console.error("Scraping stopped early:", err);
        done();
      });
  }, PAGE_RENDER_DELAY_MS);
}
scrape();
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment