Skip to content

Instantly share code, notes, and snippets.

@MTco
Forked from rampadc/kindle-reader.js
Created March 8, 2020 19:43
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save MTco/a3d42a5160a81b120d451a4bc680508e to your computer and use it in GitHub Desktop.
Save MTco/a3d42a5160a81b120d451a4bc680508e to your computer and use it in GitHub Desktop.
Kindle Cloud Reader scraper
// modified based on:
// - https://lowrey.me/scraping-a-book-from-kindle-read-amazon-com/
console.clear();
(function() {
// Set of page-text hashes already captured: keys are hashString() results,
// values are always `true`. Used to skip pages that were scraped before.
var hashes = {};
// Accumulated innerHTML of every captured page; printed by done().
var all = "";
/**
 * Compute a signed 32-bit hash of a string using a 31-based polynomial
 * rolling hash.
 *
 * The value is only used as an in-memory de-duplication key, so callers
 * must not rely on specific hash values, only on equal strings hashing
 * equally.
 *
 * @param {string} str - Text to hash.
 * @returns {number} Signed 32-bit integer; 0 for the empty string.
 */
function hashString(str) {
  let hash = 0;
  for (let i = 0; i < str.length; i++) {
    // Math.imul keeps the multiplication in exact 32-bit integer arithmetic.
    // The earlier Math.pow-based formula exceeded double precision for every
    // character more than a handful of positions from the end, which reset
    // the accumulator to 0 and made long texts with the same ending collide.
    hash = (Math.imul(hash, 31) + str.charCodeAt(i)) | 0;
  }
  return hash;
}
/**
 * Look up the Kindle reader's outer iframe and hand back its document.
 *
 * @returns {Document} `contentDocument` of the `#KindleReaderIFrame` element.
 */
function getKindleBookAppFrame() {
  const readerIframe = document.querySelector("#KindleReaderIFrame");
  return readerIframe.contentDocument;
}
/**
 * Click the reader's right-hand page-turn area, then wait briefly.
 *
 * @returns {Promise<void>} Settles 200 ms after the click has been issued.
 */
function turnPage() {
  const PAGE_TURN_DELAY_MS = 200;
  return new Promise((finish) => {
    const readerDoc = getKindleBookAppFrame();
    const nextPageArea = readerDoc.getElementById(
      "kindleReader_pageTurnAreaRight"
    );
    nextPageArea.click();
    // Resolve only after the delay so the caller does not proceed immediately.
    setTimeout(finish, PAGE_TURN_DELAY_MS);
  });
}
/**
 * Decide whether a block is a heading: either the block itself is an
 * h1-h4 element, or one of its direct child elements is.
 *
 * The earlier version iterated the children with an arrow-function `.each`
 * callback, so `this` was not the child element and `return true` only
 * returned from the callback; child headings were never detected.
 *
 * @param {*} block - Anything accepted by `$()` (element, HTML string, ...).
 * @returns {boolean} True when the block is, or directly contains, h1-h4.
 */
function isHeading(block) {
  const HEADING_SELECTOR = "h1, h2, h3, h4";
  const $block = $(block);
  if ($block.is(HEADING_SELECTOR)) {
    return true;
  }
  // Headings are elements, so filtering the element children is enough;
  // text and comment nodes can never match.
  return $block.children(HEADING_SELECTOR).length > 0;
}
/**
 * List the top-level block elements (direct children of <body>) of one
 * content-frame document: div, h1-h6, ol, ul and li.
 *
 * The frame index used to be read from `currentContentFrameIndex`, a variable
 * that is not defined anywhere in this script, so every call threw a
 * ReferenceError. It is now an optional parameter.
 *
 * @param {Document[]} contentFrames - Documents of the content iframes.
 * @param {number} [frameIndex=0] - Which entry of `contentFrames` to query.
 * @returns {Element[]} Matching elements, in document order.
 */
function getContentFramesSubElements(contentFrames, frameIndex = 0) {
  const BLOCK_TAGS = [
    "div",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "ol",
    "ul",
    "li"
  ];
  const selector = BLOCK_TAGS.map(tag => `body > ${tag}`).join(", ");
  return Array.from(contentFrames[frameIndex].querySelectorAll(selector));
}
/**
 * Capture the currently displayed page, unless its text was captured before.
 *
 * The page body is taken as the second `<body>` (index 1) found inside the
 * nested iframes. NOTE(review): index 1 is inherited from the original
 * script; presumably it is the body of the visible page - confirm.
 *
 * Changes from the earlier version: `frames` is a local (it used to be an
 * undeclared assignment, which overwrote `window.frames`), two unused locals
 * were removed, and a missing body now rejects with a descriptive error
 * instead of an opaque TypeError.
 *
 * @returns {Promise<string[]>} Resolves with an array holding the page's
 *   innerHTML, or an empty array when the page's text hash was already seen.
 *   Rejects when no content body can be found.
 */
function scrapeFrames() {
  return new Promise(resolve => {
    const frames = [];
    const frameBody = $("iframe")
      .contents()
      .find("iframe")
      .contents()
      .find("body")
      .get(1);
    if (!frameBody) {
      // Throwing inside the executor rejects the returned promise.
      throw new Error(
        "Kindle content frame not found; is the book page fully loaded?"
      );
    }
    // De-duplicate on the visible text, but keep the markup for output.
    const hash = hashString(frameBody.innerText);
    if (hashes[hash] === undefined) {
      hashes[hash] = true;
      frames.push(frameBody.innerHTML);
    }
    resolve(frames);
  });
}
/**
 * Turn scraped frames into plain records describing their text.
 *
 * - A `ul`/`ol` frame yields one record per non-blank child, flagged
 *   `is_list_item`.
 * - Any other frame with non-blank text yields one record, flagged
 *   `is_heading` according to isHeading().
 * - Frames whose text is blank are skipped.
 *
 * Changes from the earlier version: every record is a fresh object (one
 * shared object used to be mutated and pushed repeatedly, so all items of a
 * list ended up with the last item's text), `formattedFrames` is a local
 * instead of an implicit global, and the duplicated ul/ol branches are merged.
 *
 * @param {Array} frames - Items accepted by `$()` (elements or HTML strings).
 * @returns {Promise<Array<{is_heading: boolean, is_list_item: boolean, text: string}>>}
 */
function formatFrames(frames) {
  console.log("unformatted");
  console.log(frames);
  return new Promise(resolve => {
    const formattedFrames = [];
    for (const frame of frames) {
      const $frame = $(frame);

      if ($frame.is("ul") || $frame.is("ol")) {
        // Break the list up into one record per non-blank child node.
        Array.from($frame.contents())
          .map(el => $(el).text())
          .filter(itemText => itemText.trim().length !== 0)
          .forEach(itemText => {
            formattedFrames.push({
              is_heading: false,
              is_list_item: true,
              text: itemText
            });
          });
        continue;
      }

      // Otherwise treat the frame as a paragraph or heading.
      const text = $frame.text();
      if (text.trim().length === 0) {
        continue;
      }
      formattedFrames.push({
        is_heading: isHeading(frame),
        is_list_item: false,
        text: text
      });
    }
    resolve(formattedFrames);
  });
}
/**
 * Scrape the current page and run the result through formatFrames().
 *
 * @returns {Promise<Array>} Resolves with the formatted records.
 */
function getFormattedFrames() {
  const scraped = scrapeFrames();
  return scraped.then((rawFrames) => formatFrames(rawFrames));
}
/**
 * Tell whether the reader is showing its "end of sample" message.
 *
 * @returns {boolean} True when `#kindle_sample_end_message` is visible.
 */
function hasReachedEndSample() {
  const readerDoc = getKindleBookAppFrame();
  const sampleEndBanner = readerDoc.getElementById("kindle_sample_end_message");
  const isBannerShown = $(sampleEndBanner).is(":visible");
  return isBannerShown;
}
/**
 * Tell whether the reader footer reports the book as fully read.
 *
 * @returns {boolean} True when the text of `#kindleReader_footer`
 *   contains "100%".
 */
function hasReachedEnd() {
  const footer = getKindleBookAppFrame().getElementById("kindleReader_footer");
  const progressText = footer.innerText;
  return progressText.includes("100%");
}
/**
 * Finish the run by printing everything collected in `all` to the console.
 */
function done() {
  const collected = all;
  console.log(collected);
}
/**
 * Main loop: wait 800 ms, capture the current page, append it to `all`,
 * turn the page, then either finish (end of sample or 100% reached) or
 * schedule the next iteration by calling itself again.
 *
 * NOTE(review): the 800 ms pause presumably gives the reader time to render
 * the page - confirm before changing it.
 *
 * Any failure in the chain is now caught: the error is logged and done() is
 * still called, so the text collected so far is printed instead of the loop
 * stopping silently with an unhandled rejection.
 */
function scrape() {
  setTimeout(() => {
    scrapeFrames()
      .then(frames => {
        // Progress indicator: one dot per processed page.
        console.log(".");
        all += frames.join("\n");
      })
      .then(turnPage)
      .then(() => {
        if (hasReachedEndSample() || hasReachedEnd()) {
          done();
        } else {
          scrape();
        }
      })
      .catch(err => {
        console.error("Kindle scrape aborted:", err);
        // Still output whatever was collected before the failure.
        done();
      });
  }, 800);
}
scrape();
})();
@javierfuentesm
Copy link

Is this still working? I think Amazon now renders an image rather than the text itself.

@ajakate
Copy link

ajakate commented Feb 26, 2023

My guess would be no... Granted, I only spent about 5 minutes looking today, but I tried searching the page source for words from the text I was reading, and none of the text seemed to be in the rendered page at all. The AJAX responses on turning the page seemed to contain long and detailed instructions on how to draw the text.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment