Skip to content

Instantly share code, notes, and snippets.

@neoPix
Last active November 16, 2023 15:50
Show Gist options
  • Save neoPix/87ccd1f10ab810061543ebea18a56e83 to your computer and use it in GitHub Desktop.
Save neoPix/87ccd1f10ab810061543ebea18a56e83 to your computer and use it in GitHub Desktop.
Babelio Node Scraper
import { gotScraping } from "got-scraping";
import { CookieJar } from "tough-cookie";
import { URLSearchParams, parse as parseUrl } from "url";
import { parse as parseHtml } from "node-html-parser";
/** Origin that every Babelio path used by the scraper is appended to. */
const BASE_URL = "https://www.babelio.com";
/**
 * Turns a raw response body into a parsed HTML tree.
 * The bytes are decoded as windows-1252 before being handed to the HTML parser.
 * @param {BufferSource} body - The undecoded bytes of the response.
 * @returns {import('node-html-parser').HTMLElement} The root of the parsed document.
 */
const parse = (body) => {
  const decoder = new TextDecoder("windows-1252");
  const markup = decoder.decode(body);
  return parseHtml(markup);
};
/**
* The editor of a book.
* @typedef {Object} Editor
* @property {string} name - The name of the editor.
* @property {string} link - The url to the editor.
*/
/**
* The author of a book.
* @typedef {Object} Author
* @property {string} name - The name of the author.
* @property {string} link - The url to the author.
*/
/**
* A tag.
* @typedef {Object} Tag
* @property {string} name - The name of the tag.
* @property {string} link - The url to the tag.
*/
/**
* A book from the list.
* @typedef {Object} ListBook
* @property {string} id - The identifier of the book.
* @property {string} title - The title of the book.
* @property {string} link - The url to the book.
* @property {Editor} editor - The editor (publisher) of the book.
* @property {Author[]} authors - The list of authors.
* @property {string} status - The status of the book.
* @property {string?} endRead - The date when the book has been finished.
* @property {Tag[]} tags - The tags associated to the book.
*/
/**
* The full details of a single book.
* @typedef {Object} Book
* @property {string} id - The identifier of the book.
* @property {string} title - The title of the book.
* @property {string} link - The url to the book.
* @property {Editor} editor - The editor (publisher) of the book.
* @property {Author[]} authors - The list of authors.
* @property {string} status - The status of the book.
* @property {string?} endRead - The date when the book has been finished.
* @property {string?} startRead - The date when the book has been started.
* @property {string} image - The image of the book.
* @property {string} ean - The EAN reference of the book.
* @property {boolean} borrowed - Whether the book has been borrowed or not.
* @property {boolean} exchangeable - Whether the book can be traded or not.
* @property {string} summary - The summary of the book.
* @property {number?} pages - The number of page of the book.
* @property {Tag[]} tags - The tags associated to the book.
* @property {number?} evaluation - The evaluation given to the book.
* @property {string?} review - A review of the book.
*/
/**
 * Scraper for the personal library of a Babelio account.
 *
 * Every request goes through a single HTTP client bound to one cookie jar, so
 * the cookies received by `auth()` are sent with the later calls.
 */
export default class BabelIO {
  /**
   * Login sent in the `Login` field of the authentication form.
   * @type {string}
   */
  #username;
  /**
   * Password sent in the `Password` field of the authentication form.
   * @type {string}
   */
  #password;
  /**
   * HTTP client shared by every request of this instance.
   * @type {import('got-scraping').Got}
   */
  #client;
  /**
   * Cookie jar attached to the client.
   * @type {CookieJar}
   */
  #cookieJar;

  /**
   * Initialize a babelIO scraper
   * @param {string} username the email of the user
   * @param {string} password the password of the user
   */
  constructor(username, password) {
    this.#username = username;
    this.#password = password;
    this.#cookieJar = new CookieJar();
    this.#client = gotScraping.extend({ cookieJar: this.#cookieJar });
  }

  /**
   * Turns an `href` attribute into an absolute Babelio URL.
   * @param {string | undefined} href the raw attribute value
   * @returns {string | undefined} the absolute url, or `undefined` when there is no href
   */
  static #absoluteUrl(href) {
    // A missing attribute must not be glued to the base url ("...comundefined").
    if (!href) {
      return undefined;
    }
    return /^https?:\/\//i.test(href) ? href : `${BASE_URL}${href}`;
  }

  /**
   * Converts an anchor element into a `{ name, link }` record.
   * @param {import('node-html-parser').HTMLElement} element the anchor element
   * @returns {{ name: string, link: string | undefined }} the trimmed text and absolute link
   */
  static #toNamedLink(element) {
    return {
      name: element.text.trim(),
      link: BabelIO.#absoluteUrl(element.getAttribute("href")),
    };
  }

  /**
   * Authenticate to Babelio and obtain a session
   * @returns {Promise<void>} settles once the login form has been posted
   */
  async auth() {
    const authForm = new URLSearchParams({
      Login: this.#username,
      Password: this.#password,
      sub_btn: "connexion",
      ref: `${BASE_URL}/`,
    });
    await this.#client.post(`${BASE_URL}/connection.php`, {
      body: authForm.toString(),
      headers: { "Content-Type": "application/x-www-form-urlencoded" },
    });
  }

  /**
   * Requests the library with its display options (all books, sorted by
   * ascending title) and reads the number of the last page from the pagination.
   * @returns {Promise<{ lastPage: number }>} the last page number, 1 when it cannot be read
   */
  async #configureAndGetLastPage() {
    const { body } = await this.#client.get(
      `${BASE_URL}/mabibliotheque.php?action=toute&affichage=1&tri=titre&sens=croissant`,
      { responseType: "buffer" }
    );
    const paginationLinks = parse(body).querySelectorAll(
      "div.pagination.row a"
    );
    // The second-to-last link carries the highest page number; with zero or
    // one link there is nothing to read and the library fits on one page.
    const href = paginationLinks.at(-2)?.getAttribute("href");
    if (!href) {
      return { lastPage: 1 };
    }
    const lastPage = Number(new URL(href, BASE_URL).searchParams.get("pageN"));
    return {
      lastPage: Number.isInteger(lastPage) && lastPage > 0 ? lastPage : 1,
    };
  }

  /**
   * Gets a page of books
   * @param {number} page the page number
   * @returns {Promise<ListBook[]>} a page of books
   */
  async #getBooksFromPage(page) {
    const { body } = await this.#client.get(
      `${BASE_URL}/mabibliotheque.php?pageN=${page}`,
      { responseType: "buffer" }
    );
    const rows = parse(body).querySelectorAll(
      "div.mes_livres_con table tbody tr"
    );
    return rows.flatMap((row) => {
      const [titleElement] = row.querySelectorAll(".titre_livre>a");
      // A row without a title link cannot be described as a book: skip it
      // instead of failing the whole page.
      if (!titleElement) {
        return [];
      }
      const href = titleElement.getAttribute("href");
      const editorElement = row.querySelector(
        ".titre_livre .titre_livre_editor"
      );
      const statusElement = row.querySelector(".statut");
      // Value of the row checkbox; it is the suffix of the end-date input id.
      const libraryId = row.querySelector(".check input")?.getAttribute("value");
      return [
        {
          id: href.split("/").at(-1),
          title: (titleElement.querySelector("h2") ?? titleElement).text.trim(),
          link: BabelIO.#absoluteUrl(href),
          editor: {
            name: editorElement?.text.trim(),
            link: BabelIO.#absoluteUrl(
              editorElement?.parentNode.getAttribute("href")
            ),
          },
          authors: row
            .querySelectorAll(".auteur>a")
            .map((authorElement) => BabelIO.#toNamedLink(authorElement)),
          status: statusElement?.querySelector(".livre_action_status_1")?.text,
          endRead: statusElement
            ?.querySelector(`#dtFin_${libraryId}`)
            ?.getAttribute("value"),
          // The last link of the tag cell is dropped: presumably an action
          // link rather than a tag (kept from the original behaviour).
          tags: row
            .querySelectorAll(".etiquette>a")
            .slice(0, -1)
            .map((tagElement) => BabelIO.#toNamedLink(tagElement)),
        },
      ];
    });
  }

  /**
   * Gets the books of the user
   * @returns {Promise<ListBook[]>} A list of books
   */
  async books() {
    const { lastPage } = await this.#configureAndGetLastPage();
    const pages = [];
    // Pages are requested one after the other, never in parallel.
    for (let i = 1; i <= lastPage; i++) {
      pages.push(await this.#getBooksFromPage(i));
    }
    return pages.flat();
  }

  /**
   * Gets the summary of a book from its parsed page.
   * @param {import('node-html-parser').HTMLElement} document the parsed book page
   * @returns {Promise<string>} the summary as plain text, '' when none is found
   */
  async #getBookSummary(document) {
    const onclick = document
      .querySelector("#d_bio>span>a")
      ?.getAttribute("onclick");
    if (!onclick) {
      // No expand link: presumably the whole summary is already inline.
      return document.querySelector("#d_bio")?.text.trim() ?? "";
    }
    const summaryProps = new URLSearchParams();
    summaryProps.append("type", "1");
    // The object id is the last argument of the onclick handler call.
    summaryProps.append(
      "id_obj",
      onclick.split(",").at(-1).replace(");", "").trim()
    );
    const { body: summary } = await this.#client.post(
      `${BASE_URL}/aj_voir_plus_a.php`,
      {
        responseType: "buffer",
        headers: { "Content-Type": "application/x-www-form-urlencoded" },
        body: summaryProps.toString(),
      }
    );
    // The endpoint answers with an HTML fragment: drop the tags, keep the text.
    return new TextDecoder("windows-1252")
      .decode(summary)
      .replace(/<[^>]*>?/gm, "")
      .trim();
  }

  /**
   * Gets the data the user attached to a book of the library.
   * @param {string} libraryId the identifier of the library entry
   * @returns {Promise<{ startRead?: string, endRead?: string, evaluation?: number, tags: Tag[], review?: string }>} the user data
   */
  async #getBookUserData(libraryId) {
    const { body } = await this.#client.get(
      `${BASE_URL}/ajoutinfoslivre.php?id_biblio=${libraryId}`,
      { responseType: "buffer" }
    );
    const document = parse(body);
    const evaluation = document
      .querySelector(`#unit_longDUR${libraryId}`)
      ?.getAttribute("data-rateit-value");
    const startRead = document
      .querySelector("#datepicker_deb")
      ?.getAttribute("value");
    const endRead = document
      .querySelector("#datepicker_fin")
      ?.getAttribute("value");
    // Same shape as the tags of the list view: trimmed name, absolute link.
    const tags = document
      .querySelectorAll('a[href*="/etiquettes"]')
      .map((tagElement) => BabelIO.#toNamedLink(tagElement));
    const review = document.getElementById("Critique")?.text;
    return {
      startRead,
      endRead,
      evaluation: evaluation ? Number(evaluation) : undefined,
      tags,
      review,
    };
  }

  /**
   * Gets the details of a book
   * @param {string} bookId the identifier of the book
   * @returns {Promise<Book>} The book details
   * @throws {Error} when the page does not contain a book header
   */
  async book(bookId) {
    const { body } = await this.#client.get(`${BASE_URL}/livres/-/${bookId}`, {
      responseType: "buffer",
    });
    const document = parse(body);
    const headerElement = document.querySelector(
      '.livre_header_con [itemprop="name"]'
    );
    if (!headerElement) {
      throw new Error(
        `Unable to locate the header of book "${bookId}" in the page`
      );
    }
    const link = headerElement.querySelector(">a").getAttribute("href");

    const metaElement = document.querySelector(".livre_refs");
    const metaText = metaElement?.text ?? "";
    const editorElement = metaElement?.querySelector("a");
    const ean = metaText.match(/EAN : (?<ean>[0-9]{13})/)?.groups.ean;
    const pages = metaText.match(/(?<pages>[0-9]+) pages /)?.groups.pages;

    // The library entry id is read from the link to the user-data page; when
    // that link is absent the user data is simply left empty.
    const libraryId = document
      .querySelector('.titre a[href*="/ajoutinfoslivre"]')
      ?.getAttribute("href")
      ?.split("=")
      .at(-1);
    // Falls back to '' so the includes() checks below never hit undefined.
    const metaStatus =
      document.querySelector(".livre_action_status_2")?.text.trim() ?? "";

    const [summary, userData] = await Promise.all([
      this.#getBookSummary(document),
      libraryId
        ? this.#getBookUserData(libraryId)
        : {
            startRead: undefined,
            endRead: undefined,
            evaluation: undefined,
            tags: [],
            review: undefined,
          },
    ]);
    return {
      id: link.split("/").at(-1),
      title: headerElement.text.trim(),
      image: document
        .querySelector('.livre_con [itemprop="image"]')
        ?.getAttribute("src"),
      ean,
      pages: pages ? Number(pages) : undefined,
      link: BabelIO.#absoluteUrl(link),
      editor: {
        name: editorElement?.text.trim(),
        link: BabelIO.#absoluteUrl(editorElement?.getAttribute("href")),
      },
      authors: document
        .querySelectorAll('span[itemprop="author"]>a')
        .map((authorElement) => {
          const { name, link: authorLink } = BabelIO.#toNamedLink(authorElement);
          return { link: authorLink, name };
        }),
      status: document.querySelector(".livre_action_status_1")?.text,
      borrowed: metaStatus.includes("Emprunté"),
      exchangeable: metaStatus.includes("À échanger"),
      summary,
      ...userData,
    };
  }
}
import BabelIO from "./babelio.js";
/**
 * Demo entry point: logs in, then prints the whole library followed by the
 * details of one book.
 * @returns {Promise<void>} settles once both results have been printed
 */
const main = async () => {
  // Double quotes are required here: the placeholder password contains an
  // apostrophe, which ended the former single-quoted literal too early.
  const babelio = new BabelIO("My username", "It's password");
  await babelio.auth();
  console.log(await babelio.books());
  console.log(await babelio.book("837687"));
};
main()
  .then(() => console.log("Done !"))
  .catch((e) => {
    // Any failure is reported and turned into a non-zero exit code.
    console.error(e);
    process.exit(10);
  });
{
"name": "babelio-scraper",
"version": "1.0.0",
"description": "",
"main": "src/main.js",
"type": "module",
"scripts": {
"start": "node ./main.js"
},
"author": "",
"license": "ISC",
"dependencies": {
"got-scraping": "^3.2.9",
"node-html-parser": "^5.3.3",
"tough-cookie": "^4.0.0"
}
}
@pixelshot91
Copy link

Hi @neoPix
I tried to implement my own scraping tool for Babelio in Rust, but after a few tests I can't access babelio.com anymore (requests time out). It seems Babelio has blocked my IP address. :(
Did you have the same problem? Maybe they would not have banned my IP if I had been logged in?
It's been 3 days and my IP is still blocked. I didn't find any mention of this on the Babelio website, or anyone else who has been blocked like me.
Any help is appreciated ! :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment