Last active
November 16, 2023 15:50
-
-
Save neoPix/87ccd1f10ab810061543ebea18a56e83 to your computer and use it in GitHub Desktop.
Babelio Node Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { gotScraping } from "got-scraping"; | |
import { CookieJar } from "tough-cookie"; | |
import { URLSearchParams, parse as parseUrl } from "url"; | |
import { parse as parseHtml } from "node-html-parser"; | |
/**
 * Decodes a raw response body as windows-1252 text and parses it as HTML.
 * @param {BufferSource} body - The raw bytes of the HTTP response.
 * @returns {import('node-html-parser').HTMLElement} The root of the parsed document.
 */
const parse = (body) => parseHtml(new TextDecoder("windows-1252").decode(body));
/**
 * Origin prepended to every site-relative path requested or returned by the scraper.
 */
const BASE_URL = "https://www.babelio.com";
/** | |
* The editor of a book. | |
* @typedef {Object} Editor | |
* @property {string} name - The name of the editor. | |
* @property {string} link - The url to the editor. | |
*/ | |
/** | |
* The author of a book. | |
* @typedef {Object} Author | |
* @property {string} name - The name of the author. | |
* @property {string} link - The url to the author. | |
*/ | |
/** | |
* A tag. | |
* @typedef {Object} Tag | |
* @property {string} name - The name of the tag. | |
* @property {string} link - The url to the tag. | |
*/ | |
/** | |
* A book from the list. | |
* @typedef {Object} ListBook | |
* @property {string} id - The identifier of the book. | |
* @property {string} title - The title of the book. | |
* @property {string} link - The url to the book. | |
* @property {Editor} editor - The editor of the book.
* @property {Author[]} authors - The list of authors. | |
* @property {string} status - The status of the book. | |
* @property {string?} endRead - The date when the book has been finished. | |
* @property {Tag[]} tags - The tags associated to the book. | |
*/ | |
/** | |
* The full details of a single book.
* @typedef {Object} Book | |
* @property {string} id - The identifier of the book. | |
* @property {string} title - The title of the book. | |
* @property {string} link - The url to the book. | |
* @property {Editor} editor - The editor of the book.
* @property {Author[]} authors - The list of authors. | |
* @property {string} status - The status of the book. | |
* @property {string?} endRead - The date when the book has been finished. | |
* @property {string?} startRead - The date when the book has been started. | |
* @property {string} image - The image of the book. | |
* @property {string} ean - The EAN reference of the book. | |
* @property {boolean} borrowed - Whether the book has been borrowed or not. | |
* @property {boolean} exchangeable - Whether the book can be traded or not. | |
* @property {string} summary - The summary of the book. | |
* @property {number?} pages - The number of page of the book. | |
* @property {Tag[]} tags - The tags associated to the book. | |
* @property {number?} evaluation - The evaluation given to the book. | |
* @property {string?} review - A review of the book. | |
*/ | |
/**
 * Scraper for a Babelio account: logs in with the user's credentials, lists
 * the books of the user's library and reads the detail page of a book.
 *
 * All requests go through a single HTTP client bound to one cookie jar, so the
 * session obtained by {@link BabelIO#auth} is reused by the other calls.
 */
export default class BabelIO {
  /**
   * @type {string}
   */
  #username;
  /**
   * @type {string}
   */
  #password;
  /**
   * @type {import('got-scraping').Got}
   */
  #client;
  /**
   * @type {CookieJar}
   */
  #cookieJar;

  /**
   * Initialize a babelIO scraper
   * @param {string} username the email of the user
   * @param {string} password the password of the user
   */
  constructor(username, password) {
    this.#username = username;
    this.#password = password;
    this.#cookieJar = new CookieJar();
    this.#client = gotScraping.extend({ cookieJar: this.#cookieJar });
  }

  /**
   * Turns an href read from a page into an absolute URL.
   * @param {string | null | undefined} href the raw attribute value
   * @returns {string | undefined} the absolute URL, or undefined when there is no href
   */
  static #absoluteUrl(href) {
    if (href == null) {
      // Avoids producing the literal string "<base>undefined" for missing links.
      return undefined;
    }
    return /^https?:\/\//i.test(href) ? href : `${BASE_URL}${href}`;
  }

  /**
   * Builds a `{ name, link }` record from an anchor element.
   * @param {import('node-html-parser').HTMLElement} anchor the anchor element
   * @returns {{ name: string, link: string | undefined }} the trimmed text and absolute link
   */
  static #toNamedLink(anchor) {
    return {
      name: anchor.text.trim(),
      link: BabelIO.#absoluteUrl(anchor.getAttribute("href")),
    };
  }

  /**
   * Authenticate to Babelio and obtain a session.
   *
   * The login form is posted to `/connection.php`; cookies set by the response
   * are kept in the cookie jar shared with every later request. The response
   * body is not inspected, so rejected credentials are not detected here.
   * @returns {Promise<void>}
   */
  async auth() {
    const authForm = new URLSearchParams({
      Login: this.#username,
      Password: this.#password,
      sub_btn: "connexion",
      ref: "https://www.babelio.com/",
    });
    await this.#client.post(`${BASE_URL}/connection.php`, {
      body: authForm.toString(),
      headers: { "Content-Type": "application/x-www-form-urlencoded" },
    });
  }

  /**
   * Requests the library listing with explicit display/sort parameters and
   * reads the number of the last page from the pagination links.
   *
   * NOTE(review): later page requests only pass `pageN`, so this call
   * presumably stores the display/sort choice in the session — confirm.
   * @returns {Promise<{ lastPage: number }>} the last page number (1 when there is no usable pagination)
   */
  async #configureAndGetLastPage() {
    const { body } = await this.#client.get(
      `${BASE_URL}/mabibliotheque.php?action=toute&affichage=1&tri=titre&sens=croissant`,
      { responseType: "buffer" }
    );
    const paginationLinks = parse(body).querySelectorAll("div.pagination.row a");
    // The second-to-last link carries the highest page number (the last link
    // is presumably a "next" control — TODO confirm). With fewer than two
    // links there is nothing to read and the library fits on one page.
    const lastPageHref = paginationLinks.at(-2)?.getAttribute("href");
    if (lastPageHref == null) {
      return { lastPage: 1 };
    }
    const lastPage = Number(
      new URL(lastPageHref, BASE_URL).searchParams.get("pageN")
    );
    return { lastPage: Number.isInteger(lastPage) && lastPage > 0 ? lastPage : 1 };
  }

  /**
   * Gets a page of books
   * @param {number} page the page number
   * @returns {Promise<ListBook[]>} a page of books
   */
  async #getBooksFromPage(page) {
    const { body } = await this.#client.get(
      `${BASE_URL}/mabibliotheque.php?pageN=${page}`,
      { responseType: "buffer" }
    );
    const bookLines = parse(body).querySelectorAll(
      "div.mes_livres_con table tbody tr"
    );
    return bookLines.map((htmlBookElement) => {
      const [titleElement] = htmlBookElement.querySelectorAll(".titre_livre>a");
      const editorElement = htmlBookElement.querySelector(
        ".titre_livre .titre_livre_editor"
      );
      const authorElements = htmlBookElement.querySelectorAll(".auteur>a");
      const tagsElements = htmlBookElement.querySelectorAll(".etiquette>a");
      const statusElement = htmlBookElement.querySelector(".statut");
      // Value of the row checkbox; only used to locate the end-of-read input
      // of this row. The returned `id` comes from the book link instead.
      const rowId = htmlBookElement
        .querySelector(".check input")
        ?.getAttribute("value");
      const bookHref = titleElement.getAttribute("href");
      return {
        id: bookHref.split("/").at(-1),
        title: titleElement.querySelector("h2").text.trim(),
        link: BabelIO.#absoluteUrl(bookHref),
        editor: {
          name: editorElement?.text.trim(),
          // The editor label sits inside the anchor that holds the link.
          link: BabelIO.#absoluteUrl(
            editorElement?.parentNode.getAttribute("href")
          ),
        },
        authors: authorElements.map((htmlAuthor) =>
          BabelIO.#toNamedLink(htmlAuthor)
        ),
        status: statusElement.querySelector(".livre_action_status_1")?.text,
        endRead: statusElement
          .querySelector(`#dtFin_${rowId}`)
          ?.getAttribute("value"),
        // The last `.etiquette>a` link is dropped — presumably an action link
        // rather than a tag; TODO confirm against the page markup.
        tags: tagsElements
          .slice(0, -1)
          .map((tagElement) => BabelIO.#toNamedLink(tagElement)),
      };
    });
  }

  /**
   * Gets the books of the user
   * @returns {Promise<ListBook[]>} A list of books
   */
  async books() {
    const { lastPage } = await this.#configureAndGetLastPage();
    const pages = [];
    // Pages are fetched one after the other, never in parallel.
    for (let i = 1; i <= lastPage; i++) {
      pages.push(await this.#getBooksFromPage(i));
    }
    return pages.flat();
  }

  /**
   * Gets the summary of a book from its parsed detail page.
   *
   * When the page has a `#d_bio>span>a` link, the last argument of its
   * `onclick` handler is posted to `/aj_voir_plus_a.php` and the returned
   * markup is stripped of tags. Otherwise the text of `#d_bio` is used
   * (presumably the summary is then already fully inline — TODO confirm).
   * @param {import('node-html-parser').HTMLElement} document the parsed book page
   * @returns {Promise<string>} the plain-text summary, possibly empty
   */
  async #getBookSummary(document) {
    const objectId = document
      .querySelector("#d_bio>span>a")
      ?.getAttribute("onclick")
      ?.split(",")
      .at(-1)
      .replace(");", "");
    if (!objectId) {
      return document.querySelector("#d_bio")?.text.trim() ?? "";
    }
    const summaryProps = new URLSearchParams({ type: "1", id_obj: objectId });
    const { body: summary } = await this.#client.post(
      `${BASE_URL}/aj_voir_plus_a.php`,
      {
        responseType: "buffer",
        headers: { "Content-Type": "application/x-www-form-urlencoded" },
        body: summaryProps.toString(),
      }
    );
    return new TextDecoder("windows-1252")
      .decode(summary)
      .replace(/<[^>]*>?/gm, "")
      .trim();
  }

  /**
   * Reads the user-specific data of a book from its library edit page.
   * @param {string} libraryId the `id_biblio` value of the book in the user's library
   * @returns {Promise<{ startRead: (string|undefined), endRead: (string|undefined), evaluation: (number|undefined), tags: Tag[], review: (string|undefined) }>}
   */
  async #getBookUserData(libraryId) {
    const { body } = await this.#client.get(
      `${BASE_URL}/ajoutinfoslivre.php?id_biblio=${libraryId}`,
      { responseType: "buffer" }
    );
    const document = parse(body);
    const evaluation = document
      .querySelector(`#unit_longDUR${libraryId}`)
      ?.getAttribute("data-rateit-value");
    return {
      startRead: document
        .querySelector("#datepicker_deb")
        ?.getAttribute("value"),
      endRead: document.querySelector("#datepicker_fin")?.getAttribute("value"),
      evaluation: evaluation ? Number(evaluation) : undefined,
      tags: document
        .querySelectorAll('a[href*="/etiquettes"]')
        .map((htmlTag) => BabelIO.#toNamedLink(htmlTag)),
      review: document.getElementById("Critique")?.text,
    };
  }

  /**
   * Gets the details of a book
   * @param {string} bookId the identifier of the book
   * @returns {Promise<Book>} The book details
   * @throws {Error} when the page does not contain the expected book header
   */
  async book(bookId) {
    const { body } = await this.#client.get(`${BASE_URL}/livres/-/${bookId}`, {
      responseType: "buffer",
    });
    const document = parse(body);
    const headerElement = document.querySelector(
      '.livre_header_con [itemprop="name"]'
    );
    const link = headerElement?.querySelector(">a")?.getAttribute("href");
    if (link == null) {
      throw new Error(`Unable to read the book page for id "${bookId}"`);
    }
    const authorsElements = document.querySelectorAll(
      'span[itemprop="author"]>a'
    );
    const metaElement = document.querySelector(".livre_refs");
    const metaText = metaElement?.text ?? "";
    const editorElement = metaElement?.querySelector("a");
    const ean = metaText.match(/EAN : (?<ean>[0-9]{13})/)?.groups.ean;
    const pages = metaText.match(/(?<pages>[0-9]+) pages /)?.groups.pages;
    // The link to the library edit page ends with "=<id_biblio>"; it is absent
    // when the page offers no such link, in which case no user data is read.
    const libraryId = document
      .querySelector('.titre a[href*="/ajoutinfoslivre"]')
      ?.getAttribute("href")
      ?.split("=")
      .at(-1);
    // A missing status block means neither flag below is set.
    const metaStatus =
      document.querySelector(".livre_action_status_2")?.text.trim() ?? "";
    const [summary, tagsAndPeriod] = await Promise.all([
      this.#getBookSummary(document),
      libraryId
        ? this.#getBookUserData(libraryId)
        : {
            startRead: undefined,
            endRead: undefined,
            evaluation: undefined,
            tags: [],
            review: undefined,
          },
    ]);
    return {
      id: link.split("/").at(-1),
      title: headerElement.text.trim(),
      image: document
        .querySelector('.livre_con [itemprop="image"]')
        ?.getAttribute("src"),
      ean,
      pages: pages ? Number(pages) : undefined,
      link: BabelIO.#absoluteUrl(link),
      editor: {
        name: editorElement?.text.trim(),
        link: BabelIO.#absoluteUrl(editorElement?.getAttribute("href")),
      },
      authors: authorsElements.map((authorElement) =>
        BabelIO.#toNamedLink(authorElement)
      ),
      status: document.querySelector(".livre_action_status_1")?.text,
      borrowed: metaStatus.includes("Emprunté"),
      exchangeable: metaStatus.includes("À échanger"),
      summary,
      ...tagsAndPeriod,
    };
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import BabelIO from "./babelio.js"; | |
/**
 * Example run: authenticates, prints the user's book list, then prints the
 * details of one book.
 *
 * Credentials are taken from the BABELIO_USERNAME / BABELIO_PASSWORD
 * environment variables when set; otherwise the placeholder values are used.
 * @returns {Promise<void>}
 */
const main = async () => {
  const babelio = new BabelIO(
    process.env.BABELIO_USERNAME ?? "My username",
    // Double quotes: the placeholder contains an apostrophe.
    process.env.BABELIO_PASSWORD ?? "It's password"
  );
  await babelio.auth();
  console.log(await babelio.books());
  console.log(await babelio.book("837687"));
};

main()
  .then(() => console.log("Done !"))
  .catch((e) => {
    console.error(e);
    // Non-zero exit code so callers can detect the failure.
    process.exit(10);
  });
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "babelio-scraper", | |
"version": "1.0.0", | |
"description": "", | |
"main": "src/main.js", | |
"type": "module", | |
"scripts": { | |
"start": "node ./main.js" | |
}, | |
"author": "", | |
"license": "ISC", | |
"dependencies": { | |
"got-scraping": "^3.2.9", | |
"node-html-parser": "^5.3.3", | |
"tough-cookie": "^4.0.0" | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi @neoPix
I tried to implement my own scraping tool for Babelio in Rust, but after a few tests, I can't access babelio.com anymore (it times out). It seems Babelio has blocked my IP address. :(
Did you have the same problem? Maybe they would not have banned my IP if I had been logged in?
It's been 3 days and my IP is still blocked. I didn't find any mention of this on the Babelio website, nor anyone else who has been blocked like me.
Any help is appreciated ! :)