Skip to content

Instantly share code, notes, and snippets.

@ggorlen
Created May 20, 2024 22:59
Show Gist options
  • Save ggorlen/46872a1bb569d0119e6574b196a5c48a to your computer and use it in GitHub Desktop.
Save ggorlen/46872a1bb569d0119e6574b196a5c48a to your computer and use it in GitHub Desktop.
Scrape any webpage to Markdown.
const cheerio = require("cheerio"); // ^1.0.0-rc.12
const { JSDOM } = require("jsdom"); // ^24.0.0
const puppeteer = require("puppeteer"); // ^22.7.1
const TurndownService = require("turndown"); // ^7.1.2
const { Readability } = require("@mozilla/readability"); // ^0.5.0
/**
 * Loads `url` in the given page, extracts the main article with Readability
 * and converts it to Markdown.
 *
 * @param {object} page - Page object providing `goto()` and `content()`
 *   (a Puppeteer page in this script).
 * @param {string} url - Address to load; also passed to JSDOM as its `url` option.
 * @returns {Promise<string>} Markdown consisting of the article title as a
 *   level-1 heading followed by the article content.
 * @throws {Error} If Readability cannot extract an article from the page.
 */
const urlToMarkdown = async (page, url) => {
  await page.goto(url, { waitUntil: "networkidle2" });
  const doc = new JSDOM(await page.content(), { url });

  // idea for using readability: https://news.ycombinator.com/item?id=39504105
  const article = new Readability(doc.window.document).parse();
  if (article == null) {
    // parse() yields null when no article could be extracted; fail with a
    // clear message instead of a TypeError on `article.title`.
    throw new Error(`Could not extract readable content from ${url}`);
  }

  const $ = cheerio.load(article.content ?? "");
  // The title is plain text, so insert it via .text() to have it escaped
  // rather than interpolating it into an HTML string.
  $("body").prepend($("<h1>").text(article.title ?? ""));
  // A turndown rule for <br> did not work here, so line breaks inside code
  // are rewritten with cheerio before conversion.
  $("code br, pre br").replaceWith("\n");

  const turndownService = new TurndownService({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
    bulletListMarker: "-",
  });
  // To strip links, a turndown rule with filter "a" that returns its content
  // unchanged could be added here.
  return turndownService.turndown($.html());
};
/**
 * Returns a page of `browser` with request interception enabled so that
 * image, stylesheet and font requests are aborted and all others continue.
 *
 * @param {object} browser - Browser object providing `pages()` and
 *   `newPage()` (a Puppeteer browser in this script).
 * @returns {Promise<object>} The browser's first open page, or a newly
 *   opened page when the browser reports none.
 */
const makePage = async (browser) => {
  // Reuse the first open page when there is one; otherwise open a new page
  // instead of failing on `undefined`.
  const [existingPage] = await browser.pages();
  const page = existingPage ?? (await browser.newPage());
  await page.setRequestInterception(true);

  // Resource types to block; presumably they are not needed to extract text.
  const blockedTypes = new Set(["image", "stylesheet", "font"]);
  page.on("request", async (req) => {
    try {
      if (blockedTypes.has(req.resourceType())) {
        await req.abort();
      } else {
        await req.continue();
      }
    } catch (err) {
      // abort()/continue() can reject; report it rather than leaving an
      // unhandled promise rejection.
      console.warn(`Request interception failed: ${err.message}`);
    }
  });
  return page;
};
// Script entry point: launches a browser, converts one page to Markdown and
// prints it to stdout. The browser handle lives outside the async function so
// the `finally` handler can close it even when a step fails.
let browser;
(async () => {
  // The first CLI argument overrides the default URL.
  const url = process.argv[2] ?? "https://en.wikipedia.org/wiki/Data_warehouse";
  browser = await puppeteer.launch();
  const page = await makePage(browser);
  const markdown = await urlToMarkdown(page, url);
  console.log(markdown);
})()
  .catch((err) => {
    console.error(err);
    // Signal failure to the caller instead of exiting with status 0.
    process.exitCode = 1;
  })
  .finally(() => browser?.close());
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment