Skip to content

Instantly share code, notes, and snippets.

@alavkx
Created January 8, 2024 22:36
Show Gist options
  • Save alavkx/8074ec34d842f9f95d1cffbb6e1e7ea4 to your computer and use it in GitHub Desktop.
Save alavkx/8074ec34d842f9f95d1cffbb6e1e7ea4 to your computer and use it in GitHub Desktop.
Simple Puppeteer Web Scraper
{
"name": "some-project",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"scrape": "npx tsx ./src/scrape.ts"
},
"author": "",
"license": "ISC",
"dependencies": {
"puppeteer": "^21.7.0",
"puppeteer-cluster": "^0.23.0",
"typescript": "^5.3.3"
}
}
import { Page } from "puppeteer";
import { promises as fsPromises, readdirSync } from "fs";
import * as path from "path";
import * as url from "url";
import { Cluster } from "puppeteer-cluster";
// Upper bound on parallel workers; passed as `maxConcurrency` to Cluster.launch in crawlWebsite.
const MAX_CONCURRENCY = 5;
/**
 * Normalises a scraped `href` for crawling.
 *
 * @param href - Raw value read from an anchor element; may be anything.
 * @param origin - Origin (scheme + host + port) the crawl is restricted to.
 * @returns The absolute URL without its fragment, or `undefined` when `href`
 *   is not a string, is not a parseable URL, or belongs to another origin.
 */
const toSameOriginUrl = (href: unknown, origin: string): string | undefined => {
  if (typeof href !== "string" || href === "") return undefined;
  let parsed: URL;
  try {
    parsed = new URL(href);
  } catch {
    return undefined;
  }
  // Compare parsed origins: a string-prefix test would also accept hosts such
  // as "https://example.com.evil.net" when the origin is "https://example.com".
  if (parsed.origin !== origin) return undefined;
  // The output file path is derived from the pathname only, so URLs that
  // differ just by fragment would be saved to the same file.
  parsed.hash = "";
  return parsed.href;
};

/**
 * Cluster task body: loads one page, writes its HTML beneath `basePath`, and
 * queues every same-origin link that has not been seen yet.
 *
 * The file location mirrors the URL pathname; a pathname ending in "/" is
 * stored as "index.html" inside that directory.
 *
 * @param task - The `page` to drive and the URL to load (`data`).
 * @param basePath - Root directory the files are written beneath.
 * @param visited - URLs already queued or saved; mutated as links are queued.
 * @param cluster - Cluster that newly discovered links are queued on.
 */
const savePage = async (
  { page, data: pageUrl }: { page: Page; data: string },
  basePath: string,
  visited: Set<string>,
  cluster: Cluster
) => {
  console.log(`Loading ${pageUrl}...`);
  await page.goto(pageUrl, { waitUntil: "networkidle2" });
  const pageContent = await page.content();
  console.log(`${pageUrl} content retrieved`);

  const pageLocation = new URL(pageUrl);
  const pagePath = pageLocation.pathname || "/index.html";
  const filePath = path.join(basePath, pagePath.replace(/\/$/, "/index.html"));
  console.log(`Writing directory for ${filePath}`);
  await fsPromises.mkdir(path.dirname(filePath), { recursive: true });
  console.log(`Writing file for ${filePath}`);
  await fsPromises.writeFile(filePath, pageContent);
  console.log(`File written for ${filePath}`);

  // The callback runs in the page context. `any` is kept from the original
  // callback; a stricter element type would rely on DOM typings being
  // available to the compiler — TODO confirm before tightening.
  const links = await page.$$eval("a", (anchors) =>
    anchors.map((anchor: any) => anchor.href)
  );
  if (!visited.size) console.log(`Visiting ${JSON.stringify(links, null, 3)}`);

  const origin = pageLocation.origin;
  for (const link of links) {
    const target = toSameOriginUrl(link, origin);
    if (target === undefined || visited.has(target)) continue;
    // Mark before queueing so concurrently running tasks do not queue it again.
    visited.add(target);
    await cluster.queue(target);
  }
};
/**
 * Rebuilds the set of already-saved URLs from the files found under
 * `basePath`, assuming the directory layout mirrors the URL path.
 *
 * Each file contributes the URL of its own path. A file named "index.html"
 * additionally contributes the URL of its directory with a trailing slash,
 * because savePage stores a pathname ending in "/" as ".../index.html".
 *
 * @param basePath - Root directory of previously saved pages. It may be
 *   relative (including a "./" prefix) and may not exist yet.
 * @param baseUrl - Any URL on the crawled site; only its origin is used.
 * @returns The URLs that correspond to files on disk; empty when `basePath`
 *   does not exist.
 * @throws If `baseUrl` is not a valid URL, or on any filesystem error other
 *   than `basePath` itself being absent.
 */
const initializeVisitedUrls = (
  basePath: string,
  baseUrl: string
): Set<string> => {
  const INDEX_FILE = "index.html";
  const visited = new Set<string>();
  const baseDomain = new URL(baseUrl).origin;

  const isMissingEntry = (error: unknown): boolean =>
    typeof error === "object" &&
    error !== null &&
    "code" in error &&
    error.code === "ENOENT";

  // Only the root directory is allowed to be absent (first run); a
  // subdirectory that fails to list is still reported as an error.
  const listEntries = (directory: string, tolerateMissing: boolean) => {
    try {
      return readdirSync(directory, { withFileTypes: true });
    } catch (error: unknown) {
      if (tolerateMissing && isMissingEntry(error)) return [];
      throw error;
    }
  };

  const addFiles = (directory: string, isRoot: boolean): void => {
    for (const dirent of listEntries(directory, isRoot)) {
      const fullPath = path.join(directory, dirent.name);
      if (dirent.isDirectory()) {
        addFiles(fullPath, false);
        continue;
      }
      // path.relative is used instead of slicing by basePath.length because
      // path.join normalises the prefix (e.g. "./output" becomes "output"),
      // which makes the two lengths disagree.
      const urlPath = `/${path
        .relative(basePath, fullPath)
        .split(path.sep)
        .join("/")}`;
      visited.add(new URL(urlPath, baseDomain).href);
      if (dirent.name === INDEX_FILE) {
        const directoryUrlPath = urlPath.slice(0, -INDEX_FILE.length);
        visited.add(new URL(directoryUrlPath, baseDomain).href);
      }
    }
  };

  addFiles(basePath, true);
  return visited;
};
/**
 * Crawls a site starting at `startUrl` and saves every reachable same-origin
 * page beneath `basePath`.
 *
 * Files already present under `basePath` are treated as visited, so their
 * URLs are not queued again when other pages link to them.
 *
 * @param startUrl - First URL to load; it is always queued.
 * @param basePath - Output directory; created if it does not exist.
 * @returns A promise that settles once the queue is drained and the cluster
 *   has been closed.
 */
const crawlWebsite = async (startUrl: string, basePath: string) => {
  // Prepare and scan the output directory before launching any browser, so a
  // filesystem or URL error here cannot leave browser processes behind.
  await fsPromises.mkdir(basePath, { recursive: true });
  const visitedUrls = initializeVisitedUrls(basePath, startUrl);

  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_BROWSER,
    maxConcurrency: MAX_CONCURRENCY,
  });
  try {
    // Report per-page failures instead of letting them pass unnoticed.
    cluster.on("taskerror", (error: unknown, data: unknown) => {
      console.error(`Error crawling ${String(data)}:`, error);
    });

    // Event handler to be called every time cluster executes a task
    await cluster.task(async ({ page, data: pageUrl }) => {
      await savePage({ page, data: pageUrl }, basePath, visitedUrls, cluster);
    });

    // Record the start URL both as given and in normalised form (e.g. with the
    // trailing slash a browser adds) so a link back to it is not queued again.
    visitedUrls.add(startUrl);
    visitedUrls.add(new URL(startUrl).href);
    await cluster.queue(startUrl);
    await cluster.idle();
  } finally {
    await cluster.close();
  }
};
// Script entry point: crawl the configured site into the local output folder.
const site = "https://www.wikipedia.org/"; // Target URL — replace with the site to mirror
const outputPath = "./output"; // Directory that receives the saved pages
crawlWebsite(site, outputPath).catch((reason: unknown) => {
  console.error(reason);
});
{
"compilerOptions": {
"esModuleInterop": true,
"skipLibCheck": true,
"target": "es2022",
"allowJs": true,
"resolveJsonModule": true,
"moduleDetection": "force",
"isolatedModules": true,
"strict": true,
"noUncheckedIndexedAccess": true,
"moduleResolution": "NodeNext",
"module": "NodeNext",
"outDir": "dist",
"sourceMap": true,
"lib": ["es2022"]
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment