Skip to content

Instantly share code, notes, and snippets.

@TimvanScherpenzeel
Last active June 14, 2023 12:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TimvanScherpenzeel/1eed0682d68bf126801b1aeb3895c15d to your computer and use it in GitHub Desktop.
Save TimvanScherpenzeel/1eed0682d68bf126801b1aeb3895c15d to your computer and use it in GitHub Desktop.
Website scraping using Puppeteer and Node.js
// Native
const fs = require('fs');
const path = require('path');
// Vendor
const { ArgumentParser } = require('argparse');
const fetch = require('node-fetch');
const mkdirp = require('mkdirp');
const puppeteer = require('puppeteer');
// Argument parser
/**
 * Build the command line parser and parse the process arguments.
 *
 * Recognised options:
 *   -i / --input   (required) webpage that should be saved
 *   -o / --output  (optional) directory to save into, defaults to 'output'
 *
 * @returns {object} Whatever `ArgumentParser#parseArgs()` returns for the
 *   options registered below.
 */
const createParserArguments = () => {
  const argumentParser = new ArgumentParser({ addHelp: true });

  const inputOptions = {
    help: 'Webpage you would like to save',
    required: true,
  };
  const outputOptions = {
    help: 'Output directory you would like to save to',
    defaultValue: 'output',
    required: false,
  };

  // Registration order is kept: input first, then output.
  argumentParser.addArgument(['-i', '--input'], inputOptions);
  argumentParser.addArgument(['-o', '--output'], outputOptions);

  return argumentParser.parseArgs();
};
const args = createParserArguments();

// Scraper

/**
 * Create `directory`, including missing parents, via the callback-style
 * mkdirp API and expose the outcome as a Promise.
 *
 * @param {string} directory - Directory to create.
 * @returns {Promise<void>} Resolves once mkdirp reports success, rejects with
 *   the error mkdirp passes to its callback.
 */
const ensureDirectory = directory =>
  new Promise((resolve, reject) => {
    mkdirp(directory, error => (error ? reject(error) : resolve()));
  });

/**
 * Download `url` and store the response body at `target`.
 *
 * Failures are logged and swallowed on purpose: saving is best effort, so one
 * asset that cannot be fetched or written must not abort the whole run.
 *
 * @param {string} url - Resource to download.
 * @param {string} directory - Directory that must exist before writing.
 * @param {string} target - File the downloaded bytes are written to.
 * @returns {Promise<void>} Always resolves (errors are reported via console.error).
 */
const saveUrlToFile = (url, directory, target) =>
  ensureDirectory(directory)
    .then(() => fetch(url))
    .then(response => response.buffer())
    .then(
      buffer =>
        new Promise((resolve, reject) => {
          fs.writeFile(target, buffer, error => (error ? reject(error) : resolve()));
        })
    )
    .then(() => {
      // Only report success once the file really is on disk.
      console.log(`Wrote to ${target}`);
    })
    .catch(error => {
      console.error(error);
    });

(async () => {
  const browser = await puppeteer.launch({
    headless: false,
  });

  // Every download started during the run; awaited before the browser closes.
  const pendingWrites = [];

  try {
    const page = await browser.newPage();
    await page.setRequestInterception(true);

    // Write base index.html
    pendingWrites.push(saveUrlToFile(args.input, `${args.output}`, `${args.output}/index.html`));

    // Write all assets to their correct folders
    page.on('request', interceptedRequest => {
      try {
        const request = interceptedRequest.url();
        const requestDirectory = path.parse(decodeURI(request)).dir;

        // construct path from after input url
        // http://experience.example.com/img/ to /img/
        const pathname = requestDirectory.split(args.input)[1];

        // strip versioning from files because they don't work in a filesystem
        // main.min.js?v=1481729814779 to main.min.js
        const filename = path.parse(request).base.split('?')[0];

        // Only handle root files or files in the website file system (so available on the domain)
        const isRootFile = args.input.replace(/\/$/, '') === requestDirectory;

        if (pathname !== undefined || isRootFile) {
          // Files in the root appear as undefined but should be handled
          const decodedPath = pathname ? pathname : '';
          const directory = `${args.output}/${decodedPath}`;

          pendingWrites.push(saveUrlToFile(request, directory, `${directory}/${filename}`));
        }
      } catch (error) {
        // decodeURI throws on malformed escape sequences; skip saving that asset.
        console.error(error);
      } finally {
        // An intercepted request that is never continued stalls the page, so
        // continue it no matter what happened above.
        interceptedRequest.continue();
      }
    });

    // Let goto itself wait for the network to go idle. A separate
    // waitForNavigation() issued after goto has resolved has no navigation left
    // to observe and would only end with a timeout error.
    await page.goto(args.input, { waitUntil: 'networkidle0' });

    await Promise.all(pendingWrites);
  } finally {
    // Close the browser even when navigation or setup failed.
    await browser.close();
  }
})().catch(error => {
  console.error(error);
  process.exitCode = 1;
});
{
"name": "scraper",
"version": "0.0.1",
"description": "",
"main": "index.js",
"author": "Tim van Scherpenzeel",
"license": "MIT",
"dependencies": {
"argparse": "^1.0.10",
"fs-extra": "^6.0.1",
"mkdirp": "^0.5.1",
"node-fetch": "^2.1.2",
"puppeteer": "^1.4.0"
}
}
@elarias1975
Copy link

Excellent!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment