Skip to content

Instantly share code, notes, and snippets.

@cflems
Created August 11, 2017 18:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cflems/074a28c6d6adbd9ff49dcd58a041a7d9 to your computer and use it in GitHub Desktop.
Save cflems/074a28c6d6adbd9ff49dcd58a041a7d9 to your computer and use it in GitHub Desktop.
Obtain a local copy of any static website. Background images and some references may not work properly.
const http = require('http');
const https = require('https');
const fs = require('fs');
const path = require('path');
const url = require('url');
const exec = require('child_process').execSync;
const jq = require('jquery');
const jsdom = require('jsdom');
// Root directory for downloaded files: a "scrape" folder beside this script.
const SAVE_DIR = `${path.dirname(__filename)}/scrape`;
// Do-nothing function, handed to calls that accept an extra argument.
const nop = () => {};
// Visited set: normalized hrefs that have already been scheduled, keyed to true.
let map = {};
/**
 * Remove a leading "www." label from a bare host or from the host part of an
 * absolute URL.
 *
 * Only a "www." that begins the host is removed. Hosts such as
 * "awww.example.com", and paths or query strings that merely contain "www.",
 * are returned unchanged.
 *
 * @param {string} str - A host ("www.example.com[:port]") or an absolute URL.
 * @returns {string} The input without the leading "www." host label.
 */
function strip_www (str) {
  // Group 1 preserves an optional "scheme://" and optional "user:pass@" prefix,
  // so the match is anchored to the very start of the host.
  return str.replace(/^((?:[a-z][a-z0-9+.-]*:\/\/)?(?:[^\/?#@]*@)?)www\./, '$1');
}
/**
 * Resolve a (possibly relative) link and break it into the pieces the crawler
 * needs. The returned object is also usable as options for http(s).get().
 *
 * @param {string} href - Absolute or relative URL.
 * @param {string} [base_href] - Base URL used to resolve a relative `href`.
 * @returns {{host: string, hostname: string, path: string, protocol: string,
 *            port: number, href: string}}
 *   `host` may carry ":port" and is meant for same-site comparison;
 *   `hostname` never carries a port and is what the HTTP client should connect
 *   to; `href` has its fragment removed so it can serve as a de-duplication key.
 * @throws {TypeError} If the URL cannot be parsed.
 */
function piecewise_url (href, base_href) {
  const myurl = new url.URL(href, base_href);
  // "#fragment" never reaches the server; dropping it keeps "page#a" and
  // "page#b" from being treated as two different documents.
  myurl.hash = '';
  const default_port = myurl.protocol.toLowerCase() === 'https:' ? 443 : 80;
  return {
    host: strip_www(myurl.host),
    // http.request() prefers `hostname` over `host` when both are present,
    // so a ":port" suffix inside `host` cannot break the DNS lookup.
    hostname: strip_www(myurl.hostname),
    path: myurl.pathname ? myurl.pathname : '/',
    protocol: myurl.protocol,
    // URL#port is '' when the scheme default is in use; honor explicit ports.
    port: myurl.port ? Number(myurl.port) : default_port,
    href: strip_www(myurl.href),
  };
}
/**
 * Map a URL path onto a file path that can be written to disk.
 *
 * Paths ending in "/" and paths whose last segment contains no "." are
 * treated as directories and get an "index.html" inside them; anything else
 * is returned as-is.
 *
 * @param {string} fname - URL path such as "/", "/about" or "/css/site.css".
 * @returns {string} Path that names a file rather than a directory.
 */
function safe_download_path (fname) {
  const INDEX_FILE = 'index.html';
  if (fname.endsWith('/')) {
    return fname + INDEX_FILE;
  }
  const looks_like_file = path.basename(fname).includes('.');
  return looks_like_file ? fname : fname + '/' + INDEX_FILE;
}
/**
 * Download one URL into SAVE_DIR and, when the response is HTML, crawl every
 * same-host link found in its `href` / `src` attributes.
 *
 * Failures (directory creation, file writing, network, unparsable links) are
 * logged and skipped so that one bad resource does not abort the whole crawl.
 *
 * @param {{host: string, path: string, protocol: string, port: number,
 *          href: string}} url_pts - URL pieces as produced by piecewise_url().
 * @returns {void}
 */
function schedule_download (url_pts) {
  const dl_module = (url_pts.protocol.toLowerCase() === 'https:') ? https : http;
  // Build the request options on a copy so the caller's object is not mutated.
  const http_opts = Object.assign({}, url_pts, {
    headers: {
      'User-Agent': 'Mozilla/5.0',
    },
  });
  const safe_dl_path = SAVE_DIR + safe_download_path(url_pts.path);
  try {
    // The path comes from remote pages, so it must never be handed to a shell
    // (`mkdir -p <path>` would execute things like "$(cmd)" or "; cmd").
    // Note: the `recursive` option needs Node.js 10.12 or newer.
    fs.mkdirSync(path.dirname(safe_dl_path), { recursive: true });
  } catch (e) {
    console.error(e);
    return;
  }
  const dl_fp = fs.createWriteStream(safe_dl_path, { flags: 'w', mode: 0o644 });
  dl_fp.on('error', function (e) {
    console.error(e);
  });
  // Start the request only once the output file is known to be writable.
  dl_fp.on('open', function () {
    const req = dl_module.get(http_opts, (res) => {
      console.log(url_pts.path+' STATUS: '+res.statusCode);
      // Servers may omit Content-Type; treat that as "not HTML" instead of
      // throwing on undefined.
      const content_type = res.headers['content-type'] || '';
      const parseFlag = content_type.toLowerCase().startsWith('text/html');
      res.on('error', (e) => {
        console.error(url_pts.href+' response failed: '+e.message);
        dl_fp.end();
      });
      res.pipe(dl_fp);
      if (!parseFlag) return;
      // Keep a copy of the body in memory so it can be parsed for links.
      const chunks = [];
      res.on('data', (chunk) => {
        chunks.push(chunk);
      });
      res.on('end', () => {
        const html = Buffer.concat(chunks).toString();
        const $ = jq((new jsdom.JSDOM(html, { url: url_pts.href })).window);
        $.each($('[href],[src]'), function (idx, me) {
          const $me = $(me);
          const link = $me.attr('href') || $me.attr('src');
          if (!link) return;
          let child_pts;
          try {
            child_pts = piecewise_url(link, url_pts.href);
          } catch (e) {
            // A malformed link on the page must not stop the crawl.
            console.error('Skipping unparsable link '+link+': '+e.message);
            return;
          }
          if (child_pts.host === url_pts.host) spider(child_pts.href);
        });
      });
    });
    // Without this handler a DNS or connection failure is an unhandled
    // 'error' event and terminates the process.
    req.on('error', (e) => {
      console.error(url_pts.href+' request failed: '+e.message);
      dl_fp.end();
    });
  });
}
/**
 * Crawl entry point for a single URL: normalize it, and if it has not been
 * seen before, record it in the visited map, log it, and schedule its download.
 *
 * @param {string} href - Absolute URL to visit.
 * @returns {void}
 */
function spider (href) {
  const target = piecewise_url(href);
  const key = target.href;
  const already_seen = key in map;
  if (!already_seen) {
    map[key] = true;
    console.log(key);
    schedule_download(target);
  }
}
// Entry point: the first command-line argument is the absolute URL to start
// crawling from. Without it, URL parsing would fail with an opaque
// "Invalid URL" error, so print a usage hint and exit non-zero instead.
if (process.argv.length < 3) {
  console.error('Usage: node <script> <start-url>');
  process.exitCode = 1;
} else {
  spider(process.argv[2]);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment