Created
August 11, 2017 18:59
-
-
Save cflems/074a28c6d6adbd9ff49dcd58a041a7d9 to your computer and use it in GitHub Desktop.
Obtain a local copy of any static website. Background images and some references may not work properly.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const http = require('http'); | |
const https = require('https'); | |
const fs = require('fs'); | |
const path = require('path'); | |
const url = require('url'); | |
const exec = require('child_process').execSync; | |
const jq = require('jquery'); | |
const jsdom = require('jsdom'); | |
// Root folder for everything that gets downloaded: a "scrape" directory
// located next to this script.
const SAVE_DIR = `${__dirname}/scrape`;

// Shared do-nothing function.
const nop = () => {};

// Registry of hrefs that spider() has already accepted, keyed by href.
let map = {};
/**
 * Remove a leading "www." label from a bare host name or from the host part
 * of an absolute URL, so "www.example.com" and "example.com" compare equal.
 *
 * Only a "www." that starts the host is removed. A "www." appearing later
 * (inside another label such as "awww.example.com", or in the path or query
 * string) is left untouched.
 *
 * @param {string} str - Bare host ("www.example.com") or absolute URL
 *   ("https://www.example.com/a").
 * @returns {string} The input without its leading "www." host label, or the
 *   input unchanged when there is none.
 */
function strip_www (str) {
  // The optional "scheme://" prefix is captured and written back via $1.
  // Anchoring at the start keeps the match from reaching into the path.
  return str.replace(/^([a-z][a-z0-9+.-]*:\/\/)?www\./i, '$1');
}
/**
 * Resolve `href` (optionally relative to `base_href`) and split it into the
 * pieces the crawler works with. The returned object is also used directly
 * as the options object for `http.get` / `https.get`.
 *
 * @param {string} href - Absolute or relative URL.
 * @param {string} [base_href] - Base URL used when `href` is relative.
 * @returns {{host: string, hostname: string, path: string, protocol: string,
 *   port: number, href: string}}
 *   - `host`: host without a leading "www."; includes ":port" when the URL
 *     carries a non-default port. Used for same-site comparison.
 *   - `hostname`: like `host` but never includes the port. This is what the
 *     HTTP client should connect to.
 *   - `path`: the URL pathname only (the query string is not included).
 *   - `protocol`: scheme including the trailing colon, e.g. "https:".
 *   - `port`: explicit port from the URL, else 443 for https and 80 otherwise.
 *   - `href`: full URL without fragment and without a leading "www.".
 * @throws {TypeError} If `href` cannot be parsed as a URL.
 */
function piecewise_url (href, base_href) {
  const myurl = new url.URL(href, base_href);
  // The fragment is never sent to the server. Dropping it keeps "page#a" and
  // "page#b" from being treated as two different pages.
  myurl.hash = '';
  const default_port = myurl.protocol.toLowerCase() == 'https:' ? 443 : 80;
  return {
    host: strip_www(myurl.host),
    // `host` may contain ":port", which is not a resolvable name; the HTTP
    // client prefers `hostname` over `host` when both are present.
    hostname: strip_www(myurl.hostname),
    path: myurl.pathname ? myurl.pathname : '/',
    protocol: myurl.protocol,
    // URL#port is '' when the port is the scheme default or absent.
    port: myurl.port ? Number(myurl.port) : default_port,
    href: strip_www(myurl.href),
  };
}
/**
 * Turn a URL path into a path that can be written as a regular file.
 *
 * - A path ending in "/" gets "index.html" appended.
 * - A path whose last segment contains no "." gets "/index.html" appended.
 * - Any other path is returned as-is.
 *
 * @param {string} fname - URL path, e.g. "/docs/" or "/css/site.css".
 * @returns {string} Path that names a file rather than a directory.
 */
function safe_download_path (fname) {
  // Directory-style path: store the document as that directory's index file.
  if (fname.endsWith('/')) {
    return `${fname}index.html`;
  }

  // A dot in the last segment is taken to mean it already names a file.
  const leaf = path.basename(fname);
  if (leaf.includes('.')) {
    return fname;
  }

  return `${fname}/index.html`;
}
/**
 * Download one URL into SAVE_DIR and, if the response is HTML, hand every
 * same-host link found in it to spider().
 *
 * The target file path is SAVE_DIR plus safe_download_path(url_pts.path);
 * missing parent directories are created first. The response body is piped
 * to that file regardless of status code. Failures (directory creation, file
 * write, network, unparsable links) are logged to stderr and do not throw.
 *
 * @param {{protocol: string, path: string, host: string, href: string}} url_pts
 *   URL pieces as produced by piecewise_url(). The object is also used as the
 *   base for the HTTP request options; it is not modified.
 * @returns {void}
 */
function schedule_download (url_pts) {
  const dl_module = (url_pts.protocol.toLowerCase() == 'https:') ? https : http;

  // Copy rather than alias, so the caller's object does not gain `headers`.
  const http_opts = Object.assign({}, url_pts, {
    headers: {
      'User-Agent': 'Mozilla/5.0',
    },
  });

  const safe_dl_path = SAVE_DIR+safe_download_path(url_pts.path);

  // The path comes from a remote page, so it must never reach a shell.
  // fs.mkdirSync with `recursive` is the in-process equivalent of `mkdir -p`.
  try {
    fs.mkdirSync(path.dirname(safe_dl_path), { recursive: true });
  } catch (e) {
    console.error(e);
    return;
  }

  const dl_fp = fs.createWriteStream(safe_dl_path, {flags: 'w', mode: 0o644});
  dl_fp.on('error', function (e) {
    console.error(e);
  });

  // Start the request only once the output file is actually open.
  dl_fp.on('open', function () {
    const req = dl_module.get(http_opts, (res) => {
      console.log(url_pts.path+' STATUS: '+res.statusCode);

      // Content-Type is optional; treat a missing header as "not HTML".
      const content_type = res.headers['content-type'] || '';
      const parseFlag = content_type.toLowerCase().startsWith('text/html');

      res.pipe(dl_fp);
      if (!parseFlag) return;

      // HTML is additionally buffered in memory so its links can be extracted.
      const chunks = [];
      res.on('data', (chunk) => {
        chunks.push(chunk);
      });
      res.on('end', () => {
        const html = Buffer.concat(chunks).toString();
        const $ = jq((new jsdom.JSDOM(html, { url: url_pts.href })).window);
        $.each($('[href],[src]'), function (idx, me) {
          const $me = $(me);
          const link = $me.attr('href') || $me.attr('src');
          if (!link) return;

          // One malformed link must not abort the crawl of the whole page.
          let child_pts;
          try {
            child_pts = piecewise_url(link, url_pts.href);
          } catch (e) {
            console.error(e);
            return;
          }
          if (child_pts.host == url_pts.host) spider(child_pts.href);
        });
      });
    });

    // Without a listener, a DNS/connection error would be thrown as an
    // unhandled 'error' event and terminate the process.
    req.on('error', (e) => {
      console.error(e);
      dl_fp.end();
    });
  });
}
/**
 * Queue a URL for download unless it has been seen before.
 *
 * The href is normalized through piecewise_url(); the normalized href is the
 * key recorded in `map`. A first-time href is logged to stdout and passed to
 * schedule_download().
 *
 * @param {string} href - Absolute URL to crawl.
 * @returns {void}
 */
function spider (href) {
  const parts = piecewise_url(href);
  const already_seen = parts.href in map;

  if (!already_seen) {
    map[parts.href] = true;
    console.log(parts.href);
    schedule_download(parts);
  }
}
// Start the crawl at the URL given as the first command-line argument.
// Without one, print a usage line instead of failing inside the URL parser.
if (process.argv[2]) {
  spider(process.argv[2]);
} else {
  console.error('Usage: node '+process.argv[1]+' <url>');
  process.exitCode = 1;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment