Skip to content

Instantly share code, notes, and snippets.

@cvan
Last active April 8, 2019 10:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cvan/da899090fa6c38f87dacbef95ea5d785 to your computer and use it in GitHub Desktop.
Save cvan/da899090fa6c38f87dacbef95ea5d785 to your computer and use it in GitHub Desktop.
download URLs to disk (without any third-party npm dependencies)
// Source: https://gist.github.com/cvan/da899090fa6c38f87dacbef95ea5d785#file-downloader-js
// Author: CVAN <https://cvan.io>
const fs = require('fs');
const http = require('http');
const https = require('https');
const path = require('path');
const URL = require('url');
// Namespace for the filename helpers that are exported alongside the Downloader.
const utils = {};

/**
 * Builds a filename-friendly timestamp from the current time.
 * @returns {string} The first 23 characters of the ISO-8601 timestamp (the trailing `Z` is cut off),
 *   with every `:` swapped for `.`.
 */
utils.getSafeTimestamp = () => {
  const isoWithoutZone = new Date().toISOString().substring(0, 23);
  return isoWithoutZone.split(':').join('.');
};

/**
 * Inserts `str` right before the last `.` of `filename`, or appends it when there is no `.`.
 * @param {string} filename
 * @param {string} str
 * @returns {string}
 */
utils.prependToFileExtname = (filename, str) => {
  const dotAt = filename.lastIndexOf('.');
  const hasDot = dotAt !== -1;
  const stem = hasDot ? filename.slice(0, dotAt) : filename;
  const extension = hasDot ? filename.slice(dotAt) : '';
  return `${stem}${str}${extension}`;
};

/**
 * Adds the current timestamp (prefixed by `opts.replacement`) in front of the file extension.
 * @param {string} safeFilename
 * @param {{replacement: (string|undefined)}} [opts] A missing/falsy `replacement` means no separator.
 * @returns {string}
 */
utils.getSafeFilenameWithTimestamp = (safeFilename, opts = {replacement: '__'}) => {
  const separator = opts.replacement || '';
  const suffix = `${separator}${utils.getSafeTimestamp()}`;
  return utils.prependToFileExtname(safeFilename, suffix);
};

// Default formatter; replaced below when the optional local `filenamify.js` module can be loaded.
utils.getSafeFilename = utils.getSafeFilenameWithTimestamp;
try {
  utils.getSafeFilename = require('./filenamify.js');
} catch (err) {
  // Deliberately ignored: any failure to load the module keeps the timestamp-based formatter.
}
/**
 * Downloads the resource behind an HTTP(S) URL to a file on disk using only Node.js core modules.
 *
 * Constructing an instance with a URL starts the download right away; the resulting promise is
 * exposed as `this.promise`. Without a URL nothing is requested until `fetch(url)` is called.
 */
class Downloader {
  /**
   * Accepted call shapes: `(url)`, `(url, options)` and `(options)` where `options.url` is optional.
   *
   * @param {string|Object} [url] URL to download, or an options object.
   * @param {Object} [options] Settings overriding `this.defaults`:
   *   `followRedirect`, `maxRedirects`, `getFormattedFilename`, `defaultProtocol`, `directory`,
   *   `timeout` and `userAgent` (several spellings such as `user-agent` are accepted).
   */
  constructor (url, options) {
    // Number of redirects followed by the current `fetch()` call; reset whenever `fetch()` starts.
    this.counter = 0;

    // Merge the caller's settings; an explicit `url` argument wins over `options.url`.
    this.options = Object.assign({}, options);
    if (url !== null && typeof url === 'object') {
      Object.assign(this.options, url);
    } else if (url !== null && typeof url !== 'undefined') {
      this.options.url = url;
    }

    this.defaults = {
      followRedirect: true,
      maxRedirects: 10, // Give up after following this many consecutive redirects.
      getFormattedFilename: utils.getSafeFilename,
      defaultProtocol: 'https', // Default protocol to use when URL passed does not start with a protocol.
      directory: __dirname, // Directory the downloaded file is written to.
      timeout: 3000, // Give up on request after (in milliseconds). Default: 3000ms (three seconds).
      userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.39 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    };

    // Accept common alternative spellings of the `userAgent` option (first match wins).
    if (!('userAgent' in this.options)) {
      const aliases = ['useragent', 'user-agent', 'User-agent', 'User-Agent', 'UserAgent'];
      const alias = aliases.find(key => key in this.options);
      if (typeof alias !== 'undefined') {
        this.options.userAgent = this.options[alias];
      }
    }

    this.options = Object.assign({}, this.defaults, this.options);
    // Normalise values such as `HTTPS:` or `https://` down to the bare scheme name.
    this.options.defaultProtocol = String(this.options.defaultProtocol).toLowerCase().replace(/:.*/, '');

    if (typeof this.options.url !== 'undefined' && this.options.url !== null) {
      this.url = this.options.url;
      // Kept on the instance so callers can await the download that the constructor started.
      this.promise = this.fetch();
    }
  }

  /** @returns {string|undefined} The normalised URL that will be (or was last) requested. */
  get url () {
    return this._url;
  }

  /**
   * Normalises `url` (adding the default protocol when none is given) and derives `urlObj`
   * and `requestOptions` from it.
   * @param {string} url
   * @throws {TypeError} When `url` is not a string.
   */
  set url (url) {
    if (typeof url !== 'string') {
      throw new TypeError('Expected `url` to be a string');
    }
    let normalized = url;
    if (normalized.startsWith('//')) {
      // Protocol-relative URL: it has no protocol either, so the configured default applies.
      normalized = `${this.options.defaultProtocol}:${normalized}`;
    } else if (!/^https?:\/\//i.test(normalized)) {
      normalized = `${this.options.defaultProtocol}://${normalized}`;
    }
    this._url = normalized;
    this.urlObj = URL.parse(normalized);
    this.requestOptions = {
      method: 'GET',
      hostname: this.urlObj.hostname,
      path: this.urlObj.path || '/',
      headers: {
        'User-Agent': this.options.userAgent
      },
      timeout: this.options.timeout
    };
    // Without this an explicit port (e.g. `localhost:8080`) would silently fall back to 80/443.
    if (this.urlObj.port) {
      this.requestOptions.port = Number(this.urlObj.port);
    }
  }

  /**
   * Picks the file extension for the download.
   * @param {{pathname: string, contentType: (string|undefined)}} details
   * @returns {string} `.html` when the path has no extension or the content type mentions HTML,
   *   otherwise the extension of `pathname` (including the leading dot).
   */
  getFileExtnameFromUrl ({pathname, contentType}) {
    const extname = path.extname(pathname || '');
    const isHtml = (contentType || '').toLowerCase().includes('html');
    // `path.extname()` includes the leading dot, hence the comparison against `.html`.
    if (!extname || (isHtml && extname !== '.html')) {
      return '.html';
    }
    return extname;
  }

  /**
   * Builds the filename used on disk and passes it through `options.getFormattedFilename`.
   * The URL pathname is only included when the formatter advertises (through its
   * `_willEscapeUnsafeCharacters` flag) that it escapes characters such as `/`.
   * @param {{host: string, pathname: string, contentType: (string|undefined)}} details
   * @returns {string}
   */
  getSafeFilename ({host, pathname, contentType}) {
    const formatFilename = this.options.getFormattedFilename;
    const includePathname = Boolean(formatFilename._willEscapeUnsafeCharacters) &&
      pathname !== '/' && pathname !== '/index.html';
    let filename = host;
    if (includePathname) {
      filename += pathname;
    }
    if (!includePathname || !path.extname(path.basename(filename.toLowerCase()))) {
      filename += this.getFileExtnameFromUrl({pathname, contentType});
    }
    return formatFilename(filename, {replacement: '__'});
  }

  /**
   * Downloads `url` (or the previously assigned URL) to `options.directory`.
   *
   * When this file is the program's entry point the outcome is printed and the process exits;
   * otherwise the returned promise carries the outcome.
   * @param {string} [url]
   * @returns {Promise<string>} Resolves with a human-readable success message; rejects with an
   *   `Error` on network failures, timeouts, bad redirects or non-200 responses.
   */
  fetch (url) {
    if (typeof url !== 'undefined') {
      this.url = url;
    }
    this.counter = 0;
    const isCli = require.main === module;
    return this._request().then(successMsg => {
      if (!isCli) {
        return successMsg;
      }
      console.log(successMsg);
      process.exit(0);
    }).catch(err => {
      if (isCli) {
        console.error(`Error occurred: ${err.message}`);
        process.exit(1);
      }
      throw err;
    });
  }

  /**
   * Issues one GET request for the current URL, following redirects by re-issuing itself.
   * @returns {Promise<string>} Resolves with the success message once the file is fully written.
   */
  _request () {
    const redirectStatusCodes = [301, 302, 303, 307, 308];
    return new Promise((resolve, reject) => {
      if (!this.urlObj) {
        reject(new Error('No URL to download was provided'));
        return;
      }
      // TODO: Improve handling of media assets that have compressed response bodies (e.g., gzip, Deflate, Brotli).
      // `url.parse()` reports the protocol with its trailing colon.
      const transport = this.urlObj.protocol === 'https:' ? https : http;
      const request = transport.request(this.requestOptions, res => {
        const requestedHref = this.urlObj.href;

        // Handle redirects.
        if (this.options.followRedirect && redirectStatusCodes.includes(res.statusCode)) {
          request.destroy();
          const location = res.headers.location;
          if (!location) {
            return reject(new Error('Encountered a URL redirect without a "Location" HTTP response header'));
          }
          // `Location` may be relative, so resolve it against the URL that was just requested.
          const target = URL.resolve(requestedHref, location);
          this.counter += 1;
          if (target === requestedHref || this.counter > this.options.maxRedirects) {
            return reject(new Error('Encountered an infinite URL redirect'));
          }
          this.url = target;
          // Chain the follow-up request so the caller's promise settles with its outcome.
          return resolve(this._request());
        }

        if (res.statusCode !== 200) {
          request.destroy();
          return reject(new Error(`Encountered an unexpected server response: ${res.statusCode} – ${res.statusMessage}`));
        }

        // No `setEncoding()` here: the body must reach the file as raw bytes, otherwise binary
        // downloads (images, archives, …) get mangled by the text decoder.
        res.on('error', err => reject(err));
        const filename = this.getSafeFilename({
          host: this.urlObj.hostname,
          pathname: this.urlObj.pathname || '/',
          contentType: res.headers['content-type']
        });
        const filenameAbsolute = path.join(this.options.directory, filename);
        const destStream = fs.createWriteStream(filenameAbsolute);
        destStream.on('finish', () => resolve(`Downloaded "${requestedHref}" to ${filenameAbsolute}`));
        destStream.on('error', err => reject(err));
        res.pipe(destStream);
      });
      request.on('timeout', () => {
        reject(new Error('Network timeout'));
        request.destroy();
      });
      // DNS failures, refused connections, etc. are reported here; unhandled they crash the process.
      request.on('error', err => reject(err));
      request.end();
    });
  }
}
// When this file is run directly (rather than `require`d), download the URL given as the first
// command-line argument, falling back to the `DOWNLOADER_URL` environment variable.
if (require.main === module) {
  const cliUrl = process.argv[2] || process.env.DOWNLOADER_URL;
  if (cliUrl) {
    new Downloader(cliUrl);
  } else {
    // Without a URL there is nothing to download; say so instead of exiting silently.
    console.error(`Usage: node ${path.basename(__filename)} <url> (or set the DOWNLOADER_URL environment variable)`);
    process.exitCode = 1;
  }
}
module.exports.Downloader = Downloader;
module.exports.utils = utils;
// Source: Adapted from https://github.com/sindresorhus/filenamify/blob/master/index.js
/**
 * Returns a regex matching the characters treated as reserved in filenames
 * (`<>:"/\|?*` plus the control characters 0x00-0x1F).
 * A new RegExp object is created on every call, so callers never share `lastIndex` state.
 */
const filenameReservedRegex = () => (/[<>:"\/\\|?*\x00-\x1F]/g);
// Matches (case-insensitively) a whole string that is one of the listed device names:
// con, prn, aux, nul, com0-com9, lpt0-lpt9.
filenameReservedRegex.windowsNames = () => (/^(con|prn|aux|nul|com[0-9]|lpt[0-9])$/i);
// Upper bound applied to generated filenames; anything longer is cut off.
const MAX_FILENAME_LENGTH = 100;
// Characters that carry a special meaning inside a regular expression.
const matchOperatorsRe = /[|\\{}()[\]^$+*?.]/g;
// C0 and C1 control characters.
const reControlChars = /[\u0000-\u001f\u0080-\u009f]/g; // eslint-disable-line no-control-regex
// One or more leading dots (e.g. `.` / `..` path prefixes).
const reRelativePath = /^\.+/;
/**
 * Collapses every run of two or more consecutive occurrences of `target` in `str`
 * into a single occurrence.
 * @param {string} str Text to clean up.
 * @param {string} target Substring whose repetitions are collapsed.
 * @returns {string}
 * @throws {TypeError} When either argument is not a string.
 */
function trimRepeated (str, target) {
  if (typeof str !== 'string' || typeof target !== 'string') {
    throw new TypeError('Expected a string');
  }
  const reRepeated = new RegExp(`(?:${escapeStringRegexp(target)}){2,}`, 'g');
  // A function replacer inserts `target` verbatim; a plain string would have its `$` sequences
  // interpreted as replacement patterns (e.g. `$$` would collapse to a single `$`).
  return str.replace(reRepeated, () => target);
}
/**
 * Escapes the regular-expression operators in `str` so it can be embedded in a RegExp source
 * and matched literally.
 * @param {string} str
 * @returns {string} `str` with every character matched by `matchOperatorsRe` backslash-escaped.
 * @throws {TypeError} When `str` is not a string.
 */
function escapeStringRegexp (str) {
  if (typeof str !== 'string') {
    throw new TypeError('Expected a string');
  }
  // `$&` re-inserts the matched operator right after the added backslash.
  return str.replace(matchOperatorsRe, '\\$&');
}
/**
 * Removes `substring` from the very start and from the very end of `input`, when present there.
 * @param {string} input
 * @param {string} substring
 * @returns {string}
 * @throws {TypeError} When either argument is not a string.
 */
function stripOuter (input, substring) {
  const bothAreStrings = typeof input === 'string' && typeof substring === 'string';
  if (!bothAreStrings) {
    throw new TypeError('Expected a string');
  }
  const escaped = escapeStringRegexp(substring);
  // Either alternative may match: the substring anchored at the start, or anchored at the end.
  const reOuter = new RegExp('^' + escaped + '|' + escaped + '$', 'g');
  return input.replace(reOuter, '');
}
/**
 * Converts an arbitrary string into a filename-friendly one: reserved characters, control
 * characters and leading dots are replaced, repeated/outer replacements are tidied up, reserved
 * Windows device names get the replacement appended, and the result is capped at
 * `MAX_FILENAME_LENGTH` characters.
 * @param {string} string Text to convert.
 * @param {{replacement: (string|undefined)}} [options] `replacement` defaults to `'!'`.
 * @returns {string}
 * @throws {TypeError} When `string` is not a string.
 * @throws {Error} When the replacement itself contains a reserved or control character.
 */
function filenamify (string, options = {}) {
  if (typeof string !== 'string') {
    throw new TypeError('Expected a string');
  }
  const replacement = options.replacement === undefined ? '!' : options.replacement;
  // Either kind of forbidden character disqualifies the replacement. `search()` is used for the
  // shared global regex because, unlike `test()`, it neither reads nor advances `lastIndex`.
  const hasReservedChar = filenameReservedRegex().test(replacement);
  const hasControlChar = String(replacement).search(reControlChars) !== -1;
  if (hasReservedChar || hasControlChar) {
    throw new Error('Replacement string cannot contain reserved filename characters');
  }
  // Function replacers insert the replacement verbatim (no `$` pattern interpretation).
  const insertReplacement = () => replacement;
  let result = string.replace(filenameReservedRegex(), insertReplacement);
  result = result.replace(reControlChars, insertReplacement);
  result = result.replace(reRelativePath, insertReplacement);
  if (replacement.length > 0) {
    result = trimRepeated(result, replacement);
    result = result.length > 1 ? stripOuter(result, replacement) : result;
  }
  if (filenameReservedRegex.windowsNames().test(result)) {
    result += replacement;
  }
  return result.slice(0, MAX_FILENAME_LENGTH);
}
// Marker flag: the Downloader's `getSafeFilename` only includes the URL pathname in the filename
// when the configured formatter carries this truthy property.
filenamify._willEscapeUnsafeCharacters = true;
// Export the function itself, so `require('./filenamify.js')` evaluates to `filenamify`.
module.exports = filenamify;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment