Skip to content

Instantly share code, notes, and snippets.

@iexa
Last active April 25, 2020 16:26
Show Gist options
  • Save iexa/5ec548fe85d902ab653fafd11a254fbd to your computer and use it in GitHub Desktop.
Save iexa/5ec548fe85d902ab653fafd11a254fbd to your computer and use it in GitHub Desktop.
mass files downloader using node.js + json file. {also for es6 concepts}
// dl.js script, dl.json is {'folder_name1': [url1, url2, url3, ...], 'folder_name2': [...], ...}
//
// 1st "major" mod - now uses async "threads" to download several files at once. This m.o.
// (downloading in fixed-size batches) is not the best way to do it, but it saves some time
// and does not overwhelm servers
//
// examples for data scraping :D
// a = document.querySelectorAll('ul#list>li>a.folder')
// JSON.stringify(Array.prototype.map.call(a, i => decodeURI(i.href.split('/').reverse()[1])))
//
// ... and files from inside folders:
// a = document.querySelectorAll('ul#list>li[class~="file"]>a')
// JSON.stringify(Array.prototype.map.call(a, i => i.href.split('?a')[0]))
const fs = require('fs')
const path = require('path')
const qs = require('querystring')
const ht = require('https')
const data = require('./dl.json') // JSON.parse( js.readFileSync('dl.json') )
// current epoch time as secs.tenths [xxx.y], i.e. Date.now() in seconds
// rounded to one decimal place
const get_time_frsec = () => {
  const epoch_secs = Date.now() / 1000
  return roundfr(epoch_secs)
}
// round a number to one decimal place (tenths)
const roundfr = x => {
  const tenths = Math.round(x * 10)
  return tenths / 10
}
// percent-decode the url first, then keep only what follows the last '/'
const get_filename = x => {
  const segments = qs.unescape(x).split('/')
  return segments[segments.length - 1]
}
/**
 * Download `url` over https and stream the response body into the file `name`.
 *
 * @param {string} url - https url to fetch
 * @param {string} name - destination file path (created or truncated)
 * @returns {Promise<{name: string, time: number}>} resolves once the file has
 *   been written out completely; `time` is the number of seconds elapsed since
 *   the response started arriving. Rejects when the request fails, the
 *   connection drops mid-download, or the file cannot be written.
 */
const get_script = function (url, name) {
  return new Promise((resolve, reject) => {
    const request = ht.get(url, resp => {
      const tm = get_time_frsec()
      // streams to allow low mem. usage
      const out = fs.createWriteStream(name, 'binary')
      let opened = false
      let failed = false
      const fail = err => {
        if (failed) return
        failed = true
        reject(err)
        resp.unpipe(out)
        resp.destroy()
        out.destroy()
        // best effort: drop the partial file so a later run does not take it
        // for a finished download; only when this call created/truncated it
        if (opened) fs.unlink(name, () => {})
      }
      out.on('open', () => { opened = true })
      out.on('error', fail)
      resp.on('error', fail)
      // a connection that closes before the whole body arrived never emits
      // 'end', so without this check the promise would stay pending forever
      resp.on('close', () => {
        if (!resp.complete)
          fail(new Error(`connection closed before '${url}' was fully received`))
      })
      // 'finish' (not the response's 'end') means every byte was flushed, so
      // the caller can safely stat the file right after this resolves
      out.on('finish', () => resolve({name, time: get_time_frsec()-tm}))
      resp.pipe(out)
    })
    // dns / connection / tls failures are emitted on the request object
    request.on('error', reject)
  })
}
/**
 * Download every url of `files` into the folder `dirname`, `threads` files at
 * a time: a batch is started, awaited as a whole, reported, then the next
 * batch starts. Files whose name already exists in `dirname` are skipped.
 * A failed download is reported on its own line and does not stop the others.
 *
 * @param {string} dirname - existing folder to save into
 * @param {string[]} files - urls to fetch; the array is left untouched
 * @returns {Promise<void>}
 */
async function dofiles(dirname, files) {
  // check for prev. downloaded files; no hash||size check [TODO, maybe]
  // filter() builds a new list, so the caller's array is never modified
  const pending = [...files].filter(
    file => !fs.existsSync(path.join(dirname, get_filename(file))))
  if (pending.length < files.length) {
    console.log(` > ...skipped ${files.length-pending.length} already existing files`)
  }
  const files_cnt = pending.length
  const cnt_width = files_cnt.toString().length
  // cursor movement only exists on a terminal; piped output gets plain lines
  const is_tty = Boolean(process.stdout.isTTY)
  let parallel_queue = []
  for (const [idx, raw_url] of pending.entries()) {
    // every '#' must be escaped -- needed for some servers
    const file = raw_url.replace(/#/g, '%23')
    const file_name = get_filename(file)
    const files_progress = (idx + 1).toString().padStart(cnt_width, '0')
    process.stdout.write(` ${files_progress}/${files_cnt} '${file_name}' \n`)
    // each job settles to an outcome object instead of rejecting, so one
    // failed download cannot hide the results of the rest of its batch
    parallel_queue.push(
      get_script(file, path.join(dirname, file_name)).then(
        value => ({ok: true, file_name, value}),
        reason => ({ok: false, file_name, reason})))
    // do processing only if last file or parallel queue full
    const is_last = idx === files_cnt - 1
    if (!is_last && parallel_queue.length < threads) continue
    const outcomes = await Promise.all(parallel_queue)
    // go back up to the first progress line of this batch
    if (is_tty) process.stdout.moveCursor(0, -parallel_queue.length)
    for (const outcome of outcomes) {
      // on a terminal the result is appended to the file's own progress line;
      // otherwise the file name is repeated so the line is self-explanatory
      if (is_tty) process.stdout.moveCursor(68, 0)
      else process.stdout.write(` '${outcome.file_name}'`)
      if (outcome.ok) {
        const {name, time} = outcome.value
        const stat = fs.statSync(name)
        process.stdout.write(` [${roundfr(stat.size/1024/1024)}mb`
          +` @ ${roundfr(time)}s]\n`)
      } else {
        process.stdout.write(` [FAILED: ${outcome.reason}]\n`)
      }
    }
    parallel_queue = [] // reset queue
  }
}
/**
 * Go through a {folder_name: [urls], ...} map one folder after the other:
 * create the folder if needed, then download its files with dofiles().
 * An error in one folder is logged and the remaining folders are still
 * processed.
 *
 * @param {Object<string, string[]>} data - folder name -> list of urls
 * @returns {Promise<void>}
 */
const doitall = async (data) => {
  for (const [dirname, files] of Object.entries(data)) {
    console.log(`>>> Getting "${dirname}"`)
    try {
      // recursive: also creates missing parent folders ("a/b") and does
      // nothing when the folder already exists
      fs.mkdirSync(dirname, {recursive: true})
      await dofiles(dirname, files)
      console.log(` --------`)
    } catch (err) {
      console.log(`!!! ERROR !!! ${err}`)
    }
  }
}
console.log('>>> JS^2ON-file grabber v0.057 🚄 alpha by iexa\n')
// 1st and only param: number of `threads` (async `procs`); falls back to 2
// when it is missing, not a number, or below 1.
const requested_threads = Number.parseInt(process.argv[2], 10)
const threads = Number.isNaN(requested_threads) || requested_threads < 1
  ? 2
  : requested_threads
const time_start = get_time_frsec()
// a top-level array is a plain [files only] list saved into the current
// folder; anything else is read as {subfolder: [files], subfolder2: [files2]}
const results_promise = Array.isArray(data)
  ? dofiles('.', data)
  : doitall(data)
results_promise
  .then(_ => console.log(
    `>>> DONE. Took ${roundfr(get_time_frsec()-time_start)} secs overall.\n`))
  .catch(err => {
    // without this handler a failure would end as an unhandled rejection
    console.error(`!!! ERROR !!! ${err}`)
    process.exitCode = 1
  })
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment