@rplacd · created July 4, 2016 20:45
Dump media from a Tumblr blog - a Node.js script
"use strict";
/*
tumblr-downloader.js - a Node.js script to download all media files posted in
a Tumblr blog, most recent first.
Usage:
node tumblr-downloader.js
Options - domain to download from, API key, and so on - are exposed below.
Hard-coding options isn't good behavior in general, but it's well adapted
to how I use this: often, but for very few sites (and so very few
combinations of arguments).
Remember to set API_KEY.
Setup:
Built with ES6 and Node.js v6.2.2. After installing Node.js...
- You'll need a Tumblr API key set in API_KEY below. An example of one is
available in the official Tumblr API demo. (Tumblr offers the source in good
faith; please don't abuse that access.)
- Download the various modules required via npm:
"npm install underscore cli async request sync-request wu mkdirp"
Fine details:
- Since media files on Tumblr don't come attached with their original
filenames, each download is given a computed filename derived from the
post's slug, source, or reblog key (see postToHumanTitle below).
- Downloads that correspond to filenames already existing in the target
folder will be skipped.
- A JSON "metadata log" is output on stdout for every file downloaded;
progress information on stderr.
- The dumper downloads as many media files as it can find.
- To quit the dumper at any time, send SIGINT to the application - Control-C
in the vast majority of setups. The dumper should remove any half-downloaded
files.
I put this source in the public domain.
*/
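// For reference, each metadata record on stdout is the full download spec
// built in postToDownloads below. An illustrative example (field values
// here are hypothetical):
// {"reporting":{"title":"sunset-over-water","source_title":"example",
//   "source_url":"http://example.tumblr.com/post/123","fname":"sunset-over-water-0.jpg"},
//  "tumblrMetadata":{...the full post object from the Tumblr API...},
//  "downloadInvoc":[...the url, filename, and target directory...]}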
const DOMAIN = (() => { throw "set me - to wallpapers.tumblr.com, for example."; })();
const DOWNLOAD_DIRECTORY = (() => { throw "set me - to ~/Photos/Wallpapers, for example."; })();
const API_KEY = (() => { throw "set me to a Tumblr API key - see the documentation."; })();
// Setting these will change behavior from that documented above.
// Their use is unsupported.
const NUM_CONCURRENT_DOWNLOADS = 2;
const REDOWNLOAD_EXISTING = false; // set to true to re-download files already on disk.
const NUM_TARGET_DOWNLOADS = Number.POSITIVE_INFINITY;
// Modules you'll have to install.
const _ = require('underscore');
const cli = require('cli');
const async = require('async');
const request = require('request');
const srequest = require('sync-request');
const wu = require('wu');
const mkdirp = require('mkdirp');
// Standard library modules, and a few monkeypatches.
const fs = require('fs');
const https = require('https');
const url = require('url');
const path = require('path');
// We reserve stdout as a source of structured output data;
// logging then goes, by default, to stderr.
console.info = console.warn;
// Begin control flow here.
main();
// Error handling policy - similar to Erlang's fail-fast:
// - unhandled - deletes the working directory:
// Serious logic errors.
// Failures in the main thread.
// - handled and suppressed:
// Failures in worker threads.
function main() {
// Create the UI state.
let ui = "a placeholder.";
// Set up filesystem state.
mkdirp.sync(DOWNLOAD_DIRECTORY);
// Now, given a stream of posts that
// - we've trimmed to the correct size;
// - converted into "download specifications".
let downloads = wu(posts(DOMAIN, API_KEY))
.take(NUM_TARGET_DOWNLOADS)
.concatMap(postToDownloads);
// ...for each post,
async.eachLimit(downloads, NUM_CONCURRENT_DOWNLOADS,
(download, cont) => {
console.info(DOWNLOAD_DIRECTORY, download["reporting"]["fname"]);
let guarded_IO_download = IO_handlingSIGINT(
IO_download,
() => {
let fpath = path.join(
DOWNLOAD_DIRECTORY,
download["reporting"]["fname"]
);
console.info("Handled SIGINT; gracefully removing ", fpath);
if(fs.existsSync(fpath)) {
fs.unlinkSync(fpath);
}
}
);
// download the post (and rolling back if interrupted);
guarded_IO_download.apply(this,
download["downloadInvoc"].concat(ui, _cont =>
// output metadata on stdout.
IO_outputMetadata(download, cont)
)
);
},
// When done with all of our downloads, applaud.
() => console.info("main: Done.")
);
}
// These IO functions have a property that all functions implementing
// asynchronous IO in node.js have: they guarantee their last argument,
// "cont", will be called after all asynchronous operations have finished.
process.once('SIGINT', () => {
process.exit();
});
function IO_handlingSIGINT(body, rollback) {
// An IO function combinator that installs a rollback callback before
// entering the body, and removes it in the continuation (regardless of
// whether the callback is passed an error).
return function IO_wrapper() {
let args = Array.from(arguments);
let continuation = args.pop();
process.setMaxListeners(1 + process.getMaxListeners());
process.prependOnceListener('SIGINT', rollback);
return body(...args, err => {
process.removeListener('SIGINT', rollback);
return continuation(err);
});
};
}
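// A usage sketch (mirroring main above; the rollback name is hypothetical):
// wrap an IO function with a rollback to run if the user interrupts it
// mid-flight:
//   let guarded = IO_handlingSIGINT(IO_download, removePartialFile);
//   guarded(url, fname, targetDir, ui, err => { /* carry on */ });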
function IO_download(url, fname, targetDir, ui, cont) {
console.info('IO_download: beginning download', fname);
if(REDOWNLOAD_EXISTING) {
console.info("IO_download: user has chosen to re-download posts already on disk");
}
let fpath = path.join(targetDir, fname);
if(!REDOWNLOAD_EXISTING && fs.existsSync(fpath)) {
console.info("IO_download: skipping existing download " +
fpath + " as requested by user.");
return cont();
}
let file = fs.createWriteStream(fpath);
return https.get(url, res => {
let totalBytes = parseInt(res.headers['content-length'], 10);
res.pipe(file);
res.on('data', function(downloadChunk) {
// Do something interesting: show a progress bar, perhaps,
// comparing bytes received so far against totalBytes above.
});
file.on('finish', function() {
console.info('IO_download: finished downloading', fname, fpath);
file.close(cont);
});
file.on('error', function(err) {
console.error('IO_download: error downloading', fname, err);
fs.unlink(fpath, () => cont(err));
});
});
}
function IO_outputMetadata(metadata_record, cont) {
console.log(JSON.stringify(metadata_record));
cont();
}
// Operations on posts.
function postToHumanTitle(post, opt_suffix) {
// Extract, from the post, a title that is, in order of acceptability,
// (a) human-readable,
// (b) human-memorisable.
// Given a substantial amount of metadata, we select from many possibilities:
if (post['slug'] !== "") {
return post['slug'];
}
// (Check to see whether the source_url has a descriptive name, or
// whether it simply uses a numerical post ID),
const urlFname = addr => path.basename(url.parse(addr).pathname);
if (undefined !== post['source_url'] && "" !== post['source_url']) {
let srcUrlB = urlFname(post['source_url']);
if(isNaN(parseInt(srcUrlB, 10)))
return srcUrlB;
}
// The same for the reblog_key: is it a name, or a number?
if(isNaN(parseInt(post['reblog_key'], 10))) {
return post['reblog_key'];
}
// If that's failed, use the source_title + reblog_key...
if (undefined !== post['source_title'] && "" !== post['source_title']) {
return post['source_title'] + '-' + post['reblog_key'];
}
// And if the source_title doesn't exist either, use the reblog_key only.
return post['reblog_key'];
}
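// A worked example (hypothetical post): with slug "" and a source_url whose
// basename is a numeric post ID like ".../123456", both early returns fail,
// and an alphabetic reblog_key such as "AbCdEfGh" is returned instead. A
// descriptive source_url basename like ".../sunset-over-water" would have
// won at the second check.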
function postToDownloads(post) {
// TODO: remove some of the duplicated object literal structure
// here with an object property spread operator.
let name = postToHumanTitle(post);
let fname = (name, dotExt) => name + dotExt;
console.info('postToDownloads: extracted from response post ' + name);
switch (post['type']) {
case 'video':
{
let dotExt = path.extname(url.parse(post['video_url']).pathname);
return [{
reporting: {
title: name,
source_title: post['source_title'],
source_url: post['source_url'],
fname: fname(name, dotExt)
},
tumblrMetadata: post,
downloadInvoc: [post['video_url'], fname(name, dotExt), DOWNLOAD_DIRECTORY]
}];
}
case 'photo':
{
return post['photos'].map(function(photo, idx) {
let dotExt = path.extname(url.parse(photo['original_size']['url']).pathname);
return {
reporting: {
title: name,
source_title: post['source_title'],
source_url: post['source_url'],
fname: fname(name + '-' + idx, dotExt)
},
tumblrMetadata: post,
downloadInvoc: [photo['original_size']['url'], fname(name + '-' + idx, dotExt), DOWNLOAD_DIRECTORY]
};
});
}
default:
{
console.error('postToDownloads: passed a post I can\'t handle: ' + name + ', of type ' + post['type']);
return [];
}
}
}
// Operations on arrays of download specs.
function downloadsLessExisting(downloads) {
// TODO: I should be reading some metadata report and comparing
// Tumblr's authoritative post IDs, rather than simply looking at filenames
// - the mapping between posts and filenames will change, after all.
// SUBTLETY: this will treat posts with multiple images as effectively one
// download.
// Forgive me - the forbidden fruit of synchronous IO lies bitten.
return downloads.filter(d => !downloadExistsP(d));
function downloadExistsP(download) {
let potentialPath = path.join(DOWNLOAD_DIRECTORY, download['reporting']['fname']);
let existsP = fs.existsSync(potentialPath);
if(existsP) {
console.info('downloadExistsP: ' +
download['reporting']['title'] + ' is already downloaded at ' +
potentialPath + '. Filtering out.');
} else {
console.info('downloadExistsP: ' +
download['reporting']['title'] + ' is not yet downloaded. Not filtering out.');
}
return existsP;
}}
// An iterator that continuously returns Tumblr posts, newest-first
// - sequential access to Tumblr posts is the API's sole form of
// access to posts.
function* posts(domain, apiKey) {
// Define (but not enter) a loop that downloads post descriptors from
// within a certain range, parses them, and yields them to the user,
// one-by-one...
// (The loop iterates by recursively delegating to itself via yield*.)
function* download20MorePosts(currBatch) {
// Ask tumblr for a fixed number of posts.
var res = srequest(
'GET',
'https://api.tumblr.com/v2/blog/' + domain + '/posts?api_key=' + apiKey + '&offset=' + (currBatch * 20)
);
// The request may fail, of course; check for these cases.
if (res.statusCode == 404) {
console.error("posts: 404 from tumblr. Check input blog identifier or API key.");
throw res.statusCode;
} else if (res.statusCode != 200) {
console.error("posts: a non-success status code I can't handle: ", res.statusCode);
throw res.statusCode;
} else {
// A 200: the request succeeded, and we may continue.
console.info(
"posts: now processing posts " +
currBatch * 20 +
' - ' +
(currBatch + 1) * 20
);
// Destructure the raw response;
let posts = JSON.parse(res.body)['response']['posts'];
// At which point we've done all we can to process the posts -
// either we've run out of posts, and so terminate; or
// hand them over the user.
if(posts.length == 0) {
return;
} else {
yield* posts;
yield* download20MorePosts(currBatch + 1);
}
}
}
// ...now actually start the loop.
yield* download20MorePosts(0);
}
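// A consumption sketch (this is what main does above): the generator is
// lazy, so taking only a few posts fetches only as many 20-post pages as
// are actually needed:
//   wu(posts(DOMAIN, API_KEY)).take(5).forEach(p => console.info(p['type']));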