@rplacd · created July 4, 2016 20:45
Dump media from a Tumblr blog - a Node.js script
"use strict";
/*
tumblr-downloader.js - a Node.js script to download all media files posted in
a Tumblr blog, most recent first.
Usage:
node tumblr-downloader.js
Options - domain to download from, API key, and so on - are exposed below.
Hard-coding options isn't good behavior in general, but it's well adapted
to how I use this: often, but for very few sites (and so very few
combinations of arguments).
Remember to set API_KEY.
Setup:
Built with ES6 and Node.js v6.2.2. After installing Node.js...
- You'll need a Tumblr API key set in API_KEY below. An example of one is
available in the official Tumblr API demo. (Tumblr offers the source in good
faith; please don't abuse that access.)
- Download the various modules required via npm:
"npm install underscore cli async request sync-request wu mkdirp"
Fine details:
- Since media files on Tumblr don't come attached with their original
filenames, each download is given a computed filename derived from the
post's slug, source, or reblog key (see postToHumanTitle below).
- Downloads that correspond to filenames already existing in the target
folder will be skipped.
- A JSON "metadata log" is output on stdout for every file downloaded;
progress information on stderr.
- The dumper downloads as many media files as it can find.
- To quit the dumper at any time, send SIGINT to the application - Control-C
in the vast majority of setups. The dumper should remove any half-downloaded
files.
I put this source in the public domain.
*/
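// For reference, each metadata record on stdout is the full download spec
// built in postToDownloads below. An illustrative example (field values
// here are hypothetical):
// {"reporting":{"title":"sunset-over-water","source_title":"example",
//   "source_url":"http://example.tumblr.com/post/123","fname":"sunset-over-water-0.jpg"},
//  "tumblrMetadata":{...the full post object from the Tumblr API...},
//  "downloadInvoc":[...the url, filename, and target directory...]}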
const DOMAIN = (() => { throw "set me - to wallpapers.tumblr.com, for example."; })();
const DOWNLOAD_DIRECTORY = (() => { throw "set me - to ~/Photos/Wallpapers, for example."; })();
const API_KEY = (() => { throw "set me to a Tumblr API key - see the documentation."; })();
// Setting these will change behavior from that documented above.
// Their use is unsupported.
const NUM_CONCURRENT_DOWNLOADS = 2;
const REDOWNLOAD_EXISTING = false; // set to true to re-download files already on disk.
const NUM_TARGET_DOWNLOADS = Number.POSITIVE_INFINITY;
// Modules you'll have to install.
const _ = require('underscore');
const cli = require('cli');
const async = require('async');
const request = require('request');
const srequest = require('sync-request');
const wu = require('wu');
const mkdirp = require('mkdirp');
// Standard library modules, and a few monkeypatches.
const fs = require('fs');
const https = require('https');
const url = require('url');
const path = require('path');
// We reserve stdout as a source of structured output data;
// logging then goes, by default, to stderr.
console.info = console.warn;
// Begin control flow here.
main();
// Error handling policy - similar to Erlang's fail-fast:
// - unhandled - deletes the working directory:
// Serious logic errors.
// Failures in the main thread.
// - handled and suppressed:
// Failures in worker threads.
function main() {
// Create the UI state.
let ui = "a placeholder.";
// Set up filesystem state.
mkdirp.sync(DOWNLOAD_DIRECTORY);
// Now, given a stream of posts that
// - we've trimmed to the correct size;
// - converted into "download specifications".
let downloads = wu(posts(DOMAIN, API_KEY))
.take(NUM_TARGET_DOWNLOADS)
.concatMap(postToDownloads);
// ...for each post,
async.eachLimit(downloads, NUM_CONCURRENT_DOWNLOADS,
(download, cont) => {
console.info(DOWNLOAD_DIRECTORY, download["reporting"]["fname"]);
let guarded_IO_download = IO_handlingSIGINT(
IO_download,
() => {
let fpath = path.join(
DOWNLOAD_DIRECTORY,
download["reporting"]["fname"]
);
console.info("Handled SIGINT; gracefully removing ", fpath);
if(fs.existsSync(fpath)) {
fs.unlinkSync(fpath);
}
}
);
// download the post (and rolling back if interrupted);
guarded_IO_download.apply(this,
download["downloadInvoc"].concat(ui, _cont =>
// output metadata on stdout.
IO_outputMetadata(download, cont)
)
);
},
// When done with all of our downloads, applaud.
() => console.info("main: Done.")
);
}
// These IO functions have a property that all functions implementing
// asynchronous IO in node.js have: they guarantee their last argument,
// "cont", will be called after all asynchronous operations have finished.
process.once('SIGINT', () => {
process.exit();
});
function IO_handlingSIGINT(body, rollback) {
// An IO function combinator that installs a rollback callback before
// entering the body, and removes it in the continuation (regardless of
// whether the callback is passed an error).
return function IO_wrapper() {
let args = Array.from(arguments);
let continuation = args.pop();
process.setMaxListeners(1 + process.getMaxListeners());
process.prependOnceListener('SIGINT', rollback);
return body(...args, err => {
process.removeListener('SIGINT', rollback);
return continuation(err);
});
};
}
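// A usage sketch (mirroring main above; the rollback name is hypothetical):
// wrap an IO function with a rollback to run if the user interrupts it
// mid-flight:
//   let guarded = IO_handlingSIGINT(IO_download, removePartialFile);
//   guarded(url, fname, targetDir, ui, err => { /* carry on */ });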
function IO_download(url, fname, targetDir, ui, cont) {
console.info('IO_download: beginning download', fname);
if(REDOWNLOAD_EXISTING) {
console.info("IO_download: user has chosen to re-download posts already on disk");
}
let fpath = path.join(targetDir, fname);
if(!REDOWNLOAD_EXISTING && fs.existsSync(fpath)) {
console.info("IO_download: skipping existing download " +
fpath + " as requested by user.");
return cont();
}
let file = fs.createWriteStream(fpath);
return https.get(url, res => {
let totalBytes = parseInt(res.headers['content-length'], 10);
res.pipe(file);
res.on('data', function(downloadChunk) {
// Do something interesting: show a progress bar, perhaps,
// comparing bytes received so far against totalBytes above.
});
file.on('finish', function() {
console.info('IO_download: finished downloading', fname, fpath);
file.close(cont);
});
file.on('error', function(err) {
console.error('IO_download: error downloading', fname, err);
fs.unlink(fpath, () => cont(err));
});
});
}
function IO_outputMetadata(metadata_record, cont) {
console.log(JSON.stringify(metadata_record));
cont();
}
// Operations on posts.
function postToHumanTitle(post, opt_suffix) {
// Extract, from the post, a title that is, in order of acceptability,
// (a) human-readable,
// (b) human-memorisable.
// Given a substantial amount of metadata, we select from many possibilities:
if (post['slug'] !== "") {
return post['slug'];
}
// (Check to see whether the source_url has a descriptive name, or
// whether it simply uses a numerical post ID),
const urlFname = addr => path.basename(url.parse(addr).pathname);
if (undefined !== post['source_url'] && "" !== post['source_url']) {
let srcUrlB = urlFname(post['source_url']);
if(isNaN(parseInt(srcUrlB, 10)))
return srcUrlB;
}
// The same for the reblog_key: is it a name, or a number?
if(isNaN(parseInt(post['reblog_key'], 10))) {
return post['reblog_key'];
}
// If that's failed, use the source_title + reblog_key...
if (undefined !== post['source_title'] && "" !== post['source_title']) {
return post['source_title'] + '-' + post['reblog_key'];
}
// And if the source_title doesn't exist either, use the reblog_key only.
return post['reblog_key'];
}
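// A worked example (hypothetical post): with slug "" and a source_url whose
// basename is a numeric post ID like ".../123456", both early returns fail,
// and an alphabetic reblog_key such as "AbCdEfGh" is returned instead. A
// descriptive source_url basename like ".../sunset-over-water" would have
// won at the second check.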
function postToDownloads(post) {
// TODO: remove some of the duplicated object literal structure
// here with an object property spread operator.
let name = postToHumanTitle(post);
let fname = (name, dotExt) => name + dotExt;
console.info('postToDownloads: extracted from response post ' + name);
switch (post['type']) {
case 'video':
{
let dotExt = path.extname(url.parse(post['video_url']).pathname);
return [{
reporting: {
title: name,
source_title: post['source_title'],
source_url: post['source_url'],
fname: fname(name, dotExt)
},
tumblrMetadata: post,
downloadInvoc: [post['video_url'], fname(name, dotExt), DOWNLOAD_DIRECTORY]
}];
}
case 'photo':
{
return post['photos'].map(function(photo, idx) {
let dotExt = path.extname(url.parse(photo['original_size']['url']).pathname);
return {
reporting: {
title: name,
source_title: post['source_title'],
source_url: post['source_url'],
fname: fname(name + '-' + idx, dotExt)
},
tumblrMetadata: post,
downloadInvoc: [photo['original_size']['url'], fname(name + '-' + idx, dotExt), DOWNLOAD_DIRECTORY]
};
});
}
default:
{
console.error('postToDownloads: passed a post I can\'t handle: ' + name + ', of type ' + post['type']);
return [];
}
}
}
// Operations on arrays of download specs.
function downloadsLessExisting(downloads) {
// TODO: I should be reading some metadata report and comparing
// Tumblr's authoritative post IDs, rather than simply looking at filenames
// - the mapping between posts and filenames will change, after all.
// SUBTLETY: this will treat posts with multiple images as effectively one
// download.
// Forgive me - the forbidden fruit of synchronous IO lies bitten.
return downloads.filter(d => !downloadExistsP(d));
function downloadExistsP(download) {
let potentialPath = path.join(DOWNLOAD_DIRECTORY, download['reporting']['fname']);
let existsP = fs.existsSync(potentialPath);
if(existsP) {
console.info('downloadExistsP: ' +
download['reporting']['title'] + ' is already downloaded at ' +
potentialPath + '. Filtering out.');
} else {
console.info('downloadExistsP: ' +
download['reporting']['title'] + ' is not yet downloaded. Not filtering out.');
}
return existsP;
}}
// An iterator that continuously returns Tumblr posts, newest-first
// - sequential access to Tumblr posts is the API's sole form of
// access to posts.
function* posts(domain, apiKey) {
// Define (but not enter) a loop that downloads post descriptors from
// within a certain range, parses them, and yields them to the user,
// one-by-one...
// (The loop iterates by recursively delegating to itself via yield*.)
function* download20MorePosts(currBatch) {
// Ask tumblr for a fixed number of posts.
var res = srequest(
'GET',
'https://api.tumblr.com/v2/blog/' + domain + '/posts?api_key=' + apiKey + '&offset=' + (currBatch * 20)
);
// The request may fail, of course; check for these cases.
if (res.statusCode == 404) {
console.error("posts: 404 from tumblr. Check input blog identifier or API key.");
throw res.statusCode;
} else if (res.statusCode != 200) {
console.error("posts: a non-success status code I can't handle: ", res.statusCode);
throw res.statusCode;
} else {
// A 200: the request succeeded, and we may continue.
console.info(
"posts: now processing posts " +
currBatch * 20 +
' - ' +
(currBatch + 1) * 20
);
// Destructure the raw response;
let posts = JSON.parse(res.body)['response']['posts'];
// At which point we've done all we can to process the posts -
// either we've run out of posts, and so terminate; or
// hand them over the user.
if(posts.length == 0) {
return;
} else {
yield* posts;
yield* download20MorePosts(currBatch + 1);
}
}
}
// ...now actually start the loop.
yield* download20MorePosts(0);
}
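// A consumption sketch (this is what main does above): the generator is
// lazy, so taking only a few posts fetches only as many 20-post pages as
// are actually needed:
//   wu(posts(DOMAIN, API_KEY)).take(5).forEach(p => console.info(p['type']));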