@saagarjha
Created November 11, 2023 10:01
"Drain" files while they are processed to reduce free disk space requirements
// Sometimes you have a large file on a small disk and would like to "transform"
// it in some way: for example, by decompressing it. However, you might not have
// enough space on disk to keep both the compressed file and the
// decompressed results. If the process can be done in a streaming fashion, it
// would be nice if the file could be "drained"; that is, the file would be
// sequentially deleted as it is consumed. At the start you'd have 100% of the
// original file, somewhere in the middle you'd have about half of the original
// file and half of your output, and by the end the original file will be gone
// and you'll be left with just the results. If you do it this way, you might
// be able to do the entire operation without extra space!
//
// file_drain does exactly that. It's quite simple: first it reverses the file
// in place (technically, it reverses file *blocks*-but that's just for
// performance). Then it goes backwards through the file and reads it back out.
// Why all the reversing? Because POSIX offers an API to shrink a file from the
// end: ftruncate(2). This lets it trim the file as it goes along, which is
// exactly what it does.
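//
// For a concrete (hypothetical) picture, suppose BLOCK_SIZE were 4 and the
// file held the 11 bytes "hello world". After the reversal pass the blocks
// sit end to end in reverse order: "rld." / "o wo" / "hell", where "." is
// padding from the short final block. The output pass then reads blocks
// back to front, emitting "hell", "o wo", and finally just the 3 meaningful
// bytes "rld", truncating the file after each block, and so reproduces
// "hello world" on standard output while the file shrinks to nothing.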
//
// To use it, first compile it:
//
// $ gcc file_drain.c -o file_drain
//
// Then, pass it the file you want to drain, and it will do its work and
// redirect it out to standard output. For example:
//
// $ ./file_drain my_big_archive.tgz | tar zxf -
//
// WARNING: Due to the nature of how file_drain works, it is necessarily
// somewhat unsafe: it modifies your input file in place, first to reverse it
// and then to progressively prune it. THIS IS A DESTRUCTIVE OPERATION. It
// also means that if something fails partway through you'll be left with a
// mess that is likely not recoverable, and no effort is made to recover it.
// I would suggest only running this on files you can easily re-create if
// things go wrong (for example, by downloading them again).
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <unistd.h>
#define BLOCK_SIZE (16 << 14) /* 16 * 16 KiB = 256 KiB */
static void reverse(int fd, off_t head, off_t tail) {
	static char buffer1[BLOCK_SIZE];
	static char buffer2[BLOCK_SIZE];
	tail -= BLOCK_SIZE;
	while (head <= tail) {
		// Swap the blocks at head and tail. The final block may be short;
		// pread just returns fewer bytes there, and writing the full buffer
		// back pads the file out to a multiple of BLOCK_SIZE. output()
		// compensates by only ever emitting the original size in total.
		pread(fd, buffer1, BLOCK_SIZE, head);
		pread(fd, buffer2, BLOCK_SIZE, tail);
		pwrite(fd, buffer2, BLOCK_SIZE, head);
		pwrite(fd, buffer1, BLOCK_SIZE, tail);
		head += BLOCK_SIZE;
		tail -= BLOCK_SIZE;
	}
}
static void do_write(int fd, char *buffer, size_t size) {
	size_t written = 0;
	while (written < size) {
		ssize_t count = write(fd, buffer + written, size - written);
		// Not much we can do here, but we might as well check for it.
		// (Unless it's EINTR or something, but handling that is annoying.)
		assert(count > 0);
		written += count;
	}
}
static void output(int fd, off_t tail, size_t size) {
	static char buffer[BLOCK_SIZE];
	tail -= BLOCK_SIZE;
	while (size) {
		pread(fd, buffer, BLOCK_SIZE, tail);
		// The last block we emit (at offset 0) may hold fewer than
		// BLOCK_SIZE meaningful bytes, so cap the write at what's left.
		size_t to_write = size > BLOCK_SIZE ? BLOCK_SIZE : size;
		do_write(STDOUT_FILENO, buffer, to_write);
		// Shrink the file from the end, freeing the block we just emitted.
		ftruncate(fd, tail);
		tail -= BLOCK_SIZE;
		size -= to_write;
	}
}
int main(int argc, char **argv) {
	assert(argc == 2);
	int fd = open(*++argv, O_RDWR);
	assert(fd >= 0);
	off_t size = lseek(fd, 0, SEEK_END);
	assert(size >= 0);
	// Round the size up to a whole number of blocks; reverse() will pad the
	// file out to this length as it swaps blocks.
	off_t rounded_size = lseek(fd, (size + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, SEEK_SET);
	assert(rounded_size >= 0);
	reverse(fd, 0, rounded_size);
	output(fd, rounded_size, size);
	unlink(*argv);
}
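
The trick at the heart of file_drain can be rehearsed in isolation with ordinary shell tools (a sketch, assuming GNU coreutils for truncate(1), which wraps the same ftruncate(2) call the program uses; the filenames are made up):

```shell
# Consume a file from the end, shrinking it as we go -- the same
# ftruncate(2)-based idea file_drain builds on.
printf 'hello world' > demo.txt
tail -c 5 demo.txt > out.txt   # read the last 5 bytes ("world")
truncate -s 6 demo.txt         # drop them from the file; 6 bytes remain
cat demo.txt out.txt           # reassembles "hello world"
```

Unlike ftruncate, POSIX offers no portable call to cheaply remove bytes from the *front* of a file, which is why file_drain bothers reversing the blocks first.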