drydenp/sparsepack.c

## sparsepack.c
#include "sparsepack.h"

#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>


/*
 * Packer state carried from one input block to the next.
 */
struct context {
	char *cur_buf;          // where the next input block is read into
	struct chunk_h header;  // scratch chunk header, filled in right before it is written
	int buf_read;  // number of data (non-zero) blocks buffered and not yet written
	int zero_read; // number of consecutive all-zero blocks seen and not yet written
	int block_size;         // size of one block in bytes
};

/*
 * Counters describing what the packer has written so far.
 * The total_* members are not updated while packing; fixup() derives them
 * from the zero_* and data_* members.
 */
struct statistics {
	off_t zero_chunks_written;   // zero chunks emitted
	off_t data_chunks_written;   // data chunks emitted
	off_t total_chunks_written;  // zero + data chunks, set by fixup()

	off_t zero_blocks_written;   // blocks covered by zero chunks
	off_t data_blocks_written;   // blocks carried in data chunks
	off_t total_blocks_written;  // zero + data blocks, set by fixup()
};

/*
 * Fill in the derived totals of *s: each total is the sum of the matching
 * data and zero counters. The other members are left untouched.
 */
void fixup(struct statistics *s) {
	s->total_blocks_written = s->data_blocks_written + s->zero_blocks_written;
	s->total_chunks_written = s->data_chunks_written + s->zero_chunks_written;
}

// Block size in bytes used by the packer; starts at the compile-time default.
int block_size = DEF_BLOCK_SIZE;

/*
 * Report whether a block contains any non-zero byte.
 *
 * chunk: start of the block to scan.
 * size:  number of bytes to scan; zero or a negative value scans nothing.
 *
 * Returns true as soon as a non-zero byte is found and false when all
 * `size` bytes are zero.
 *
 * The scan is byte-wise so that every byte is examined, including the final
 * ones, and so that sizes which are not a multiple of sizeof(int) are handled.
 */
bool has_data(char* chunk, int size) {
	int i;

	for (i = 0; i < size; i++) {
		if (chunk[i] != 0) {
			return true;
		}
	}
	return false;
}

/*
 * Emit a chunk header of type `unused` covering the pending run of zero
 * blocks (ctx->zero_read), log it on stderr, account for it in *stats and
 * reset the pending run to zero. No block data follows a zero header.
 */
void write_zero_header(struct context *ctx, struct statistics *stats) {
	const int run = ctx->zero_read;

	ctx->header.size = run;
	ctx->header.type = unused;
	fprintf(stderr, "Writing zero chunk of %d blocks\n", run);
	fwrite(&ctx->header, sizeof(ctx->header), 1, stdout);

	stats->zero_blocks_written += run;
	stats->zero_chunks_written += 1;

	ctx->zero_read = 0;
}

/*
 * Emit a chunk header of type `used` followed by the ctx->buf_read buffered
 * blocks that start at full_buf, log it on stderr and account for it in
 * *stats. Afterwards the buffer is considered empty: the pending count is
 * reset and ctx->cur_buf points back at full_buf.
 */
void write_data_header(struct context *ctx, char *full_buf, struct statistics *stats) {
	const int pending = ctx->buf_read;

	ctx->header.size = pending;
	ctx->header.type = used;
	fprintf(stderr, "Writing data chunk of %d blocks\n", pending);
	fwrite(&ctx->header, sizeof(ctx->header), 1, stdout);
	fwrite(full_buf, ctx->block_size, pending, stdout);

	stats->data_blocks_written += pending;
	stats->data_chunks_written += 1;

	ctx->cur_buf = full_buf;
	ctx->buf_read = 0;
}

// Global packer counters. Only four initializers are listed; the remaining
// members are zero-initialized as well.
struct statistics stats = {0, 0, 0, 0};

int main(char args[]) {
	char *buf = malloc(block_size * MAX_BLOCK_COUNT);
	struct context ctx = {0};
	int latest;

	ctx.cur_buf = buf;
	ctx.block_size = block_size;

	// so what are our cases?
	// 1. data has been read and we read a zero block.
	// 2. data has been read and we read a data block.
	// 3. data has not been read and we read a zero block.
	// 4. data has not been read and we read a data block.

	// this code will fail to read a partial block at the end.

	fwrite(&our_header, sizeof(struct spaf_h), 1, stdout);

	while (latest = fread(ctx.cur_buf, block_size, 1, stdin) == 1) {
		// 1. scan the chunk for data
		if (has_data(ctx.cur_buf, block_size)) {
			// case 1: we read data but we had zeroes before.

			if (ctx.zero_read > 0) {
				// we write out a zero header:
				write_zero_header(&ctx, &stats);
				ctx.buf_read = 1;
				ctx.cur_buf += block_size;
			} else {
			// case 2: we may or may not have read data before.
				ctx.buf_read++;
				if (ctx.buf_read == MAX_BLOCK_COUNT) {
					// write out chunk
					write_data_header(&ctx, buf, &stats);
				} else {
					ctx.cur_buf += block_size;
				}
			}
		} else {
			// case 4: we read zero but we have existing data
			if (ctx.buf_read > 0) {
				write_data_header(&ctx, buf, &stats);
				ctx.zero_read = 1;
			} else {
			// case 3: we may or may not have read zero before:
				ctx.zero_read++;
				// check whether we are exceeding the maximum chunk size.
				// for 4k blocks this is 64k * 4k = 256MB.
				// that's not a lot...
				if (ctx.zero_read == (1 << (sizeof(unsigned short) * 8)) - 1) {
					write_zero_header(&ctx, &stats);
				}
			}
		}
	}
	if (ctx.zero_read > 0) {
		write_zero_header(&ctx, &stats);
	} else {
		write_data_header(&ctx, buf, &stats);
	}

	fixup(&stats);

	fprintf(stderr, "Bytes read %" PRIu64 ", data chunks written %d totalling %" PRIu64 " blocks and %" PRIu64 " bytes. Zero chunks written %d totalling %" PRIu64 " blocks and %" PRIu64 " bytes. Total blocks written %" PRIu64 " and total chunks %d.\n", stats.total_blocks_written * block_size, stats.data_chunks_written, stats.data_blocks_written, stats.data_blocks_written * block_size, stats.zero_chunks_written, stats.zero_blocks_written, stats.zero_blocks_written * block_size, stats.total_blocks_written, stats.total_chunks_written);
}


## sparsepack.h
// Signature and format version stored at the start of the stream header.
#define MAGIC "SPAF"
#define VERSION "10"

// Boolean constants used together with the `bool` typedef declared in this header.
#define false 0
#define true 1

// MAX_BLOCK_COUNT: the packer writes a data chunk once this many blocks are buffered.
// DEF_BLOCK_SIZE:  default block size in bytes.
// Both programs allocate a buffer of DEF_BLOCK_SIZE * MAX_BLOCK_COUNT bytes.
#define MAX_BLOCK_COUNT 4096
#define DEF_BLOCK_SIZE 4096

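/**
 *  Selects 64-bit file offsets: off_t becomes 64 bits wide and ftruncate takes
 *  a 64-bit length. It only takes effect when it is defined before the first
 *  system header is included, so this header must be included first.
 */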

#define _FILE_OFFSET_BITS 64

#include <stdint.h>

/*
 *  Although I don't really see why I shouldn't use 64-bit versions directly.
 *  fprintf requires the use of PRIu64 or PRIi64 macros to select the proper long type to get at 64 bits.
 */

/* Checksum algorithm identifiers stored in spaf_h.checksum_algo. */
enum checksum_algos {
	algo_crc32 = 0,
	algo_md5sum
};

// One-byte boolean and raw-byte types used for the header bit-fields.
typedef unsigned char bool;
typedef unsigned char byte;

/*
 * Stream header, written once at the start of a SPAF stream.
 * The struct is packed; the numbers in the trailing comments of the last
 * four members are the running byte offset after that member.
 */
struct spaf_h {                         // 16 bytes
	char magic[4];                      // "SPAF"
	char version[2];                    // "10"
	bool hammington_used:8;             // Hamming-code flag; unused. It could be used to get a kind of ECC
	                                    // correction without using ECC memory, but would probably be rather slow.
	byte hammington_block_size:8;       // Would have to be 247 with 8 parity bits and one unused extra parity;
	                                    // this is the data part. The full code is (255, 247) and matrices could
										// be downloaded at [1].
										// The output of the matrix is the 8 parity bit position values that can
										// indicate an error. To be error free, the computation of this vector
										// needs to be the zero vector.

	bool checksum_used:8;                 //   9
	unsigned short checksum_bits:16;      //  11
	enum checksum_algos checksum_algo:8;  //  12
	uint32_t block_size:32;          //  16  (block size in bytes)
} __attribute__((packed));

// [1] http://www.uni-kl.de/en/channel-codes/channel-codes-database/bch-and-hamming/

/* Chunk kind stored in a chunk header. */
enum block_type {
	used = 0,  // data chunk: the header is followed by `size` blocks of data
	unused     // zero chunk: stands for `size` all-zero blocks, no data follows
};

/* Packed 4-byte header that precedes every chunk in the stream. */
struct chunk_h {

	enum block_type type:8;   // used (data) or unused (zero run)
	unsigned short size:16;   // number of blocks the chunk covers
	byte padding:8;           // pads the header to 4 bytes

} __attribute__((packed));

/*
 * Packed chunk header variant that carries a 32-bit checksum instead of the
 * padding byte. Nothing in the code shown here reads or writes it.
 */
struct chunk_h_checksum {
	enum block_type type:8;
	unsigned short size:16;
	unsigned long checksum:32;
} __attribute__((packed));


struct spaf_h our_header = {
	MAGIC,
	VERSION,
	false,
	247,
	false,
	32,
	algo_crc32,
	4096
};


## sparseunpack.c
#include "sparsepack.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/unistd.h>

/*
 * Counters describing what the unpacker has decoded so far.
 * The total_* members are not updated while decoding; fixup() derives them
 * from the zero_* and data_* members.
 */
struct statistics {
	off_t zero_chunks_read;   // zero chunks decoded
	off_t data_chunks_read;   // data chunks decoded
	off_t total_chunks_read;  // zero + data chunks, set by fixup()

	off_t zero_blocks_read;   // blocks covered by zero chunks
	off_t data_blocks_read;   // blocks carried in data chunks
	off_t total_blocks_read;  // zero + data blocks, set by fixup()
};

/*
 * Fill in the derived totals of *s: each total is the sum of the matching
 * data and zero counters. The other members are left untouched.
 */
void fixup(struct statistics *s) {
	s->total_blocks_read = s->data_blocks_read + s->zero_blocks_read;
	s->total_chunks_read = s->data_chunks_read + s->zero_chunks_read;
}

/* Kind of object stdout is connected to, as reported by obtain_file_type(). */
enum file_type {
	ft_invalid = 0,  // not usable as output; main() refuses to write to it
	ft_file,         // zero chunks are skipped with a seek; may be truncated at the end
	ft_block,        // zero chunks are skipped with a seek
	ft_pipe          // zero blocks are written out explicitly
};

/*
 * Classify what stdout is connected to.
 *
 * Returns ft_file for a regular file, ft_block for a block device and
 * ft_pipe for any non-seekable stream (FIFO/pipe, character device such as
 * a terminal, or socket). Returns ft_invalid when fstat fails or the object
 * is of another kind.
 *
 * The S_IS*() macros are used because the S_IF* values are type codes
 * inside the S_IFMT field, not independent bit flags; testing them with a
 * plain bitwise AND matches the wrong file types.
 */
enum file_type obtain_file_type(void) {
	struct stat buf;

	if (fstat(fileno(stdout), &buf) != 0) {
		return ft_invalid;
	}
	if (S_ISREG(buf.st_mode)) return ft_file;
	if (S_ISBLK(buf.st_mode)) return ft_block;
	if (S_ISFIFO(buf.st_mode) || S_ISCHR(buf.st_mode)) return ft_pipe;
#ifdef S_ISSOCK
	if (S_ISSOCK(buf.st_mode)) return ft_pipe;
#endif
	return ft_invalid;
}


int main(char args[]) {
	fprintf(stderr, "Sizeof %d\n", sizeof(off_t));
	exit(1);

	char *buf = malloc(DEF_BLOCK_SIZE * MAX_BLOCK_COUNT);
	struct chunk_h chunk_header;

	struct spaf_h my_header;
	struct statistics stats = {0};

	char *zero;

	int read, i, res;
	enum block_type last;

	enum file_type stdout_type = obtain_file_type();

	fprintf(stderr, "File type is %s.\n", stdout_type == ft_file ? "file" : (stdout_type == ft_block) ? "block device" : "pipe");

	if (stdout_type == ft_invalid) {
		fprintf(stderr, "Cannot write to this output.\n");
		return 1;
	}

	fread(&my_header, sizeof(struct spaf_h), 1, stdin);

	if (memcmp(&my_header.magic, MAGIC, 4) == 0 && memcmp(&my_header.version, VERSION, 2) == 0) {
		fprintf(stderr, "Valid SPAF header found in input stream.\n");
	} else {
		fprintf(stderr, "No valid SPAF header found in input stream.\n");
		return 1;
	}

	fprintf(stderr, "Block size %d.\n", my_header.block_size);

	zero = malloc(my_header.block_size);
	memset(zero, 0, my_header.block_size);

	while (fread(&chunk_header, sizeof(struct chunk_h), 1, stdin)) {
		fprintf(stderr, "Decoding %s chunk of %d blocks\n", chunk_header.type == used ? "data" : "zero", chunk_header.size);

		if (chunk_header.type == used) {
			read = fread(buf, my_header.block_size, chunk_header.size, stdin);
			fwrite(buf, my_header.block_size, chunk_header.size, stdout);
			stats.data_chunks_read += 1;
			stats.data_blocks_read += chunk_header.size;
		} else {
			// if it is a regular file, then seek and truncate at the end, creating a sparse file.
			if (stdout_type == ft_file || stdout_type == ft_block) {
				fseek(stdout, chunk_header.size * my_header.block_size, SEEK_CUR);
			} else {
				for (i = 0; i < chunk_header.size; i++) {
					fwrite(zero, my_header.block_size, 1, stdout);
				}
			}
			stats.zero_chunks_read += 1;
			stats.zero_blocks_read += chunk_header.size;
		}
	}
	fixup(&stats);

	if (chunk_header.type == unused && stdout_type == ft_file) {
		fprintf(stderr, "Truncating file at %" PRIu64 " bytes.\n", stats.total_blocks_read * my_header.block_size);
		res = ftruncate(fileno(stdout), stats.total_blocks_read * my_header.block_size);
		/*if (res) {
			fprintf(stderr, "Truncate error.\n");
		}*/
	}

	fprintf(stderr, "Total blocks processed: %" PRIu64 ".", stats.total_blocks_read);
	fprintf(stderr, " Total chunks: %" PRIu64 ".", stats.total_chunks_read);
	fprintf(stderr, " Zero: %" PRIu64 "/%" PRIu64 ", data: %" PRIu64 "/%" PRIu64 ". Data %: %04.2f\n",
		stats.zero_blocks_read,
		stats.zero_chunks_read,
		stats.data_blocks_read,
		stats.data_chunks_read,
		stats.data_blocks_read / (double)stats.total_blocks_read);
}
	#include "sparsepack.h"

	#include <stdio.h>
	#include <stdlib.h>
	#include <inttypes.h>



	/* Packer state carried from one input block to the next. */
	struct context {
	// where the next input block is read into
	char *cur_buf;
	// scratch chunk header, filled in right before it is written
	struct chunk_h header;
	int buf_read; // number of data (non-zero) blocks buffered and not yet written
	int zero_read; // number of consecutive all-zero blocks seen and not yet written
	// size of one block in bytes
	int block_size;
	};

	/*
	 * Counters describing what the packer has written so far.
	 * The total_* members are derived from the others by fixup().
	 */
	struct statistics {
	off_t zero_chunks_written;
	off_t data_chunks_written;
	off_t total_chunks_written;

	off_t zero_blocks_written;
	off_t data_blocks_written;
	off_t total_blocks_written;
	};

	/*
	 * Fill in the derived totals of *s: each total is the sum of the matching
	 * data and zero counters. The other members are left untouched.
	 */
	void fixup(struct statistics *s) {
		s->total_blocks_written = s->data_blocks_written + s->zero_blocks_written;
		s->total_chunks_written = s->data_chunks_written + s->zero_chunks_written;
	}

	// Block size in bytes used by the packer; starts at the compile-time default.
	int block_size = DEF_BLOCK_SIZE;

	/*
	 * Report whether a block contains any non-zero byte.
	 *
	 * chunk: start of the block to scan.
	 * size:  number of bytes to scan; zero or a negative value scans nothing.
	 *
	 * Returns true as soon as a non-zero byte is found and false when all
	 * `size` bytes are zero. The bytes the pointer refers to are inspected,
	 * never the pointer value itself, and every byte up to the last one is
	 * covered.
	 */
	bool has_data(char* chunk, int size) {
		int i;

		for (i = 0; i < size; i++) {
			if (chunk[i] != 0) {
				return true;
			}
		}
		return false;
	}

	void write_zero_header(struct context ctx, struct statistics stats) {
	ctx->header.type = unused;
	ctx->header.size = ctx->zero_read;
	fprintf(stderr, "Writing zero chunk of %d blocks\n", ctx->zero_read);
	fwrite(&ctx->header, sizeof(ctx->header), 1, stdout);

	stats->zero_chunks_written++;
	stats->zero_blocks_written += ctx->zero_read;

	ctx->zero_read = 0;
	}

	void write_data_header(struct context ctx, char full_buf, struct statistics *stats) {
	ctx->header.type = used;
	ctx->header.size = ctx->buf_read;
	fprintf(stderr, "Writing data chunk of %d blocks\n", ctx->buf_read);
	fwrite(&ctx->header, sizeof(ctx->header), 1, stdout);
	fwrite(full_buf, ctx->block_size, ctx->buf_read, stdout);

	stats->data_chunks_written++;
	stats->data_blocks_written += ctx->buf_read;

	ctx->buf_read = 0;
	ctx->cur_buf = full_buf;
	}

	// Global packer counters. Only four initializers are listed; the remaining
	// members are zero-initialized as well.
	struct statistics stats = {0, 0, 0, 0};

	int main(char args[]) {
	char buf = malloc(block_size MAX_BLOCK_COUNT);
	struct context ctx = {0};
	int latest;

	ctx.cur_buf = buf;
	ctx.block_size = block_size;

	// so what are our cases?
	// 1. data has been read and we read a zero block.
	// 2. data has been read and we read a data block.
	// 3. data has not been read and we read a zero block.
	// 4. data has not been read and we read a data block.

	// this code will fail to read a partial block at the end.

	fwrite(&our_header, sizeof(struct spaf_h), 1, stdout);

	while (latest = fread(ctx.cur_buf, block_size, 1, stdin) == 1) {
	// 1. scan the chunk for data
	if (has_data(ctx.cur_buf, block_size)) {
	// case 1: we read data but we had zeroes before.

	if (ctx.zero_read > 0) {
	// we write out a zero header:
	write_zero_header(&ctx, &stats);
	ctx.buf_read = 1;
	ctx.cur_buf += block_size;
	} else {
	// case 2: we may or may not have read data before.
	ctx.buf_read++;
	if (ctx.buf_read == MAX_BLOCK_COUNT) {
	// write out chunk
	write_data_header(&ctx, buf, &stats);
	} else {
	ctx.cur_buf += block_size;
	}
	}
	} else {
	// case 4: we read zero but we have existing data
	if (ctx.buf_read > 0) {
	write_data_header(&ctx, buf, &stats);
	ctx.zero_read = 1;
	} else {
	// case 3: we may or may not have read zero before:
	ctx.zero_read++;
	// check whether we are exceeding the maximum chunk size.
	// for 4k blocks this is 64k * 4k = 256MB.
	// that's not a lot...
	if (ctx.zero_read == (1 << (sizeof(unsigned short) * 8)) - 1) {
	write_zero_header(&ctx, &stats);
	}
	}
	}
	}
	if (ctx.zero_read > 0) {
	write_zero_header(&ctx, &stats);
	} else {
	write_data_header(&ctx, buf, &stats);
	}

	fixup(&stats);

	fprintf(stderr, "Bytes read %" PRIu64 ", data chunks written %d totalling %" PRIu64 " blocks and %" PRIu64 " bytes. Zero chunks written %d totalling %" PRIu64 " blocks and %" PRIu64 " bytes. Total blocks written %" PRIu64 " and total chunks %d.\n", stats.total_blocks_written * block_size, stats.data_chunks_written, stats.data_blocks_written, stats.data_blocks_written * block_size, stats.zero_chunks_written, stats.zero_blocks_written, stats.zero_blocks_written * block_size, stats.total_blocks_written, stats.total_chunks_written);
	}
	// Signature and format version stored at the start of the stream header.
	#define MAGIC "SPAF"
	#define VERSION "10"

	// Boolean constants used together with the `bool` typedef declared below.
	#define false 0
	#define true 1

	// MAX_BLOCK_COUNT: the packer writes a data chunk once this many blocks are buffered.
	// DEF_BLOCK_SIZE:  default block size in bytes.
	#define MAX_BLOCK_COUNT 4096
	#define DEF_BLOCK_SIZE 4096

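	/**
	* Selects 64-bit file offsets: off_t becomes 64 bits wide and ftruncate takes
	* a 64-bit length. It only takes effect when it is defined before the first
	* system header is included.
	*/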

	#define _FILE_OFFSET_BITS 64

	#include <stdint.h>

	/*
	* Although I don't really see why I shouldn't use 64-bit versions directly.
	* fprintf requires the use of PRIu64 or PRIi64 macros to select the proper long type to get at 64 bits.
	*/

	/* Checksum algorithm identifiers stored in spaf_h.checksum_algo. */
	enum checksum_algos {
	algo_crc32 = 0,
	algo_md5sum
	};

	// One-byte boolean and raw-byte types used for the header bit-fields.
	typedef unsigned char bool;
	typedef unsigned char byte;

	/*
	 * Stream header, written once at the start of a SPAF stream.
	 * The struct is packed; the numbers in the trailing comments of the last
	 * four members are the running byte offset after that member.
	 */
	struct spaf_h { // 16 bytes
	char magic[4]; // "SPAF"
	char version[2]; // "10"
	bool hammington_used:8; // Hamming-code flag; unused. It could be used to get a kind of ECC correction
	// without using ECC memory, but would probably be rather slow.
	byte hammington_block_size:8; // Would have to be 247 with 8 parity bits and one unused extra parity;
	// this is the data part. The full code is (255, 247) and matrices could
	// be downloaded at [1].
	// The output of the matrix is the 8 parity bit position values that can
	// indicate an error. To be error free, the computation of this vector
	// needs to be the zero vector.

	bool checksum_used:8; // 9
	unsigned short checksum_bits:16; // 11
	enum checksum_algos checksum_algo:8; // 12
	uint32_t block_size:32; // 16 (block size in bytes)
	} __attribute__((packed));

	// [1] http://www.uni-kl.de/en/channel-codes/channel-codes-database/bch-and-hamming/

	/* Chunk kind stored in a chunk header. */
	enum block_type {
	// data chunk: the header is followed by `size` blocks of data
	used = 0,
	// zero chunk: stands for `size` all-zero blocks, no data follows
	unused
	};

	/* Packed 4-byte header that precedes every chunk in the stream. */
	struct chunk_h {

	// used (data) or unused (zero run)
	enum block_type type:8;
	// number of blocks the chunk covers
	unsigned short size:16;
	// pads the header to 4 bytes
	byte padding:8;

	} __attribute__((packed));

	/*
	 * Packed chunk header variant that carries a 32-bit checksum instead of
	 * the padding byte. Nothing in the code shown here reads or writes it.
	 */
	struct chunk_h_checksum {
	enum block_type type:8;
	unsigned short size:16;
	unsigned long checksum:32;
	} __attribute__((packed));



	struct spaf_h our_header = {
	MAGIC,
	VERSION,
	false,
	247,
	false,
	32,
	algo_crc32,
	4096
	};