Skip to content

Instantly share code, notes, and snippets.

@jmarshall
Last active February 10, 2024 18:36
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jmarshall/11fac803b1be9e1ce889b858879ebe2d to your computer and use it in GitHub Desktop.
Save jmarshall/11fac803b1be9e1ce889b858879ebe2d to your computer and use it in GitHub Desktop.
Display BGZF compression statistics
/* bgzfsize.c -- Display uncompressed sizes of BGZF files
Copyright (C) 2009 John Marshall
*/
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <inttypes.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include "bam_endian.h"
const char usage[] =
"Usage: bgzfsize [-v] FILE...\n"
"\n"
"Displays the uncompressed size of each specified BGZF (e.g., BAM) file,\n"
"by totalling the counts in the trailer of each BGZF block in the files.\n"
"If more than one file is specified, each filename is also displayed\n"
"alongside its corresponding file size.\n"
"\n"
"When -v is given, displays the compressed size, uncompressed size, and\n"
"compression ratio for each specified BGZF file and also for each BGZF block\n"
"within each file.\n";
#if !defined(O_BINARY) && defined(_O_BINARY)
#define O_BINARY _O_BINARY
#elif !defined(O_BINARY)
#define O_BINARY 0
#endif
static int big_endian;
static int verbose;
/* These conversion functions assume that they are given appropriately aligned
pointers. The pointers supplied in the invocations below are naturally
aligned, which is hopefully sufficient on architectures of interest. */
uint16_t to_uint16(void *pv) {
uint16_t k = *(uint16_t *)pv;
return (big_endian)? bam_swap_endian_2(k) : k;
}
uint32_t to_uint32(void *pv) {
uint32_t k = *(uint32_t *)pv;
return (big_endian)? bam_swap_endian_4(k) : k;
}
void print_data(intmax_t compressed, intmax_t uncompressed, int count,
const char* fname) {
if (verbose) {
double ratio = (uncompressed == 0)? 0 :
(uncompressed - compressed) / (double) uncompressed;
printf("%"PRIdMAX"\t%"PRIdMAX"\t%4.1f%%", compressed, uncompressed,
100.0 * ratio);
}
else
printf("%"PRIdMAX, uncompressed);
if (fname) printf("\t%s", fname);
if (verbose && count >= 0) printf(" (%d blocks)", count);
printf("\n");
}
int fail(int use_perror, const char *format, ...) {
const char *message = strerror(errno);
va_list args;
fprintf(stderr, "bgzfsize: ");
va_start(args, format);
vfprintf(stderr, format, args);
va_end(args);
if (use_perror)
fprintf(stderr, ": %s\n", message);
else
fprintf(stderr, "\n");
return 0;
}
#define HEADER_LEN 18
#define FOOTER_LEN 8
int count_blocks(int fd, const char *fname, int show_fname) {
char buffer[FOOTER_LEN + HEADER_LEN];
char* const header = &buffer[FOOTER_LEN];
size_t n = FOOTER_LEN;
intmax_t btotal = 0, itotal = 0;
ssize_t nread;
uint32_t bsize, isize;
int i, blockcount = 0;
for (i = 0; i < sizeof buffer; i++) buffer[i] = '\0';
while (1) {
if (n < FOOTER_LEN + HEADER_LEN) {
ssize_t nread = read(fd, &buffer[n], sizeof buffer - n);
if (nread < 0) return fail(1, "reading \"%s\" failed", fname);
else if (nread == 0) break;
else n += nread;
}
if (! (n >= sizeof buffer &&
header[0] == '\x1f' && header[1] == '\x8b' &&
header[12] == '\x42' && header[13] == '\x43'))
return fail(0, "desynchronised at offset %"PRIuMAX" in \"%s\"",
(uintmax_t) lseek(fd, 0, SEEK_CUR), fname);
bsize = to_uint16(&header[16]) + 1;
if (lseek(fd, bsize - sizeof buffer, SEEK_CUR) < 0)
return fail(1, "seeking forward in \"%s\" failed", fname);
nread = read(fd, buffer, sizeof buffer);
if (nread < 0)
return fail(1, "reading \"%s\" failed", fname);
else if (nread < FOOTER_LEN)
return fail(0, "footer truncated in \"%s\"", fname);
else
n = nread;
isize = to_uint32(&buffer[4]);
itotal += isize;
/* Don't display the block header/footer overhead. */
/* bsize -= FOOTER_LEN + HEADER_LEN; ...or maybe do */
if (verbose) print_data(bsize, isize, -1, NULL);
btotal += bsize;
blockcount++;
}
print_data(btotal, itotal, blockcount, show_fname? fname : NULL);
return 1;
}
int main(int argc, char** argv) {
int i, status;
if (argc < 2) {
fputs(usage, stderr);
return EXIT_FAILURE;
}
else if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0) {
fputs(usage, stdout);
return EXIT_SUCCESS;
}
big_endian = bam_is_big_endian();
verbose = 0;
status = EXIT_SUCCESS;
for (i = 1; i < argc; i++) {
if (strcmp(argv[i], "-v") == 0)
verbose = 1;
else {
int fd = open(argv[i], O_RDONLY | O_BINARY);
if (fd >= 0) {
if (! count_blocks(fd, argv[i], (argc > 2)))
status = EXIT_FAILURE;
close(fd);
}
else {
fail(1, "can't open \"%s\"", argv[i]);
status = EXIT_FAILURE;
}
}
}
return status;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment