Skip to content

Instantly share code, notes, and snippets.

@lh3
Last active March 6, 2016 04:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lh3/abe2e1d86867a0c08ad7 to your computer and use it in GitHub Desktop.
Save lh3/abe2e1d86867a0c08ad7 to your computer and use it in GitHub Desktop.
Recoding-free fast BAM slicing for BAMs generated by samtools-0.1.8+

This gist demonstrates fast BAM slicing without re-compressing the alignment data. It only works for BAMs generated by samtools-0.1.8+ and htslib, as it requires the BAM to be block aligned.

To use this gist, you need to first acquire and compile htslib. After that:

cd htslib; gcc -g -O2 -Wall -o fsi -I. /path/to/fsi.c libhts.a -lz
./fsi aln.bam > aln.fsi
perl fsi-get.pl aln.bam aln.fsi chr11:1000000-1200000 > slice.bam

If your BAM is not compatible with this gist (which will lead to a segfault), you need to recode the BAM with

samtools view -b ori.bam > recode.bam

FSI index format:

N  n_contigs  n_blocks
L  contig_length
I  file_offset  accumulative_align_start_position  genomic_span  n_alignments  n_bytes
#!/usr/bin/env perl
use strict;
use warnings;
die("Usage: fsi-get.pl <aln.bam> <idx.fsi> <region>\n") if @ARGV < 3;
my (@acc_len, @idx);
push(@acc_len, 0);
# read the FSI index
open(FH, $ARGV[1]) || die;
while (<FH>) {
if (/^L\t(\d+)/) {
push(@acc_len, $acc_len[$#acc_len] + $1);
} elsif (/^I\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)/) {
push(@idx, [$1, $2, $2 + $3, $4, $5]);
}
}
close(FH);
# Parse range (copied from James' cram_slicer)
$ARGV[2] =~ /(\d+)(?::(\d+)(?:-(\d+))?)?/;
my $ctg = $1;
my $off = $acc_len[$ctg] || die;
my $start = $off + (defined($2)? $2 : 1);
my $end = defined($3)? $off + $3 : defined($2)? $off + $2 : 2**31-1;
# the following is inefficient; using a BAM-like linear index or two binary searches would be better
open(FH, $ARGV[0]) || die;
read(FH, $_, $idx[0][0]); print; # the header
my ($min, $max) = ($acc_len[$#acc_len], 0);
for (my $i = 0; $i < @idx - 1; ++$i) {
if ($start < $idx[$i][2] && $idx[$i][1] < $end) {
$min = $min < $idx[$i][1]? $min : $idx[$i][1];
$max = $max > $idx[$i][2]? $max : $idx[$i][2];
seek(FH, $idx[$i][0], 0);
read(FH, $_, $idx[$i+1][0] - $idx[$i][0]); print;
}
}
close(FH);
warn("[", $min - $off, ",", $max - $off, ")", "\n");
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include "htslib/bgzf.h"
#include "htslib/sam.h"
typedef struct {
uint64_t off, a_st, a_en;
uint32_t cnt, n_bytes;
} fsi1_t;
typedef struct {
uint32_t size, n_ctg;
uint64_t *acc_len;
uint32_t n, m;
fsi1_t *a;
} fsi_t;
static fsi_t *read_header(BGZF *fp)
{
fsi_t *fsi;
int i;
bam_hdr_t *h;
if ((h = bam_hdr_read(fp)) == 0) return 0; // fail to read the header
if (bgzf_tell(fp) & 0xffff) return 0; // the header is not in separate block(s)
fsi = (fsi_t*)calloc(1, sizeof(fsi_t));
fsi->n_ctg = h->n_targets;
fsi->acc_len = (uint64_t*)calloc(fsi->n_ctg + 1, 8);
for (i = 0; i < fsi->n_ctg; ++i)
fsi->acc_len[i+1] = fsi->acc_len[i] + h->target_len[i];
bam_hdr_destroy(h);
return fsi;
}
#define kv_push(type, v, x) do { \
if ((v).n == (v).m) { \
(v).m = (v).m? (v).m<<1 : 2; \
(v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
} \
(v).a[(v).n++] = (x); \
} while (0)
fsi_t *fsi_build(BGZF *fp, int size)
{
fsi_t *fsi;
bam1_t *b;
uint64_t off;
fsi1_t t;
int ret;
if ((fsi = read_header(fp)) == 0) return 0;
fsi->size = size;
memset(&t, 0, sizeof(fsi1_t));
off = bgzf_tell(fp);
b = bam_init1();
while ((ret = bam_read1(fp, b)) >= 0) {
fsi1_t s;
s.a_st = b->core.tid < 0? fsi->acc_len[fsi->n_ctg] : fsi->acc_len[b->core.tid] + b->core.pos;
s.a_en = b->core.tid < 0? s.a_st : s.a_st + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
s.off = off>>16, s.n_bytes = ret, s.cnt = 1;
if (t.n_bytes == 0 || (t.n_bytes > size && (off&0xffff) == 0)) {
if (t.n_bytes > size) kv_push(fsi1_t, *fsi, t);
t = s;
} else {
t.a_en = t.a_en > s.a_en? t.a_en : s.a_en;
++t.cnt;
t.n_bytes += s.n_bytes;
}
off = bgzf_tell(fp);
}
kv_push(fsi1_t, *fsi, t);
t.off = off>>16, t.a_st = t.a_en = fsi->acc_len[fsi->n_ctg], t.n_bytes = t.cnt = 0;
kv_push(fsi1_t, *fsi, t);
bam_destroy1(b);
return fsi;
}
void fsi_print(const fsi_t *fsi)
{
int i;
printf("N\t%d\t%d\n", fsi->n_ctg, fsi->n);
for (i = 0; i < fsi->n_ctg; ++i)
printf("L\t%lld\n", (long long)(fsi->acc_len[i+1] - fsi->acc_len[i]));
for (i = 0; i < fsi->n; ++i) {
fsi1_t *p = &fsi->a[i];
printf("I\t%lld\t%lld\t%d\t%u\t%u\n", (long long)p->off, (long long)p->a_st, (int)(p->a_en - p->a_st), p->cnt, p->n_bytes);
}
}
void fsi_destroy(fsi_t *fsi)
{
free(fsi->a); free(fsi->acc_len); free(fsi);
}
#include <unistd.h>
int main(int argc, char *argv[])
{
int c, size = 1000000;
fsi_t *fsi;
BGZF *fp;
while ((c = getopt(argc, argv, "s:")) >= 0)
if (c == 's') size = atoi(optarg);
if (argc - optind < 1) {
fprintf(stderr, "Usage: fsi [-s %d] <in.bam>\n", size);
return 1;
}
fp = bgzf_open(argv[optind], "r");
fsi = fsi_build(fp, size);
fsi_print(fsi);
fsi_destroy(fsi);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment