Skip to content

Instantly share code, notes, and snippets.

@lindenb
Created February 15, 2011 15:59
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lindenb/827697 to your computer and use it in GitHub Desktop.
Save lindenb/827697 to your computer and use it in GitHub Desktop.
Transforms a BAM file to WIG
/*
Author:
Pierre Lindenbaum PhD
WWW:
http://plindenbaum.blogspot.com
contact:
plindenbaum@yahoo.fr
Motivation:
creates a WIG file from a BAM file
Compilation:
gcc -m64 -I path/to/samtools-0.1.7a -L path/to/samtools-0.1.7a bam2wig.c -lbam -lz
See also:
API: http://samtools.sourceforge.net/samtools/sam/
*/
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <limits.h>
#include "bam.h"
#include "sam.h"
typedef short depth_t;
#define LOG(a) a
#define WHERE fprintf(stderr,"%s:%s,%d\n",__FILE__,__FUNCTION__,__LINE__)
static const depth_t INIT_IGNORE_THIS_BASE=-1;
static const depth_t INIT_DEFAULT=0;
typedef struct Reference_t
{
depth_t init;
depth_t* data;
int length;
int buffer_size;
}Reference,*ReferencePtr;
static ReferencePtr referenceNew(depth_t init)
{
ReferencePtr ptr= calloc(1,sizeof(Reference));
if(ptr==NULL)
{
fprintf(stderr,"Out of memory\n");
exit(EXIT_FAILURE);
}
ptr->init=init;
ptr->buffer_size=100000000;
ptr->data=malloc(ptr->buffer_size*sizeof(depth_t));
if(ptr->data==NULL)
{
fprintf(stderr,"Out of memory\n");
exit(EXIT_FAILURE);
}
memset(ptr->data,0,(ptr->buffer_size*sizeof(depth_t)));
ptr->length=0;
return ptr;
}
void referenceFree(ReferencePtr ptr)
{
free(ptr->data);
free(ptr);
}
void referenceEnsureIndex(ReferencePtr ptr,int pos)
{
if(pos>= ptr->buffer_size)
{
int new_buffer_size=pos+1000000;
LOG(fprintf(stderr,"[LOG]Realloc to %d\n",new_buffer_size));
ptr->data=realloc(ptr->data,new_buffer_size*sizeof(depth_t));
if(ptr->data==NULL)
{
fprintf(stderr,"Out of memory\n");
exit(EXIT_FAILURE);
}
ptr->buffer_size=new_buffer_size;
}
while( ptr->length <= pos )
{
ptr->data[ptr->length]=ptr->init;
ptr->length++;
}
}
void referenceReadWasSeen(ReferencePtr ptr,int pos)
{
referenceEnsureIndex(ptr,pos);
if( ptr->data[pos]==INIT_IGNORE_THIS_BASE) return;
if( ptr->data[pos]==SHRT_MAX ) return;
ptr->data[pos]++;
}
void referenceDump(const ReferencePtr ptr,const char* name,FILE* out)
{
int pos=0;
while(pos< ptr->length)
{
//skip empty
while(pos< ptr->length && ptr->data[pos]==ptr->init)
{
++pos;
}
if(pos>= ptr->length) break;
fprintf(out,"fixedStep chrom=%s start=%d step=1 span=1\n",name,pos);
while(pos< ptr->length && ptr->data[pos]!=ptr->init)
{
fprintf(out,"%d\n",ptr->data[pos]);
++pos;
}
}
}
static char* readline(FILE* in,int *len)
{
int c;
for(;;)
{
char* buff=NULL;
int buffer_size=BUFSIZ;
*len=0;
if(feof(in)) return NULL;
buff=malloc(buffer_size);
if(buff==NULL)
{
fprintf(stderr,"Out of memory\n");
exit(EXIT_FAILURE);
}
buff[0]=0;
while((c=fgetc(in))!=EOF)
{
if(c=='\n') break;
if(*len+2>=buffer_size)
{
buff=realloc(buff,buffer_size+BUFSIZ);
if(buff==NULL)
{
fprintf(stderr,"Out of memory\n");
exit(EXIT_FAILURE);
}
buffer_size+=BUFSIZ;
}
buff[*len]=c;
(*len)++;
buff[*len]=0;
}
if(buff[0]!=0)
{
return buff;
}
free(buff);
buff=NULL;
}
}
int main(int argc, char *args[])
{
samfile_t *fp_in = NULL;
bam1_t *b=NULL;
int optind=1;
char* knownGeneFile=NULL;
int reference_count=0;
int reference_index=0;
while(optind<argc)
{
if(strcmp(args[optind],"-h")==0)
{
printf("%s . Pierre Lindenbaum PhD. Compilation %s",args[0],__DATE__);
printf(" -k knownGene.txt (optional)");
return 0;
}
else if(strcmp(args[optind],"-k")==0 && optind+1< argc)
{
knownGeneFile=args[++optind];
}
else if(args[optind][0]=='-' && args[optind][1]=='-')
{
optind++;
break;
}
else if(args[optind][0]=='-')
{
fprintf(stderr,"Unnown option: %s\n",args[optind]);
return EXIT_FAILURE;
}
else
{
break;
}
++optind;
}
if(optind+1!=argc)
{
fprintf(stderr,"Illegal number of arguments\n");
return EXIT_FAILURE;
}
fp_in = samopen(args[optind], "rb", 0);
if(NULL == fp_in)
{
fprintf(stderr,"Could not open file %s\n",args[optind]);
return EXIT_FAILURE;
}
reference_count=fp_in->header-> n_targets;
samclose(fp_in);
fprintf(stdout,"track type=wiggle_0 name=__SED_THIS__ viewLimits=0:100\n");
for( reference_index=0;
reference_index < reference_count;
++reference_index
)
{
ReferencePtr chromosome;
fp_in = samopen(args[optind], "rb", 0);
chromosome=referenceNew(knownGeneFile==NULL?INIT_DEFAULT:INIT_IGNORE_THIS_BASE);
if(NULL == fp_in)
{
fprintf(stderr,"Could not open file %s\n",args[optind]);
return EXIT_FAILURE;
}
LOG(fprintf(stderr,"[LOG]Reading Chromosome:%s\n",fp_in->header->target_name[reference_index]));
if(knownGeneFile!=NULL)
{
int n=0;
char* line=NULL;
FILE* kgFile=fopen(knownGeneFile,"r");
if(kgFile==NULL)
{
fprintf(stderr,"Cannot open %s\n",knownGeneFile);
return EXIT_FAILURE;
}
while((line=readline(kgFile,&n))!=NULL)
{
int i;
int tIdx=1;
char* tokens[12];
if(line[0]==0 || strncmp(line,"name\t",5)==0)
{
free(line);
continue;
}
tokens[0]=line;
for(i=0;i< n;++i)
{
if(line[i]=='\t' && tIdx< 12)
{
tokens[tIdx]=&line[i+1];
line[i]=0;
++tIdx;
}
}
if(tIdx!=12)
{
fprintf(stderr,"BOUM \"%s\" %d\n",line,tIdx);
exit(EXIT_FAILURE);
}
if(strcmp(tokens[1],fp_in->header->target_name[reference_index])==0)
{
int cdsStart=atoi(tokens[5]);
int cdsEnd=atoi(tokens[6]);
char *start=tokens[8];
char *end=tokens[9];
while(*start!=0 && *end!=0)
{
char *next_start=NULL;
char *next_end=NULL;
int x1=(int)strtol(start,&next_start,10);
int x2=(int)strtol(end,&next_end,10);
while(x1<x2)
{
if(x1>=cdsStart && x1<cdsEnd)
{
referenceEnsureIndex(chromosome,x1+1);
chromosome->data[x1+1]=0;
}
++x1;
}
if(*next_start==',') ++next_start;
if(*next_end==',') ++next_end;
if(*next_start==0 || *next_end==0) break;
start=next_start;
end=next_end;
}
}
free(line);
}
}
b = bam_init1();
int pos=0;
while(samread(fp_in, b) > 0)
{
if(b->core.tid != reference_index)
{
bam_destroy1(b);
b = bam_init1();
continue;
}
pos = b->core.pos;
uint32_t* cigar= bam1_cigar(b);
int k;
for( k=0;k< b->core.n_cigar;++k)
{
int cop =cigar[k] & BAM_CIGAR_MASK; // operation
int cl = cigar[k] >> BAM_CIGAR_SHIFT; // length
switch(cop)
{
case BAM_CMATCH:
{
while(cl>0)
{
referenceReadWasSeen(chromosome,pos);
pos++;
cl--;
}
break;
}
case BAM_CHARD_CLIP:break; /* ignore */
case BAM_CSOFT_CLIP:break; /* ignore */
case BAM_CPAD: //...
case BAM_CREF_SKIP://....
case BAM_CDEL:
{
/* Spans positions, No Coverage */
pos+=cl;
break;
}
case BAM_CINS: /* cursor not moved on reference */ ;break;
default:printf("?");assert(0);break;
}
}
bam_destroy1(b);
b = bam_init1();
}
bam_destroy1(b);
referenceDump(chromosome,fp_in->header->target_name[reference_index],stdout);
referenceFree(chromosome);
samclose(fp_in);
}
return 0;
}
PROXY=--proxy host:port
bwa.dir=${HOME}/package/bwa/bwa-0.5.7
bwa.bin=${bwa.dir}/bwa
sam.dir=${HOME}/package/sam/samtools-0.1.7a
sam.bin=${sam.dir}/samtools
CFLAGS= -Wall -O3 #-m64
LIMIT=1000004
data.wig:bam2wig knownGenes.txt sorted1.bam
./bam2wig -k knownGenes.txt sorted1.bam | sed 's/__SED_THIS__/chr22Genes/' > $@
bam2wig: ${HOME}/bam2wig.c
gcc ${CFLAGS} -o $@ -I ${sam.dir} -L ${sam.dir} $< -lbam -lz
sorted1.bam:aln.bam
${sam.bin} sort aln.bam sorted1
${sam.bin} index sorted1.bam
aln.bam:aln.sam
${sam.bin} view -b -T chr22.fa aln.sam > $@
aln.sam:aln1.sai aln2.sai reads_1.fastq reads_2.fastq
${bwa.bin} sampe chr22db aln1.sai aln2.sai reads_1.fastq reads_2.fastq > $@
aln2.sai:chr22db.bwt reads_2.fastq
${bwa.bin} aln chr22db reads_2.fastq > $@
aln1.sai:chr22db.bwt reads_1.fastq
${bwa.bin} aln chr22db reads_1.fastq > $@
reads_1.fastq reads_2.fastq:
${sam.dir}/misc/wgsim chr22.fa reads_1.fastq reads_2.fastq > _rand.txt
chr22db.bwt:chr22.fa
${bwa.bin} index -p chr22db -a bwtsw chr22.fa
chr22.fa.fai:chr22.fa
${sam.bin} faidx chr22.fa
knownGenes.txt:
curl ${PROXY} "http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/knownGene.txt.gz" |\
gunzip -c | grep chr22 > $@
chr22.fa:
curl ${PROXY} "http://hgdownload.cse.ucsc.edu/goldenPath/hg18/chromosomes/chr22.fa.gz" |\
gunzip -c > $@
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment