Created
February 15, 2011 15:59
-
-
Save lindenb/827697 to your computer and use it in GitHub Desktop.
Transforms a BAM file to WIG
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Author: | |
Pierre Lindenbaum PhD | |
WWW: | |
http://plindenbaum.blogspot.com | |
contact: | |
plindenbaum@yahoo.fr | |
Motivation: | |
creates a WIG file from a BAM file | |
Compilation: | |
gcc -m64 -I path/to/samtools-0.1.7a -L path/to/samtools-0.1.7a bam2wig.c -lbam -lz | |
See also: | |
API: http://samtools.sourceforge.net/samtools/sam/ | |
*/ | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <assert.h> | |
#include <limits.h> | |
#include "bam.h" | |
#include "sam.h" | |
typedef short depth_t; | |
#define LOG(a) a | |
#define WHERE fprintf(stderr,"%s:%s,%d\n",__FILE__,__FUNCTION__,__LINE__) | |
static const depth_t INIT_IGNORE_THIS_BASE=-1; | |
static const depth_t INIT_DEFAULT=0; | |
typedef struct Reference_t | |
{ | |
depth_t init; | |
depth_t* data; | |
int length; | |
int buffer_size; | |
}Reference,*ReferencePtr; | |
static ReferencePtr referenceNew(depth_t init) | |
{ | |
ReferencePtr ptr= calloc(1,sizeof(Reference)); | |
if(ptr==NULL) | |
{ | |
fprintf(stderr,"Out of memory\n"); | |
exit(EXIT_FAILURE); | |
} | |
ptr->init=init; | |
ptr->buffer_size=100000000; | |
ptr->data=malloc(ptr->buffer_size*sizeof(depth_t)); | |
if(ptr->data==NULL) | |
{ | |
fprintf(stderr,"Out of memory\n"); | |
exit(EXIT_FAILURE); | |
} | |
memset(ptr->data,0,(ptr->buffer_size*sizeof(depth_t))); | |
ptr->length=0; | |
return ptr; | |
} | |
void referenceFree(ReferencePtr ptr) | |
{ | |
free(ptr->data); | |
free(ptr); | |
} | |
void referenceEnsureIndex(ReferencePtr ptr,int pos) | |
{ | |
if(pos>= ptr->buffer_size) | |
{ | |
int new_buffer_size=pos+1000000; | |
LOG(fprintf(stderr,"[LOG]Realloc to %d\n",new_buffer_size)); | |
ptr->data=realloc(ptr->data,new_buffer_size*sizeof(depth_t)); | |
if(ptr->data==NULL) | |
{ | |
fprintf(stderr,"Out of memory\n"); | |
exit(EXIT_FAILURE); | |
} | |
ptr->buffer_size=new_buffer_size; | |
} | |
while( ptr->length <= pos ) | |
{ | |
ptr->data[ptr->length]=ptr->init; | |
ptr->length++; | |
} | |
} | |
void referenceReadWasSeen(ReferencePtr ptr,int pos) | |
{ | |
referenceEnsureIndex(ptr,pos); | |
if( ptr->data[pos]==INIT_IGNORE_THIS_BASE) return; | |
if( ptr->data[pos]==SHRT_MAX ) return; | |
ptr->data[pos]++; | |
} | |
void referenceDump(const ReferencePtr ptr,const char* name,FILE* out) | |
{ | |
int pos=0; | |
while(pos< ptr->length) | |
{ | |
//skip empty | |
while(pos< ptr->length && ptr->data[pos]==ptr->init) | |
{ | |
++pos; | |
} | |
if(pos>= ptr->length) break; | |
fprintf(out,"fixedStep chrom=%s start=%d step=1 span=1\n",name,pos); | |
while(pos< ptr->length && ptr->data[pos]!=ptr->init) | |
{ | |
fprintf(out,"%d\n",ptr->data[pos]); | |
++pos; | |
} | |
} | |
} | |
static char* readline(FILE* in,int *len) | |
{ | |
int c; | |
for(;;) | |
{ | |
char* buff=NULL; | |
int buffer_size=BUFSIZ; | |
*len=0; | |
if(feof(in)) return NULL; | |
buff=malloc(buffer_size); | |
if(buff==NULL) | |
{ | |
fprintf(stderr,"Out of memory\n"); | |
exit(EXIT_FAILURE); | |
} | |
buff[0]=0; | |
while((c=fgetc(in))!=EOF) | |
{ | |
if(c=='\n') break; | |
if(*len+2>=buffer_size) | |
{ | |
buff=realloc(buff,buffer_size+BUFSIZ); | |
if(buff==NULL) | |
{ | |
fprintf(stderr,"Out of memory\n"); | |
exit(EXIT_FAILURE); | |
} | |
buffer_size+=BUFSIZ; | |
} | |
buff[*len]=c; | |
(*len)++; | |
buff[*len]=0; | |
} | |
if(buff[0]!=0) | |
{ | |
return buff; | |
} | |
free(buff); | |
buff=NULL; | |
} | |
} | |
int main(int argc, char *args[]) | |
{ | |
samfile_t *fp_in = NULL; | |
bam1_t *b=NULL; | |
int optind=1; | |
char* knownGeneFile=NULL; | |
int reference_count=0; | |
int reference_index=0; | |
while(optind<argc) | |
{ | |
if(strcmp(args[optind],"-h")==0) | |
{ | |
printf("%s . Pierre Lindenbaum PhD. Compilation %s",args[0],__DATE__); | |
printf(" -k knownGene.txt (optional)"); | |
return 0; | |
} | |
else if(strcmp(args[optind],"-k")==0 && optind+1< argc) | |
{ | |
knownGeneFile=args[++optind]; | |
} | |
else if(args[optind][0]=='-' && args[optind][1]=='-') | |
{ | |
optind++; | |
break; | |
} | |
else if(args[optind][0]=='-') | |
{ | |
fprintf(stderr,"Unnown option: %s\n",args[optind]); | |
return EXIT_FAILURE; | |
} | |
else | |
{ | |
break; | |
} | |
++optind; | |
} | |
if(optind+1!=argc) | |
{ | |
fprintf(stderr,"Illegal number of arguments\n"); | |
return EXIT_FAILURE; | |
} | |
fp_in = samopen(args[optind], "rb", 0); | |
if(NULL == fp_in) | |
{ | |
fprintf(stderr,"Could not open file %s\n",args[optind]); | |
return EXIT_FAILURE; | |
} | |
reference_count=fp_in->header-> n_targets; | |
samclose(fp_in); | |
fprintf(stdout,"track type=wiggle_0 name=__SED_THIS__ viewLimits=0:100\n"); | |
for( reference_index=0; | |
reference_index < reference_count; | |
++reference_index | |
) | |
{ | |
ReferencePtr chromosome; | |
fp_in = samopen(args[optind], "rb", 0); | |
chromosome=referenceNew(knownGeneFile==NULL?INIT_DEFAULT:INIT_IGNORE_THIS_BASE); | |
if(NULL == fp_in) | |
{ | |
fprintf(stderr,"Could not open file %s\n",args[optind]); | |
return EXIT_FAILURE; | |
} | |
LOG(fprintf(stderr,"[LOG]Reading Chromosome:%s\n",fp_in->header->target_name[reference_index])); | |
if(knownGeneFile!=NULL) | |
{ | |
int n=0; | |
char* line=NULL; | |
FILE* kgFile=fopen(knownGeneFile,"r"); | |
if(kgFile==NULL) | |
{ | |
fprintf(stderr,"Cannot open %s\n",knownGeneFile); | |
return EXIT_FAILURE; | |
} | |
while((line=readline(kgFile,&n))!=NULL) | |
{ | |
int i; | |
int tIdx=1; | |
char* tokens[12]; | |
if(line[0]==0 || strncmp(line,"name\t",5)==0) | |
{ | |
free(line); | |
continue; | |
} | |
tokens[0]=line; | |
for(i=0;i< n;++i) | |
{ | |
if(line[i]=='\t' && tIdx< 12) | |
{ | |
tokens[tIdx]=&line[i+1]; | |
line[i]=0; | |
++tIdx; | |
} | |
} | |
if(tIdx!=12) | |
{ | |
fprintf(stderr,"BOUM \"%s\" %d\n",line,tIdx); | |
exit(EXIT_FAILURE); | |
} | |
if(strcmp(tokens[1],fp_in->header->target_name[reference_index])==0) | |
{ | |
int cdsStart=atoi(tokens[5]); | |
int cdsEnd=atoi(tokens[6]); | |
char *start=tokens[8]; | |
char *end=tokens[9]; | |
while(*start!=0 && *end!=0) | |
{ | |
char *next_start=NULL; | |
char *next_end=NULL; | |
int x1=(int)strtol(start,&next_start,10); | |
int x2=(int)strtol(end,&next_end,10); | |
while(x1<x2) | |
{ | |
if(x1>=cdsStart && x1<cdsEnd) | |
{ | |
referenceEnsureIndex(chromosome,x1+1); | |
chromosome->data[x1+1]=0; | |
} | |
++x1; | |
} | |
if(*next_start==',') ++next_start; | |
if(*next_end==',') ++next_end; | |
if(*next_start==0 || *next_end==0) break; | |
start=next_start; | |
end=next_end; | |
} | |
} | |
free(line); | |
} | |
} | |
b = bam_init1(); | |
int pos=0; | |
while(samread(fp_in, b) > 0) | |
{ | |
if(b->core.tid != reference_index) | |
{ | |
bam_destroy1(b); | |
b = bam_init1(); | |
continue; | |
} | |
pos = b->core.pos; | |
uint32_t* cigar= bam1_cigar(b); | |
int k; | |
for( k=0;k< b->core.n_cigar;++k) | |
{ | |
int cop =cigar[k] & BAM_CIGAR_MASK; // operation | |
int cl = cigar[k] >> BAM_CIGAR_SHIFT; // length | |
switch(cop) | |
{ | |
case BAM_CMATCH: | |
{ | |
while(cl>0) | |
{ | |
referenceReadWasSeen(chromosome,pos); | |
pos++; | |
cl--; | |
} | |
break; | |
} | |
case BAM_CHARD_CLIP:break; /* ignore */ | |
case BAM_CSOFT_CLIP:break; /* ignore */ | |
case BAM_CPAD: //... | |
case BAM_CREF_SKIP://.... | |
case BAM_CDEL: | |
{ | |
/* Spans positions, No Coverage */ | |
pos+=cl; | |
break; | |
} | |
case BAM_CINS: /* cursor not moved on reference */ ;break; | |
default:printf("?");assert(0);break; | |
} | |
} | |
bam_destroy1(b); | |
b = bam_init1(); | |
} | |
bam_destroy1(b); | |
referenceDump(chromosome,fp_in->header->target_name[reference_index],stdout); | |
referenceFree(chromosome); | |
samclose(fp_in); | |
} | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
PROXY=--proxy host:port | |
bwa.dir=${HOME}/package/bwa/bwa-0.5.7 | |
bwa.bin=${bwa.dir}/bwa | |
sam.dir=${HOME}/package/sam/samtools-0.1.7a | |
sam.bin=${sam.dir}/samtools | |
CFLAGS= -Wall -O3 #-m64 | |
LIMIT=1000004 | |
data.wig:bam2wig knownGenes.txt sorted1.bam | |
./bam2wig -k knownGenes.txt sorted1.bam | sed 's/__SED_THIS__/chr22Genes/' > $@ | |
bam2wig: ${HOME}/bam2wig.c | |
gcc ${CFLAGS} -o $@ -I ${sam.dir} -L ${sam.dir} $< -lbam -lz | |
sorted1.bam:aln.bam | |
${sam.bin} sort aln.bam sorted1 | |
${sam.bin} index sorted1.bam | |
aln.bam:aln.sam | |
${sam.bin} view -b -T chr22.fa aln.sam > $@ | |
aln.sam:aln1.sai aln2.sai reads_1.fastq reads_2.fastq | |
${bwa.bin} sampe chr22db aln1.sai aln2.sai reads_1.fastq reads_2.fastq > $@ | |
aln2.sai:chr22db.bwt reads_2.fastq | |
${bwa.bin} aln chr22db reads_2.fastq > $@ | |
aln1.sai:chr22db.bwt reads_1.fastq | |
${bwa.bin} aln chr22db reads_1.fastq > $@ | |
reads_1.fastq reads_2.fastq: | |
${sam.dir}/misc/wgsim chr22.fa reads_1.fastq reads_2.fastq > _rand.txt | |
chr22db.bwt:chr22.fa | |
${bwa.bin} index -p chr22db -a bwtsw chr22.fa | |
chr22.fa.fai:chr22.fa | |
${sam.bin} faidx chr22.fa | |
knownGenes.txt: | |
curl ${PROXY} "http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/knownGene.txt.gz" |\ | |
gunzip -c | grep chr22 > $@ | |
chr22.fa: | |
curl ${PROXY} "http://hgdownload.cse.ucsc.edu/goldenPath/hg18/chromosomes/chr22.fa.gz" |\ | |
gunzip -c > $@ | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment