Skip to content

Instantly share code, notes, and snippets.

@PoisonAlien
Last active October 15, 2021 12:56
Show Gist options
  • Save PoisonAlien/4a5fcc2126a731417671c4738bcd94cc to your computer and use it in GitHub Desktop.
Save PoisonAlien/4a5fcc2126a731417671c4738bcd94cc to your computer and use it in GitHub Desktop.
Tool to extract nucleotide counts at user specific loci
//A minimal program to extract nucelotide counts of selected genomic loci from the BAM file
//gcc -g -O3 -pthread ntcounts.c -lhts -Ihts -o ntcounts
//MIT License
//Copyright (c) 2021 Anand Mayakonda <anandmt3@gmail.com>
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <htslib/sam.h>
#include <htslib/faidx.h>
#include <argp.h>
#define PBSTR "||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||"
#define PBWIDTH 60
void is_bam(char *fname);
char *basename(char const *path);
char *removeExt(char* myStr);
int countlines(char *filename);
void printProgress(double percentage, int done, int tot);
void usage();
char *VERSION = "0.1.0";
int main (int argc, char **argv){
char *fafile = NULL;
char *op = NULL;
int c;
char *bedfile = NULL;
char *bam = NULL;
uint32_t q = 10; //BAM default MAPQ
uint16_t F = 3852; //BAM default FLAG
int vars_gt = 1; //No. of BED entries
int nloci = 0;
opterr = 0;
hts_verbose = 0; //suppresses htslib warnings
//Parse cl args
while ((c = getopt (argc, argv, "q:o:f:F:")) != -1){
switch (c)
{
case 'q':
q = atoi(optarg);
break;
case 'o':
op = optarg;
break;
case 'f':
fafile = atoi(optarg);
break;
case 'F':
F = atoi(optarg);
break;
case '?':
if (optopt == 'm'){
fprintf (stderr, "MAPQ must be provided when -%c is specified.\n", optopt);
usage();
}
if (optopt == 'v'){
fprintf (stderr, "VAF must be provided when -%c is specified.\n", optopt);
usage();
}
if (optopt == 'F'){
fprintf (stderr, "FLAG must be provided when -%c is specified.\n", optopt);
usage();
}
if (optopt == 'f'){
usage();
fprintf (stderr, "Input fasta file must provided when -%c is specified.\n", optopt);
} else if (isprint (optopt)){
fprintf (stderr, "Unknown option `-%c'.\n", optopt);
usage();
}else{
fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt);
usage();
}
return 1;
default:
abort ();
}
}
bedfile = argv[optind];
if(bedfile == NULL){
usage();
fprintf(stderr, "Missing loci file!\n");
return 0;
}
bam = argv[optind+1];
if(bam == NULL){
usage();
fprintf(stderr, "Missing BAM file!\n");
return 0;
}
is_bam(bam); //Check the file extension to to figure out BAM file or not
char tsv_file[1000];
char *bn = basename(bam); //get basename of the file
char *bnn = removeExt(bn); //remove extension from the basename
if(op == NULL){
strcpy(tsv_file, bnn);
strcat(tsv_file, ".tsv");
}else{
strcpy(tsv_file, op);
strcat(tsv_file, ".tsv");
}
//Open bed file
FILE *bed_fp;
nloci = countlines(bedfile);
bed_fp = fopen(bedfile, "r");
char buff[1000];
//Open TSV report file and print HTML header
FILE *tsv_fp;
tsv_fp = fopen(tsv_file, "w" );
fprintf(tsv_fp, "loci\tfa_ref\tA\tT\tG\tC\tIns\tDel\n");
//fasta file
char *seq;
faidx_t *fa = fai_load(fafile);
//BAM file
samFile *fp_in = hts_open(bam,"r"); //open bam file
hts_idx_t *fp_idx = sam_index_load(fp_in, bam);
bam_hdr_t *bamHdr = sam_hdr_read(fp_in); //read header
bam1_t *aln = bam_init1(); //initialize an alignment
fprintf(stderr, "Input BAM file : %s\n", bam);
fprintf(stderr, "Loci file : %s\n", bedfile);
fprintf(stderr, "No. of loci : %d\n", nloci);
fprintf(stderr, "MAPQ filter : %d\n", q);
fprintf(stderr, "FLAG filter : %d\n", F);
fprintf(stderr, "HTSlib version : %s\n\n", hts_version());
//For every loci in the BED file
while(fgets(buff,1000,bed_fp) != NULL){
//Remove trailing new line chars
int len = strlen(buff);
if(buff[len-1] == '\n' ){
buff[len-1] = 0;
}
char *chrom = strtok(buff,"\t");
char *start = strtok(NULL,"\t");
char loci[250] = "";
strcat(loci, chrom); strcat(loci, ":"); strcat(loci, start); strcat(loci, "-"); strcat(loci, start);
//Fetch base at target loci from fasta file
if(fa != NULL){
int templen = 100;
seq = fai_fetch(fa, loci, &templen);
fprintf(tsv_fp, "%s:%s\t%s", chrom, start, seq);
free(seq);
}else{
fprintf(tsv_fp, "%s:%s\tNA", chrom, start);
}
int32_t target_pos = atoi(start) -1; //input position are 1 based
//load reads in target loci
hts_itr_t *samitr = sam_itr_querys(fp_idx, bamHdr, loci);
//Keep track of total reads and nt counts per loci
int32_t tot_reads = 0;
float nt[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
//fprintf(stderr, "\rPercent completed: %d [%d of %d]", 100 * vars_gt/nloci, vars_gt, nloci);
//fflush(stdout);
printProgress(vars_gt/(double)nloci, vars_gt, nloci);
vars_gt = vars_gt + 1;
//For every read in the BAM file of target region
while(sam_itr_next(fp_in, samitr, aln) > 0){
int32_t pos = aln->core.pos ; //left most position of alignment in zero based coordianate (0-based)
uint32_t len = aln->core.l_qseq; //length of the read.
uint32_t* cig = bam_get_cigar(aln);
uint8_t *qs = bam_get_seq(aln); //quality string
//MAPQ and FLAG filter
if(aln->core.qual <= q){
continue;
}
if(aln->core.flag >= F){
continue;
}
tot_reads = tot_reads +1;
//char ins_seq[100]; char del_seq[100];
//get nucleotide id and converts them into IUPAC id.
char *qseq = (char *)malloc(len);
int i = 0;
for(i=0; i< len ; i++){
qseq[i] = seq_nt16_str[bam_seqi(qs,i)];
}
//target position on the read
int32_t pos_onread = 0;
//For every CIGAR string
int k = 0;
for(k=0;k< aln->core.n_cigar ;++k){
int cop =cig[k] & BAM_CIGAR_MASK; // CIGAR string
int cl = cig[k] >> BAM_CIGAR_SHIFT; // CIGAR length
if(BAM_CIGAR_STR[cop] == 'M'){
pos_onread = pos_onread + cl;
pos = pos + cl;
}else if(BAM_CIGAR_STR[cop] == 'S'){
pos_onread = pos_onread + cl;
}else if(BAM_CIGAR_STR[cop] == 'I'){
pos_onread = pos_onread + cl;
}else if(BAM_CIGAR_STR[cop] == 'D'){
pos = pos + cl;
}
if(pos > target_pos){
if(BAM_CIGAR_STR[cop] == 'M'){
pos_onread = pos_onread - (pos - target_pos);
if(qseq[pos_onread] == 'A'){
nt[0] = nt[0] + 1;
}else if(qseq[pos_onread] == 'T'){
nt[1] = nt[1] + 1;
}else if(qseq[pos_onread] == 'G'){
nt[2] = nt[2] + 1;
}else if(qseq[pos_onread] == 'C'){
nt[3] = nt[3] + 1;
}
break;
}
}else if(pos == target_pos){
if(BAM_CIGAR_STR[cop] == 'I'){
nt[4] = nt[4] + 1;
// insertion sequence
// for(int i = 0; i < cl; i++){
// //strcat(ins_seq, &qseq[pos_onread+i]);
// }
break;
}else if(BAM_CIGAR_STR[cop] == 'D'){
nt[5] = nt[5] + 1;
// deletion sequence
// for(int i = 0; i < cl; i++){
// strcat(del_seq, qseq[pos_onread+i]);
// }
break;
}
}
}
free(qseq);
}
hts_itr_destroy(samitr);
fprintf(tsv_fp, "\t%.f\t%.f\t%.f\t%.f\t%.f\t%.f\n", nt[0], nt[1], nt[2], nt[3], nt[4], nt[5]);
//write nt counts as table row
}
bam_destroy1(aln);
bam_hdr_destroy(bamHdr);
fai_destroy(fa);
sam_close(fp_in);
fclose(bed_fp);
fclose(tsv_fp);
free(bn);
free(bnn);
fprintf(stderr, "\n\nDone!\n");
return 0;
}
void is_bam(char *fname){
int l = strlen(fname);
if (l>=4 && strcasecmp(fname+l-4, ".bam") != 0){
usage();
fprintf(stderr, "%s is not a BAM file!\n", fname);
exit(0);
}else if(l<4){
fprintf(stderr, "%s is not a BAM file!\n", fname);
usage();
exit(0);
}
}
//from: https://stackoverflow.com/questions/3288006/are-there-any-c-apis-to-extract-the-base-file-name-from-its-full-path-in-linux
char *basename(char const *path){
char *s = strrchr(path, '/');
if (!s){
return strdup(path);
}else{
return strdup(s + 1);
}
}
//from: https://stackoverflow.com/questions/2736753/how-to-remove-extension-from-file-name
char *removeExt(char* myStr) {
char *retStr;
char *lastExt;
if (myStr == NULL) return NULL;
if ((retStr = malloc (strlen (myStr) + 1)) == NULL) return NULL;
strcpy (retStr, myStr);
lastExt = strrchr (retStr, '.');
if (lastExt != NULL)
*lastExt = '\0';
return retStr;
}
void usage(){
fprintf(stderr, "-------------------------------------------------------------------------\n");
fprintf(stderr, "ntcounts: A tool to extract nucleotide counts\n");
fprintf(stderr, " of genomic loci from the BAM file. Version: %s\n", VERSION);
fprintf(stderr, "USAGE:\n");
fprintf(stderr, " ntcounts [OPTIONS] <loci> <bam>\n");
fprintf(stderr, " e.g; ntcounts target_loci.tsv Tumor.bam\n");
fprintf(stderr, "OPTIONS:\n");
fprintf(stderr, " -f Indexed fasta file. If provided, extracts and adds reference base to the ouput tsv\n");
fprintf(stderr, " -q Min. mapping quality threshold. Default 10 [Read filter]\n");
fprintf(stderr, " -F Exclude reads with FLAGS >= INT. Default 3852 [Read filter]\n");
fprintf(stderr, " -o Output file basename. Default parses from basename of BAM file\n");
fprintf(stderr, "ARGS:\n");
fprintf(stderr, " <loci> variant file (chr\\tpos)\n");
fprintf(stderr, " <bam> Indexed BAM file\n");
fprintf(stderr, "OUPUT:\n");
fprintf(stderr, " <output>.tsv TSV file with nucletide counts for all variants\n");
fprintf(stderr, "-------------------------------------------------------------------------\n");
}
int countlines(char *filename){
// count the number of lines in the file called filename
FILE *fp = fopen(filename,"r");
int ch=0;
int lines=0;
if (fp == NULL){
return 0;
}
while(!feof(fp)){
ch = fgetc(fp);
if(ch == '\n'){
lines++;
}
}
fclose(fp);
return lines;
}
// Progress bar modified from: https://stackoverflow.com/a/36315819
void printProgress(double percentage, int done, int tot) {
int val = (int) (percentage * 100);
int lpad = (int) (percentage * PBWIDTH);
int rpad = PBWIDTH - lpad;
fprintf(stderr, "\r%3d%% [%.*s%*s] %d/%d", val, lpad, PBSTR, rpad, "", done, tot);
fflush(stdout);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment