|
/** |
|
|
|
Author: Pierre Lindenbaum |
|
|
|
|
|
https://www.biostars.org/p/354468/ |
|
|
|
# compilation : |
|
|
|
``` |
|
gcc -o biostars354468 -Wall -O3 biostars354468.c |
|
``` |
|
|
|
# execute: |
|
|
|
``` |
|
./biostars354468 input.fa > kmers.txt |
|
``` |
|
|
|
|
|
*/ |
|
#include <stdlib.h> |
|
#include <stdio.h> |
|
#include <string.h> |
|
#include <errno.h> |
|
#include <ctype.h> |
|
|
|
#define LOGBASES if(nread++%100000==0) fprintf(stderr,"[LOG] n. bases =%ld\n",nread) |
|
#define INDEXFILE "biostar354468.db" |
|
|
|
//const int K=16; |
|
//const long NUM = 4294967296L; |
|
|
|
//const int K=4; |
|
//const long NUM=256; |
|
|
|
const int K=8; |
|
const long NUM=65536; |
|
|
|
const char NOT_FOUND = '0'; |
|
const char FOUND = '1'; |
|
|
|
static char compl(char c) |
|
{ |
|
switch(c) |
|
{ |
|
case 'A': return 'T'; |
|
case 'C': return 'G'; |
|
case 'G': return 'C'; |
|
case 'T': return 'A'; |
|
default: { |
|
fprintf(stderr,"compl bad base \"%c\"\n",c); |
|
exit(EXIT_FAILURE); |
|
} |
|
} |
|
} |
|
|
|
static int opcode(char c) |
|
{ |
|
switch(c) |
|
{ |
|
case 'A': return 0; |
|
case 'C': return 1; |
|
case 'G': return 2; |
|
case 'T': return 3; |
|
default: { |
|
fprintf(stderr,"opcode bad base \"%c\"\n",c); |
|
exit(EXIT_FAILURE); |
|
} |
|
} |
|
} |
|
|
|
int main(int argc,char** argv) |
|
{ |
|
long nread=0L; |
|
char buffer[K]; |
|
int buffer_size=0; |
|
|
|
long i; |
|
FILE* f1; |
|
FILE* f2; |
|
|
|
|
|
|
|
f1 = fopen(INDEXFILE,"w+b"); |
|
if(f1==NULL) { |
|
fprintf(stderr,"[FATAL]Cannot open tmp file " INDEXFILE ".\n"); |
|
return -1; |
|
} |
|
fprintf(stderr,"[INFO]BEGIN Writing index file " INDEXFILE ". Please wait... N=%ld\n",NUM); |
|
for(i=0L;i< NUM;i++) fwrite((void*)&NOT_FOUND,1,1,f1); |
|
if(ferror(f1)) { |
|
fprintf(stderr,"[FATAL]I/O error for tmp file " INDEXFILE ". %s\n",strerror(errno)); |
|
return -1; |
|
} |
|
fprintf(stderr,"[INFO]END Writing index file " INDEXFILE " N=%ld\n",NUM); |
|
memset((void*)&buffer,0,K); |
|
|
|
fprintf(stderr,"[INFO]BEGIN Reading fasta\n"); |
|
f2 = argc>1 ? fopen(argv[1],"r") : stdin; |
|
if(f2==NULL) { |
|
fprintf(stderr,"[FATAL]Cannot open fasta file.\n"); |
|
return -1; |
|
} |
|
for(;;) |
|
{ |
|
int c=fgetc(f2); |
|
if(c==EOF || c=='>') |
|
{ |
|
buffer_size=0; |
|
if(c==EOF) break; |
|
fputs("[LOG] found sequence :",stderr); |
|
while((c=fgetc(f2))!=EOF && c!='\n') { |
|
fprintf(stderr,"%c",c); |
|
} |
|
fputc('\n',stderr); |
|
continue; |
|
} |
|
|
|
switch(c) |
|
{ |
|
case 'A': case 'a': |
|
case 'T': case 't': |
|
case 'G': case 'g': |
|
case 'C': case 'c': |
|
{ |
|
LOGBASES; |
|
buffer[buffer_size]=toupper(c); |
|
buffer_size++; |
|
if(buffer_size==K) |
|
{ |
|
int x; |
|
long offset = 0L; |
|
|
|
// forward |
|
for(x=0;x< K;++x) |
|
{ |
|
offset = offset *4L + opcode(buffer[x]); |
|
} |
|
fseek(f1,offset,SEEK_SET); |
|
fwrite((void*)&FOUND,1,1,f1); |
|
|
|
//reverse |
|
offset=0; |
|
for(x=K-1;x>=0;--x) |
|
{ |
|
offset = offset *4L + opcode(compl(buffer[x])); |
|
} |
|
fseek(f1,offset,SEEK_SET); |
|
fwrite((void*)&FOUND,1,1,f1); |
|
|
|
memmove((void*)&buffer[0], (void*)&buffer[1],buffer_size-1); |
|
buffer_size--; |
|
} |
|
|
|
break; |
|
} |
|
case '\n': case ' ' : case '\r': |
|
{ |
|
break; |
|
} |
|
default: |
|
{ |
|
LOGBASES; |
|
buffer_size=0; |
|
break; |
|
} |
|
} |
|
} |
|
|
|
|
|
fprintf(stderr,"[INFO]END Reading fasta\n"); |
|
fclose(f2); |
|
|
|
fseek(f1,0L,SEEK_SET); |
|
fprintf(stderr,"[INFO]BEGIN DUMPING DATA\n"); |
|
i=0L; |
|
while(i< NUM) |
|
{ |
|
char c; |
|
fread((void*)&c,1,1,f1); |
|
if(c==NOT_FOUND) |
|
{ |
|
int x; |
|
long n = i; |
|
for(x=K-1;x>=0;--x) |
|
{ |
|
switch(n%4) { |
|
case 0: buffer[x]='A'; break; |
|
case 1: buffer[x]='C'; break; |
|
case 2: buffer[x]='G'; break; |
|
case 3: buffer[x]='T'; break; |
|
} |
|
n=n/4; |
|
} |
|
fwrite((void*)buffer,1,K,stdout); |
|
fputc('\n',stdout); |
|
} |
|
i++; |
|
} |
|
|
|
|
|
|
|
fclose(f1); |
|
fprintf(stderr,"[INFO]END DUMPING DATA\n"); |
|
remove(INDEXFILE); |
|
|
|
return 0; |
|
} |