Skip to content

Instantly share code, notes, and snippets.

@lindenb
Last active December 17, 2018 13:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lindenb/866410c3fa9babe7e3f3ca5fcf53d6b5 to your computer and use it in GitHub Desktop.
Save lindenb/866410c3fa9babe7e3f3ca5fcf53d6b5 to your computer and use it in GitHub Desktop.
Finding 16 mer not present in GRCh38

change K and NUM according to your needs.

const int K=16;
const long NUM = 4294967296L;

example

$ gcc -o biostars354468 -Wall -O3 biostars354468.c 
$ wget -O NC_000913.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=NC_000913.3&rettype=fasta"
$ ./biostars354468 NC_000913.fa | gzip --best > kmers.gz
(...)
$ gunzip -c kmers.gz

ACCTAGGT
AGCCTAGG
AGGTCTAG
AGTCTAGG
CACCTAGA
CCCCCTAG
CCCTAGAC
CCTAGACA
CCTAGACT
CCTAGGAA
CCTAGGAC
CCTAGGAG
CCTAGGCA
CCTAGGCC
CCTAGGCT
CCTAGGGC
CCTAGGTC
CCTAGTAG
CCTCCTAG
CTACTAGG
CTAGACCT
CTAGGAAG
CTAGGACA
CTAGGAGG
CTAGGCAC
CTAGGGGG
CTAGGGTA
CTCCTAGA
CTCCTAGG
CTCTAGTC
CTTCCTAG
GACCTAGG
GACTAGAG
GCCCTAGG
GCCTAGGA
GCCTAGGC
GGCCTAGG
GGGGCCCC
GTCCTAGG
GTCTAGGG
GTGCCTAG
TACCCTAG
TCCTAGCA
TCCTAGGA
TCCTAGGC
TCTAGGAG
TCTAGGTG
TGCCTAGG
TGCTAGGA
TGTCCTAG
TGTCTAGG
TTCCTAGG
/**
Author: Pierre Lindenbaum
https://www.biostars.org/p/354468/
# compilation :
```
gcc -o biostars354468 -Wall -O3 biostars354468.c
```
# execute:
```
./biostars354468 input.fa > kmers.txt
```
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <ctype.h>
#define LOGBASES if(nread++%100000==0) fprintf(stderr,"[LOG] n. bases =%ld\n",nread)
#define INDEXFILE "biostar354468.db"
//const int K=16;
//const long NUM = 4294967296L;
//const int K=4;
//const long NUM=256;
const int K=8;
const long NUM=65536;
const char NOT_FOUND = '0';
const char FOUND = '1';
static char compl(char c)
{
switch(c)
{
case 'A': return 'T';
case 'C': return 'G';
case 'G': return 'C';
case 'T': return 'A';
default: {
fprintf(stderr,"compl bad base \"%c\"\n",c);
exit(EXIT_FAILURE);
}
}
}
static int opcode(char c)
{
switch(c)
{
case 'A': return 0;
case 'C': return 1;
case 'G': return 2;
case 'T': return 3;
default: {
fprintf(stderr,"opcode bad base \"%c\"\n",c);
exit(EXIT_FAILURE);
}
}
}
int main(int argc,char** argv)
{
long nread=0L;
char buffer[K];
int buffer_size=0;
long i;
FILE* f1;
FILE* f2;
f1 = fopen(INDEXFILE,"w+b");
if(f1==NULL) {
fprintf(stderr,"[FATAL]Cannot open tmp file " INDEXFILE ".\n");
return -1;
}
fprintf(stderr,"[INFO]BEGIN Writing index file " INDEXFILE ". Please wait... N=%ld\n",NUM);
for(i=0L;i< NUM;i++) fwrite((void*)&NOT_FOUND,1,1,f1);
if(ferror(f1)) {
fprintf(stderr,"[FATAL]I/O error for tmp file " INDEXFILE ". %s\n",strerror(errno));
return -1;
}
fprintf(stderr,"[INFO]END Writing index file " INDEXFILE " N=%ld\n",NUM);
memset((void*)&buffer,0,K);
fprintf(stderr,"[INFO]BEGIN Reading fasta\n");
f2 = argc>1 ? fopen(argv[1],"r") : stdin;
if(f2==NULL) {
fprintf(stderr,"[FATAL]Cannot open fasta file.\n");
return -1;
}
for(;;)
{
int c=fgetc(f2);
if(c==EOF || c=='>')
{
buffer_size=0;
if(c==EOF) break;
fputs("[LOG] found sequence :",stderr);
while((c=fgetc(f2))!=EOF && c!='\n') {
fprintf(stderr,"%c",c);
}
fputc('\n',stderr);
continue;
}
switch(c)
{
case 'A': case 'a':
case 'T': case 't':
case 'G': case 'g':
case 'C': case 'c':
{
LOGBASES;
buffer[buffer_size]=toupper(c);
buffer_size++;
if(buffer_size==K)
{
int x;
long offset = 0L;
// forward
for(x=0;x< K;++x)
{
offset = offset *4L + opcode(buffer[x]);
}
fseek(f1,offset,SEEK_SET);
fwrite((void*)&FOUND,1,1,f1);
//reverse
offset=0;
for(x=K-1;x>=0;--x)
{
offset = offset *4L + opcode(compl(buffer[x]));
}
fseek(f1,offset,SEEK_SET);
fwrite((void*)&FOUND,1,1,f1);
memmove((void*)&buffer[0], (void*)&buffer[1],buffer_size-1);
buffer_size--;
}
break;
}
case '\n': case ' ' : case '\r':
{
break;
}
default:
{
LOGBASES;
buffer_size=0;
break;
}
}
}
fprintf(stderr,"[INFO]END Reading fasta\n");
fclose(f2);
fseek(f1,0L,SEEK_SET);
fprintf(stderr,"[INFO]BEGIN DUMPING DATA\n");
i=0L;
while(i< NUM)
{
char c;
fread((void*)&c,1,1,f1);
if(c==NOT_FOUND)
{
int x;
long n = i;
for(x=K-1;x>=0;--x)
{
switch(n%4) {
case 0: buffer[x]='A'; break;
case 1: buffer[x]='C'; break;
case 2: buffer[x]='G'; break;
case 3: buffer[x]='T'; break;
}
n=n/4;
}
fwrite((void*)buffer,1,K,stdout);
fputc('\n',stdout);
}
i++;
}
fclose(f1);
fprintf(stderr,"[INFO]END DUMPING DATA\n");
remove(INDEXFILE);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment