Skip to content

Instantly share code, notes, and snippets.

@drio
Created August 24, 2011 15:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save drio/1168330 to your computer and use it in GitHub Desktop.
Save drio/1168330 to your computer and use it in GitHub Desktop.
finding the fastest way to extract from a fastq
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "khash.h" // khash.h from samtools/bwa/seqtk/klib
KHASH_SET_INIT_STR(s)
#define BUF_SIZE 4096
int main(int argc, char *argv[])
{
char buf[BUF_SIZE];
FILE *fp;
int lineno = 0, flag = 0, ret;
khash_t(s) *h;
if (argc == 1) {
fprintf(stderr, "Usage: cat in.fq | fqextract <name.lst>\n");
return 1;
}
h = kh_init(s);
fp = fopen(argv[1], "rb"); // FIXME: check fp
while (fgets(buf, BUF_SIZE, fp))
kh_put(s, h, strdup(buf), &ret); // FIXME: check ret
fclose(fp);
while (fgets(buf, BUF_SIZE, stdin)) {
if (++lineno%4 == 1)
flag = (kh_get(s, h, buf + 1) != kh_end(h));
if (flag) fputs(buf, stdout);
}
kh_destroy(s, h); // FIXME: free keys before destroy
return 0;
}
CC=gcc
FLAGS=-O3 -Wall
FQ=input.fastq
LIST=list.txt
TMP=/stornext/snfs0/next-gen/Illumina/Instruments/700733/110803_SN733_0151_BD07UMACXX/Results/Project_110803_SN733_0151_BD07UMACXX/Sample_D07UMACXX-8-ID12/D07UMACXX-8-ID12_1_sequence.txt
uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
ifeq ($(uname_S),Linux)
O_FQ=$(TMP)
SED=sed
else
O_FQ=/Users/drio/sshfs/ardmore$(TMP)
SED=gsed
endif
ifndef N_READS
N_READS=1000000
endif
BIN=fqextract
BIN_MM=mmap_fqextract
LINE=@printf '_________________________________________________ ';
all: clean run
run: $(FQ) $(LIST) $(BIN) $(BIN_MM) out.txt out_mmap.txt diff
diff: out.txt out_mmap.txt
$(LINE) echo "DIFF"
diff $+ | wc -l
out.txt: $(FQ) $(BIN) $(LIST)
$(LINE) echo "BM: $(BIN)"
time cat $(FQ) | ./$(BIN) $(LIST) > out.txt
out_mmap.txt: $(FQ) $(BIN_MM) $(LIST)
$(LINE) echo "BM: $(BIN_MM)"
time ./$(BIN_MM) $(LIST) $(FQ) > out_mmap.txt
khash.h:
curl https://raw.github.com/attractivechaos/klib/master/khash.h > $@
$(FQ): $(O_FQ)
time head -$(N_READS) $< | awk '{print $$1}' > $@
@echo "Number of reads: `cat $@ | egrep -e "^@HWI" | wc -l`"
$(LIST): $(FQ)
egrep -e "^@HWI" $< | awk '{print $$1}' | $(SED) -s 's/@//' |\
ruby -ne 'puts $$_ if rand(1000) % 5 == 0' > $@
wc -l $@
$(BIN): $(BIN).c khash.h
$(CC) $(FLAGS) $< -o $@
$(BIN_MM): $(BIN_MM).c khash.h
$(CC) $(FLAGS) $< -o $@
clean:
rm -f out* $(LIST) $(FQ) $(BIN) $(BIN_MM)
.PHONY: clean
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include "khash.h" // khash.h from samtools/bwa/seqtk/klib
KHASH_SET_INIT_STR(s)
#define BUF_SIZE 4096
int main(int argc, char *argv[])
{
char buf[BUF_SIZE];
int fd;
int lineno = 0, flag = 0, ret, bp = 0;
struct stat sb;
char *p, *mp; // start of mmap data, pointer within the mmap data
off_t len;
khash_t(s) *h;
if (argc == 2) {
fprintf(stderr, "Usage: fqextract <name.lst> <in.fq>\n");
return 1;
}
h = kh_init(s);
fd = open(argv[1], O_RDONLY); // FIXME: check fp
fstat(fd, &sb);
p = mmap(0, sb.st_size, PROT_READ, MAP_SHARED, fd, 0);
close(fd);
for (len=0; len < sb.st_size; len++) {
if (p[len] == '\n') {
buf[bp] = '\0';
bp = 0;
kh_put(s, h, strdup(buf), &ret); // FIXME: check ret
}
else
buf[bp++] = p[len];
}
munmap(p, sb.st_size);
fd = open(argv[2], O_RDWR); // FIXME: check fp
fstat(fd, &sb);
p = mmap(0, sb.st_size, PROT_WRITE, MAP_SHARED, fd, 0);
close(fd);
for (len=0, mp=p, lineno=0; len < sb.st_size; len++) {
if (p[len] == '\n') {
if (lineno == 0) {
p[len] = '\0';
flag = (kh_get(s, h, mp+1) != kh_end(h));
p[len] = '\n';
lineno = 1;
}
else if (lineno == 3) {
p[len+1] = '\0';
if (flag) fputs(mp, stdout);
p[len+1] = '@';
mp = p + len + 1;
lineno = 0;
}
else {
lineno++;
}
}
}
munmap(p, sb.st_size);
kh_destroy(s, h); // FIXME: free keys before destroy
return 0;
}
@ehenrion
Copy link

ehenrion commented Apr 4, 2017

I do not understand how to use this program...

I downloaded it and tried 'make all' but it failed with this error :
make: *** No rule to make target /stornext/snfs0/next-gen/Illumina/Instruments/700733/110803_SN733_0151_BD07UMACXX/Results/Project_110803_SN733_0151_BD07UMACXX/Sample_D07UMACXX-8-ID12/D07UMACXX-8-ID12_1_sequence.txt', needed by input.fastq'. Stop.

Am I not supposed to build it ?

Any help would be very much appreciated.

Thanks !

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment