Last active
December 16, 2015 10:48
-
-
Save brainstorm/5422455 to your computer and use it in GitHub Desktop.
Patch for SimNGS to support CASAVA 1.8 FASTQ format
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff -ru simNGS.orig/src/intensities.c simNGS/src/intensities.c | |
--- simNGS.orig/src/intensities.c 2011-11-14 13:04:17.000000000 +0200 | |
+++ simNGS/src/intensities.c 2013-04-24 15:29:40.100739000 +0300 | |
@@ -356,7 +356,7 @@ | |
ints->x[NBASE*i+seq.elt[i]] += lambda; | |
} else { | |
uint32_t j = i-seq.nelt; | |
- if(j>=0 && j<adapter.nelt && NUC_AMBIG != adapter.elt[j]){ | |
+ if(j<adapter.nelt && NUC_AMBIG != adapter.elt[j]){ | |
ints->x[NBASE*i+adapter.elt[j]] += lambda; | |
} | |
} | |
diff -ru simNGS.orig/src/Makefile.osx simNGS/src/Makefile.osx | |
--- simNGS.orig/src/Makefile.osx 2012-01-01 13:19:27.000000000 +0200 | |
+++ simNGS/src/Makefile.osx 2013-04-24 15:29:40.098734000 +0300 | |
@@ -1,4 +1,4 @@ | |
-CC = gcc | |
+CC = clang | |
CFLAGS = -Wall -O3 -funroll-loops -DNDEBUG -fomit-frame-pointer -std=gnu99 -msse2 -Wno-unused | |
#CFLAGS = -std=c99 -Wall -g -std=gnu99 -fstack-protector-all -D_FORTIFY_SOURCE=2 | |
#CFLAGS = -pg -O3 -std=gnu99 | |
diff -ru simNGS.orig/src/sequence.c simNGS/src/sequence.c | |
--- simNGS.orig/src/sequence.c 2011-11-11 17:17:14.000000000 +0200 | |
+++ simNGS/src/sequence.c 2013-04-24 17:18:02.840657000 +0300 | |
@@ -297,17 +297,23 @@ | |
} | |
-void show_SEQ( FILE * fp, const SEQ seq){ | |
+void show_SEQ( FILE * fp, const SEQ seq, CSTRING fmt){ | |
validate(NULL!=fp,); | |
validate(NULL!=seq,); | |
(!hasQual(seq)) ? fputc('>',fp) : fputc('@',fp); | |
if(NULL!=seq->name){ fputs(seq->name,fp); fputc(' ',fp);} | |
- show_CIGLIST(fp,seq->cigar); | |
+ | |
+ // only print CIGAR if we are writing on original SIMNGS read format | |
+ if(!strncasecmp(fmt,"orig",4)) | |
+ show_CIGLIST(fp,seq->cigar); | |
+ | |
fputc('\n',fp); | |
- for ( uint32_t i=0 ; i<seq->length ; i++){ | |
- show_NUC(fp,seq->seq.elt[i]); | |
- } | |
+// if(!strncasecmp(fmt,"orig",4)) { | |
+ for ( uint32_t i=0 ; i<seq->length ; i++){ | |
+ show_NUC(fp,seq->seq.elt[i]); | |
+ } | |
+// } | |
fputc('\n',fp); | |
if(hasQual(seq)){ | |
fputc('+',fp); | |
@@ -547,7 +553,7 @@ | |
} | |
// Cigar string | |
free_CIGLIST(subseq->cigar); | |
- subseq->cigar = sub_cigar(seq->cigar,len); | |
+ //subseq->cigar = sub_cigar(seq->cigar,len); | |
return subseq; | |
} | |
@@ -559,13 +565,16 @@ | |
SEQ seq=NULL; | |
while( NULL!=(seq=sequence_from_file(fp)) ){ | |
- show_SEQ(stdout,seq); | |
- SEQ mutseq = mutate_SEQ(seq,0.02,0.02,0.02); | |
- show_SEQ(stdout,mutseq); | |
+ show_SEQ(stdout,seq, NULL); | |
+ | |
+ SEQ mutseq = mutate_SEQ(seq,0.02,0.02,0.02); | |
+ show_SEQ(stdout,mutseq, NULL); | |
+ | |
SEQ rcseq = reverse_complement_SEQ(mutseq); | |
- show_SEQ(stdout,rcseq); | |
+ show_SEQ(stdout,rcseq, NULL); | |
+ | |
free_SEQ(rcseq); | |
- free_SEQ(mutseq); | |
+ free_SEQ(mutseq); | |
free_SEQ(seq); | |
} | |
return EXIT_SUCCESS; | |
diff -ru simNGS.orig/src/sequence.h simNGS/src/sequence.h | |
--- simNGS.orig/src/sequence.h 2011-11-11 17:35:48.000000000 +0200 | |
+++ simNGS/src/sequence.h 2013-04-24 15:29:40.104762000 +0300 | |
@@ -62,7 +62,7 @@ | |
SEQ new_SEQ(const uint32_t len, const bool has_qual); | |
SEQ sequence_from_str( const char * restrict name, const char * restrict seqstr, const char * qname, const char * restrict qualstr ); | |
SEQ copy_SEQ( const SEQ seq); | |
-void show_SEQ(FILE * fp, const SEQ seq); | |
+void show_SEQ(FILE * fp, const SEQ seq, CSTRING fmt); | |
// Creating | |
SEQ sequence_from_file ( FILE * fp); | |
diff -ru simNGS.orig/src/simlibrary.c simNGS/src/simlibrary.c | |
--- simNGS.orig/src/simlibrary.c 2012-01-01 13:49:48.000000000 +0200 | |
+++ simNGS/src/simlibrary.c 2013-04-24 17:25:11.070944000 +0300 | |
@@ -32,21 +32,34 @@ | |
#define Q_(A) #A | |
#define QUOTE(A) Q_(A) | |
#define DEFAULT_COV 0.055 | |
+#define DEFAULT_OUT "fasta" | |
#define DEFAULT_INSERT 400 | |
#define DEFAULT_NCYCLE 45 | |
#define DEFAULT_COVERAGE 2.0 | |
#define DEFAULT_BIAS 0.5 | |
#define PROGNAME "simLibrary" | |
-#define PROGVERSION "1.3" | |
+#define PROGVERSION "1.4" | |
uint32_t nfragment_from_coverage(const uint32_t genlen, const real_t coverage, const uint32_t readlen, const bool paired){ | |
const uint32_t bases_per_read = paired?(2*readlen):readlen; | |
return (uint32_t)(0.5+(genlen*coverage)/bases_per_read); | |
} | |
-CSTRING fragname(const CSTRING name, const unsigned int idx, const char strand, const uint32_t loc, const uint32_t fraglen){ | |
- char * str; | |
- asprintf(&str,"Frag_%u %s (Strand %c Offset %u--%u)",idx,name,strand,loc+1,loc+fraglen); | |
+CSTRING fragname(const CSTRING name, const unsigned int idx, const char strand, const uint32_t loc, const uint32_t fraglen, CSTRING fmt){ | |
+ char * str = NULL; | |
+ if(!strcmp(fmt, "casava")){ | |
+ // CASAVA 1.8 format: | |
+ // | |
+ // @EAS139:136:FC706VJ:2:5:1000:12850 1:Y:18:ATCACG | |
+ // | |
+ // Information like tiles coordinates is made up by loc & fraglen, | |
+ // so it does not make much sense semantically. The intention is that programs | |
+ // expecting a certain header structure do not bail out unnecesarily. | |
+ asprintf(&str,"SIMNGS:%u:fcsimNGS:1:1:1:1 1:N:2:GATTACA", idx); | |
+ } else if (!strcmp(fmt, "fasta")){ | |
+ // Old custom format by original SIMNGS's simlibrary | |
+ asprintf(&str,"Frag_%u %s (Strand %c Offset %u--%u)",idx,name,strand,loc+1,loc+fraglen); | |
+ } | |
return str; | |
} | |
@@ -60,7 +73,7 @@ | |
"Usage:\n" | |
"\t" PROGNAME " [-b bias] [-c cov] [-g lower:upper] [-i insertlen]\n" | |
"\t [-m multiplier_file] [-n nfragments] -p [-r readlen] [-s strand]\n" | |
-"\t [-v variance] [-x coverage] [--seed seed] seq1.fa ...\n" | |
+"\t [-v variance] [-x coverage] [-o output] [--seed seed] seq1.fa ...\n" | |
"\t" PROGNAME " --help\n" | |
"\t" PROGNAME " --licence\n" | |
"\t" PROGNAME " --version\n" | |
@@ -106,6 +119,11 @@ | |
"log-normal distribution by cov = exp(var)-1. If the variance option is set,\n" | |
"it takes presidence.\n" | |
"\n" | |
+"-o, --output format [default: " QUOTE(DEFAULT_OUT) "]\n" | |
+"Formats supported are \"fasta\" for original headers used by SIMNGS's simlibrary\n" | |
+"and \"casava\", for a more standard (CASAVA 1.8) format as shown in:\n" | |
+"http://en.wikipedia.org/wiki/FASTQ_format.\n" | |
+"\n" | |
"-g, --gel_cut lower:upper [default: no cut]\n" | |
"\tStrict lower and upper boundaries for fragment length, representing a\n" | |
"\"cut\" of a gel. The default is no boundaries.\n" | |
@@ -166,6 +184,7 @@ | |
static struct option longopts[] = { | |
{ "bias", required_argument, NULL, 'b'}, | |
{ "cov", required_argument, NULL, 'c'}, | |
+ { "output", required_argument, NULL, 'o'}, | |
{ "gel_cut", required_argument, NULL, 'g'}, | |
{ "insert", required_argument, NULL, 'i'}, | |
{ "mutate", optional_argument, NULL, 3}, | |
@@ -188,6 +207,7 @@ | |
bool paired; | |
uint32_t seed; | |
real_t variance,cov,strand_bias,coverage; | |
+ CSTRING output; | |
enum strand_opt strand; | |
FILE * multiplier_fp; | |
real_t cut_lower, cut_upper; | |
@@ -204,6 +224,7 @@ | |
opt->variance = 0; | |
opt->cov = DEFAULT_COV; | |
opt->coverage = DEFAULT_COVERAGE; | |
+ opt->output = DEFAULT_OUT; | |
opt->nfragment = 0; | |
opt->paired = true; | |
opt->strand_bias = DEFAULT_BIAS; | |
@@ -249,7 +270,7 @@ | |
OPT opt = new_OPT(); | |
validate(NULL!=opt,NULL); | |
- while ((ch = getopt_long(argc, argv, "b:c:g:i:m:n:pr:s:v:x:h", longopts, NULL)) != -1){ | |
+ while ((ch = getopt_long(argc, argv, "b:c:o:g:i:m:n:pr:s:v:x:h", longopts, NULL)) != -1){ | |
switch(ch){ | |
case 'b': | |
opt->strand_bias = parse_real(optarg); | |
@@ -259,6 +280,10 @@ | |
opt->cov = parse_real(optarg); | |
if(opt->cov<0.0){errx(EXIT_FAILURE,"Coefficient Of Variance of for insert size should be non-zero");} | |
break; | |
+ case 'o': | |
+ if(!strncasecmp(optarg,"fasta",5)){ opt->output = "fasta"; break;} | |
+ if(!strncasecmp(optarg,"casava",6)){ opt->output = "casava"; break;} | |
+ errx(EXIT_FAILURE,"Unrecognised choice fastq format\"%s\"",optarg); break; | |
case 'g': | |
ret = sscanf(optarg, real_format_str ":" real_format_str ,&opt->cut_lower,&opt->cut_upper); | |
if(ret<1){ | |
@@ -426,12 +451,12 @@ | |
free_SEQ(fragseq); | |
SEQ sampseq = (strand=='+')? copy_SEQ(mutseq) : reverse_complement_SEQ(mutseq,false); | |
free_SEQ(mutseq); | |
- | |
- CSTRING sampname = fragname(seq->name,i+1,strand,loc,fraglen); | |
+ | |
+ CSTRING sampname = fragname(seq->name,i+1,strand,loc,fraglen, opt->output); | |
free_CSTRING(sampseq->name); | |
sampseq->name = sampname; | |
- show_SEQ(stdout,sampseq); | |
+ show_SEQ(stdout,sampseq, opt->output); | |
free_SEQ(sampseq); | |
if( (tot_fragments%100000)==99999 ){ fprintf(stderr,"\rDone: %8u",tot_fragments+1); } | |
} | |
diff -ru simNGS.orig/src/simNGS.c simNGS/src/simNGS.c | |
--- simNGS.orig/src/simNGS.c 2012-01-01 13:49:39.000000000 +0200 | |
+++ simNGS/src/simNGS.c 2013-04-24 17:42:48.322593000 +0300 | |
@@ -43,12 +43,12 @@ | |
#define STRINGIFY(A) #A | |
#define ILLUMINA_ADAPTER "AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGAT" | |
#define PROGNAME "simNGS" | |
-#define PROGVERSION "1.6" | |
+#define PROGVERSION "1.7" | |
enum paired_type { PAIRED_TYPE_SINGLE=0, PAIRED_TYPE_CYCLE, PAIRED_TYPE_PAIRED }; | |
char * paired_type_str[] = {"single","cycle","paired"}; | |
-enum outformat { OUTPUT_LIKE=0, OUTPUT_FASTA, OUTPUT_FASTQ }; | |
-const char * output_format_str[] = { "likelihood", "fasta", "fastq" }; | |
+enum outformat { OUTPUT_LIKE=0, OUTPUT_FASTA, OUTPUT_FASTQ, OUTPUT_CASAVA }; | |
+const char * output_format_str[] = { "likelihood", "fasta", "fastq", "casava" }; | |
ARRAY(NUC) ambigseq = {NULL,0}; | |
@@ -234,7 +234,7 @@ | |
"\n" | |
"-o, --output format [default: fastq]\n" | |
"\tFormat in which to output results. Either \"likelihood\", \"fasta\",\n" | |
-"or \"fastq\".\n" | |
+",\"fastq\" or \"casava\".\n" | |
"\n" | |
"-O, --outfile prefix [default: stdout]\n" | |
"\tPrefix of filenames to output results. If simulating paired-end\n" | |
@@ -564,6 +564,9 @@ | |
else if ( strcasecmp(optarg,output_format_str[OUTPUT_FASTQ])==0 ){ | |
simopt->format = OUTPUT_FASTQ; | |
if(simopt->mu==0){ simopt->mu = 1e-5;} | |
+ } else if ( strcasecmp(optarg,output_format_str[OUTPUT_CASAVA])==0 ){ | |
+ simopt->format = OUTPUT_CASAVA; | |
+ if(simopt->mu==0){ simopt->mu = 1e-5;} | |
} else { | |
errx(EXIT_FAILURE,"Unrecognised output option %s.",optarg); | |
} | |
@@ -730,8 +733,14 @@ | |
output_fastq_sub(simopt->outfp[0],simopt,seqname,cigar1,"",called1,called2); | |
break; | |
case PAIRED_TYPE_PAIRED: | |
- output_fastq_sub(simopt->outfp[0],simopt,seqname,cigar1,use_suff?"/1":"",called1,NULL); | |
- output_fastq_sub(simopt->outfp[1],simopt,seqname,cigar2,use_suff?"/2":"",called2,NULL); | |
+ if (simopt->format == OUTPUT_CASAVA) { | |
+ output_fastq_sub(simopt->outfp[0],simopt,seqname,null_CIGLIST,"",called1,NULL); | |
+ output_fastq_sub(simopt->outfp[1],simopt,seqname,null_CIGLIST,"",called2,NULL); | |
+ } | |
+ else { | |
+ output_fastq_sub(simopt->outfp[0],simopt,seqname,cigar1,use_suff?"/1":"",called1,NULL); | |
+ output_fastq_sub(simopt->outfp[1],simopt,seqname,cigar2,use_suff?"/2":"",called2,NULL); | |
+ } | |
break; | |
default: | |
errx(EXIT_FAILURE,"Unrecognised case %s (%s:%d)",__func__,__FILE__,__LINE__); | |
@@ -758,6 +767,9 @@ | |
case OUTPUT_FASTQ: | |
output_fastq(simopt,seqname,cigar1,cigar2,called1,called2); | |
break; | |
+ case OUTPUT_CASAVA: | |
+ output_fastq(simopt,seqname,cigar1,cigar2,called1,called2); | |
+ break; | |
default: | |
errx(EXIT_FAILURE,"Unrecognised format in %s (%s:%d)",__func__,__FILE__,__LINE__); | |
} | |
@@ -1029,6 +1041,7 @@ | |
switch(simopt->format){ | |
case OUTPUT_LIKE: strcpy(fn+offset,".like"); offset+=5; break; | |
case OUTPUT_FASTQ: strcpy(fn+offset,".fq"); offset+=3; break; | |
+ case OUTPUT_CASAVA: strcpy(fn+offset,".fq"); offset+=3; break; | |
case OUTPUT_FASTA: strcpy(fn+offset,".fa"); offset+=3; break; | |
} | |
*(fn+offset) = '\0'; | |
@@ -1082,8 +1095,15 @@ | |
seqstr->name = copy_CSTRING(seq->name); | |
seqstr->seq = copy_ARRAY(NUC)(seq->seq); | |
seqstr->paired = model->paired; | |
- seqstr->cigar1 = sub_cigar(seq->cigar,model->ncycle); | |
- seqstr->cigar2 = null_CIGLIST; | |
+ | |
+ // No cigar strings for standard CASAVA 1.8 FASTQ format | |
+ if(simopt->format == OUTPUT_CASAVA) { | |
+ seqstr->cigar2 = null_CIGLIST; | |
+ seqstr->cigar1 = null_CIGLIST; | |
+ } else { | |
+ seqstr->cigar1 = sub_cigar(seq->cigar,model->ncycle); | |
+ seqstr->cigar2 = null_CIGLIST; | |
+ } | |
// Pick copula | |
struct pair_double lambda = correlated_distribution(simopt->threshold,simopt->corr,model->dist1,model->dist2); | |
seqstr->lambda1 = lambda.x1; | |
@@ -1091,7 +1111,7 @@ | |
// Generate intensities | |
seqstr->int1 = generate_pure_intensities(simopt->sdfact,lambda.x1,seqstr->seq,simopt->adapter1,model->ncycle,model->chol1_cycle,simopt->dustProb,simopt->invA,simopt->N,NULL); | |
- if ( model->paired ){ | |
+ if ( model->paired){ | |
seqstr->rcseq = reverse_complement(seqstr->seq); | |
CIGLIST revcig = reverse_cigar(seq->cigar); | |
seqstr->cigar2 = sub_cigar(revcig,model->ncycle); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment