Skip to content

Instantly share code, notes, and snippets.

@brainstorm
Last active December 16, 2015 10:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save brainstorm/5422455 to your computer and use it in GitHub Desktop.
Save brainstorm/5422455 to your computer and use it in GitHub Desktop.
Patch for SimNGS to support CASAVA 1.8 FASTQ format
diff -ru simNGS.orig/src/intensities.c simNGS/src/intensities.c
--- simNGS.orig/src/intensities.c 2011-11-14 13:04:17.000000000 +0200
+++ simNGS/src/intensities.c 2013-04-24 15:29:40.100739000 +0300
@@ -356,7 +356,7 @@
ints->x[NBASE*i+seq.elt[i]] += lambda;
} else {
uint32_t j = i-seq.nelt;
- if(j>=0 && j<adapter.nelt && NUC_AMBIG != adapter.elt[j]){
+ if(j<adapter.nelt && NUC_AMBIG != adapter.elt[j]){
ints->x[NBASE*i+adapter.elt[j]] += lambda;
}
}
diff -ru simNGS.orig/src/Makefile.osx simNGS/src/Makefile.osx
--- simNGS.orig/src/Makefile.osx 2012-01-01 13:19:27.000000000 +0200
+++ simNGS/src/Makefile.osx 2013-04-24 15:29:40.098734000 +0300
@@ -1,4 +1,4 @@
-CC = gcc
+CC = clang
CFLAGS = -Wall -O3 -funroll-loops -DNDEBUG -fomit-frame-pointer -std=gnu99 -msse2 -Wno-unused
#CFLAGS = -std=c99 -Wall -g -std=gnu99 -fstack-protector-all -D_FORTIFY_SOURCE=2
#CFLAGS = -pg -O3 -std=gnu99
diff -ru simNGS.orig/src/sequence.c simNGS/src/sequence.c
--- simNGS.orig/src/sequence.c 2011-11-11 17:17:14.000000000 +0200
+++ simNGS/src/sequence.c 2013-04-24 17:18:02.840657000 +0300
@@ -297,17 +297,23 @@
}
-void show_SEQ( FILE * fp, const SEQ seq){
+void show_SEQ( FILE * fp, const SEQ seq, CSTRING fmt){
validate(NULL!=fp,);
validate(NULL!=seq,);
(!hasQual(seq)) ? fputc('>',fp) : fputc('@',fp);
if(NULL!=seq->name){ fputs(seq->name,fp); fputc(' ',fp);}
- show_CIGLIST(fp,seq->cigar);
+
+ // only print CIGAR if we are writing on original SIMNGS read format
+ if(!strncasecmp(fmt,"orig",4))
+ show_CIGLIST(fp,seq->cigar);
+
fputc('\n',fp);
- for ( uint32_t i=0 ; i<seq->length ; i++){
- show_NUC(fp,seq->seq.elt[i]);
- }
+// if(!strncasecmp(fmt,"orig",4)) {
+ for ( uint32_t i=0 ; i<seq->length ; i++){
+ show_NUC(fp,seq->seq.elt[i]);
+ }
+// }
fputc('\n',fp);
if(hasQual(seq)){
fputc('+',fp);
@@ -547,7 +553,7 @@
}
// Cigar string
free_CIGLIST(subseq->cigar);
- subseq->cigar = sub_cigar(seq->cigar,len);
+ //subseq->cigar = sub_cigar(seq->cigar,len);
return subseq;
}
@@ -559,13 +565,16 @@
SEQ seq=NULL;
while( NULL!=(seq=sequence_from_file(fp)) ){
- show_SEQ(stdout,seq);
- SEQ mutseq = mutate_SEQ(seq,0.02,0.02,0.02);
- show_SEQ(stdout,mutseq);
+ show_SEQ(stdout,seq, NULL);
+
+ SEQ mutseq = mutate_SEQ(seq,0.02,0.02,0.02);
+ show_SEQ(stdout,mutseq, NULL);
+
SEQ rcseq = reverse_complement_SEQ(mutseq);
- show_SEQ(stdout,rcseq);
+ show_SEQ(stdout,rcseq, NULL);
+
free_SEQ(rcseq);
- free_SEQ(mutseq);
+ free_SEQ(mutseq);
free_SEQ(seq);
}
return EXIT_SUCCESS;
diff -ru simNGS.orig/src/sequence.h simNGS/src/sequence.h
--- simNGS.orig/src/sequence.h 2011-11-11 17:35:48.000000000 +0200
+++ simNGS/src/sequence.h 2013-04-24 15:29:40.104762000 +0300
@@ -62,7 +62,7 @@
SEQ new_SEQ(const uint32_t len, const bool has_qual);
SEQ sequence_from_str( const char * restrict name, const char * restrict seqstr, const char * qname, const char * restrict qualstr );
SEQ copy_SEQ( const SEQ seq);
-void show_SEQ(FILE * fp, const SEQ seq);
+void show_SEQ(FILE * fp, const SEQ seq, CSTRING fmt);
// Creating
SEQ sequence_from_file ( FILE * fp);
diff -ru simNGS.orig/src/simlibrary.c simNGS/src/simlibrary.c
--- simNGS.orig/src/simlibrary.c 2012-01-01 13:49:48.000000000 +0200
+++ simNGS/src/simlibrary.c 2013-04-24 17:25:11.070944000 +0300
@@ -32,21 +32,34 @@
#define Q_(A) #A
#define QUOTE(A) Q_(A)
#define DEFAULT_COV 0.055
+#define DEFAULT_OUT "fasta"
#define DEFAULT_INSERT 400
#define DEFAULT_NCYCLE 45
#define DEFAULT_COVERAGE 2.0
#define DEFAULT_BIAS 0.5
#define PROGNAME "simLibrary"
-#define PROGVERSION "1.3"
+#define PROGVERSION "1.4"
uint32_t nfragment_from_coverage(const uint32_t genlen, const real_t coverage, const uint32_t readlen, const bool paired){
const uint32_t bases_per_read = paired?(2*readlen):readlen;
return (uint32_t)(0.5+(genlen*coverage)/bases_per_read);
}
-CSTRING fragname(const CSTRING name, const unsigned int idx, const char strand, const uint32_t loc, const uint32_t fraglen){
- char * str;
- asprintf(&str,"Frag_%u %s (Strand %c Offset %u--%u)",idx,name,strand,loc+1,loc+fraglen);
+CSTRING fragname(const CSTRING name, const unsigned int idx, const char strand, const uint32_t loc, const uint32_t fraglen, CSTRING fmt){
+ char * str = NULL;
+ if(!strcmp(fmt, "casava")){
+ // CASAVA 1.8 format:
+ //
+ // @EAS139:136:FC706VJ:2:5:1000:12850 1:Y:18:ATCACG
+ //
+ // Information like tiles coordinates is made up by loc & fraglen,
+ // so it does not make much sense semantically. The intention is that programs
+ // expecting a certain header structure do not bail out unnecesarily.
+ asprintf(&str,"SIMNGS:%u:fcsimNGS:1:1:1:1 1:N:2:GATTACA", idx);
+ } else if (!strcmp(fmt, "fasta")){
+ // Old custom format by original SIMNGS's simlibrary
+ asprintf(&str,"Frag_%u %s (Strand %c Offset %u--%u)",idx,name,strand,loc+1,loc+fraglen);
+ }
return str;
}
@@ -60,7 +73,7 @@
"Usage:\n"
"\t" PROGNAME " [-b bias] [-c cov] [-g lower:upper] [-i insertlen]\n"
"\t [-m multiplier_file] [-n nfragments] -p [-r readlen] [-s strand]\n"
-"\t [-v variance] [-x coverage] [--seed seed] seq1.fa ...\n"
+"\t [-v variance] [-x coverage] [-o output] [--seed seed] seq1.fa ...\n"
"\t" PROGNAME " --help\n"
"\t" PROGNAME " --licence\n"
"\t" PROGNAME " --version\n"
@@ -106,6 +119,11 @@
"log-normal distribution by cov = exp(var)-1. If the variance option is set,\n"
"it takes presidence.\n"
"\n"
+"-o, --output format [default: " QUOTE(DEFAULT_OUT) "]\n"
+"Formats supported are \"fasta\" for original headers used by SIMNGS's simlibrary\n"
+"and \"casava\", for a more standard (CASAVA 1.8) format as shown in:\n"
+"http://en.wikipedia.org/wiki/FASTQ_format.\n"
+"\n"
"-g, --gel_cut lower:upper [default: no cut]\n"
"\tStrict lower and upper boundaries for fragment length, representing a\n"
"\"cut\" of a gel. The default is no boundaries.\n"
@@ -166,6 +184,7 @@
static struct option longopts[] = {
{ "bias", required_argument, NULL, 'b'},
{ "cov", required_argument, NULL, 'c'},
+ { "output", required_argument, NULL, 'o'},
{ "gel_cut", required_argument, NULL, 'g'},
{ "insert", required_argument, NULL, 'i'},
{ "mutate", optional_argument, NULL, 3},
@@ -188,6 +207,7 @@
bool paired;
uint32_t seed;
real_t variance,cov,strand_bias,coverage;
+ CSTRING output;
enum strand_opt strand;
FILE * multiplier_fp;
real_t cut_lower, cut_upper;
@@ -204,6 +224,7 @@
opt->variance = 0;
opt->cov = DEFAULT_COV;
opt->coverage = DEFAULT_COVERAGE;
+ opt->output = DEFAULT_OUT;
opt->nfragment = 0;
opt->paired = true;
opt->strand_bias = DEFAULT_BIAS;
@@ -249,7 +270,7 @@
OPT opt = new_OPT();
validate(NULL!=opt,NULL);
- while ((ch = getopt_long(argc, argv, "b:c:g:i:m:n:pr:s:v:x:h", longopts, NULL)) != -1){
+ while ((ch = getopt_long(argc, argv, "b:c:o:g:i:m:n:pr:s:v:x:h", longopts, NULL)) != -1){
switch(ch){
case 'b':
opt->strand_bias = parse_real(optarg);
@@ -259,6 +280,10 @@
opt->cov = parse_real(optarg);
if(opt->cov<0.0){errx(EXIT_FAILURE,"Coefficient Of Variance of for insert size should be non-zero");}
break;
+ case 'o':
+ if(!strncasecmp(optarg,"fasta",5)){ opt->output = "fasta"; break;}
+ if(!strncasecmp(optarg,"casava",6)){ opt->output = "casava"; break;}
+ errx(EXIT_FAILURE,"Unrecognised choice fastq format\"%s\"",optarg); break;
case 'g':
ret = sscanf(optarg, real_format_str ":" real_format_str ,&opt->cut_lower,&opt->cut_upper);
if(ret<1){
@@ -426,12 +451,12 @@
free_SEQ(fragseq);
SEQ sampseq = (strand=='+')? copy_SEQ(mutseq) : reverse_complement_SEQ(mutseq,false);
free_SEQ(mutseq);
-
- CSTRING sampname = fragname(seq->name,i+1,strand,loc,fraglen);
+
+ CSTRING sampname = fragname(seq->name,i+1,strand,loc,fraglen, opt->output);
free_CSTRING(sampseq->name);
sampseq->name = sampname;
- show_SEQ(stdout,sampseq);
+ show_SEQ(stdout,sampseq, opt->output);
free_SEQ(sampseq);
if( (tot_fragments%100000)==99999 ){ fprintf(stderr,"\rDone: %8u",tot_fragments+1); }
}
diff -ru simNGS.orig/src/simNGS.c simNGS/src/simNGS.c
--- simNGS.orig/src/simNGS.c 2012-01-01 13:49:39.000000000 +0200
+++ simNGS/src/simNGS.c 2013-04-24 17:42:48.322593000 +0300
@@ -43,12 +43,12 @@
#define STRINGIFY(A) #A
#define ILLUMINA_ADAPTER "AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGAT"
#define PROGNAME "simNGS"
-#define PROGVERSION "1.6"
+#define PROGVERSION "1.7"
enum paired_type { PAIRED_TYPE_SINGLE=0, PAIRED_TYPE_CYCLE, PAIRED_TYPE_PAIRED };
char * paired_type_str[] = {"single","cycle","paired"};
-enum outformat { OUTPUT_LIKE=0, OUTPUT_FASTA, OUTPUT_FASTQ };
-const char * output_format_str[] = { "likelihood", "fasta", "fastq" };
+enum outformat { OUTPUT_LIKE=0, OUTPUT_FASTA, OUTPUT_FASTQ, OUTPUT_CASAVA };
+const char * output_format_str[] = { "likelihood", "fasta", "fastq", "casava" };
ARRAY(NUC) ambigseq = {NULL,0};
@@ -234,7 +234,7 @@
"\n"
"-o, --output format [default: fastq]\n"
"\tFormat in which to output results. Either \"likelihood\", \"fasta\",\n"
-"or \"fastq\".\n"
+",\"fastq\" or \"casava\".\n"
"\n"
"-O, --outfile prefix [default: stdout]\n"
"\tPrefix of filenames to output results. If simulating paired-end\n"
@@ -564,6 +564,9 @@
else if ( strcasecmp(optarg,output_format_str[OUTPUT_FASTQ])==0 ){
simopt->format = OUTPUT_FASTQ;
if(simopt->mu==0){ simopt->mu = 1e-5;}
+ } else if ( strcasecmp(optarg,output_format_str[OUTPUT_CASAVA])==0 ){
+ simopt->format = OUTPUT_CASAVA;
+ if(simopt->mu==0){ simopt->mu = 1e-5;}
} else {
errx(EXIT_FAILURE,"Unrecognised output option %s.",optarg);
}
@@ -730,8 +733,14 @@
output_fastq_sub(simopt->outfp[0],simopt,seqname,cigar1,"",called1,called2);
break;
case PAIRED_TYPE_PAIRED:
- output_fastq_sub(simopt->outfp[0],simopt,seqname,cigar1,use_suff?"/1":"",called1,NULL);
- output_fastq_sub(simopt->outfp[1],simopt,seqname,cigar2,use_suff?"/2":"",called2,NULL);
+ if (simopt->format == OUTPUT_CASAVA) {
+ output_fastq_sub(simopt->outfp[0],simopt,seqname,null_CIGLIST,"",called1,NULL);
+ output_fastq_sub(simopt->outfp[1],simopt,seqname,null_CIGLIST,"",called2,NULL);
+ }
+ else {
+ output_fastq_sub(simopt->outfp[0],simopt,seqname,cigar1,use_suff?"/1":"",called1,NULL);
+ output_fastq_sub(simopt->outfp[1],simopt,seqname,cigar2,use_suff?"/2":"",called2,NULL);
+ }
break;
default:
errx(EXIT_FAILURE,"Unrecognised case %s (%s:%d)",__func__,__FILE__,__LINE__);
@@ -758,6 +767,9 @@
case OUTPUT_FASTQ:
output_fastq(simopt,seqname,cigar1,cigar2,called1,called2);
break;
+ case OUTPUT_CASAVA:
+ output_fastq(simopt,seqname,cigar1,cigar2,called1,called2);
+ break;
default:
errx(EXIT_FAILURE,"Unrecognised format in %s (%s:%d)",__func__,__FILE__,__LINE__);
}
@@ -1029,6 +1041,7 @@
switch(simopt->format){
case OUTPUT_LIKE: strcpy(fn+offset,".like"); offset+=5; break;
case OUTPUT_FASTQ: strcpy(fn+offset,".fq"); offset+=3; break;
+ case OUTPUT_CASAVA: strcpy(fn+offset,".fq"); offset+=3; break;
case OUTPUT_FASTA: strcpy(fn+offset,".fa"); offset+=3; break;
}
*(fn+offset) = '\0';
@@ -1082,8 +1095,15 @@
seqstr->name = copy_CSTRING(seq->name);
seqstr->seq = copy_ARRAY(NUC)(seq->seq);
seqstr->paired = model->paired;
- seqstr->cigar1 = sub_cigar(seq->cigar,model->ncycle);
- seqstr->cigar2 = null_CIGLIST;
+
+ // No cigar strings for standard CASAVA 1.8 FASTQ format
+ if(simopt->format == OUTPUT_CASAVA) {
+ seqstr->cigar2 = null_CIGLIST;
+ seqstr->cigar1 = null_CIGLIST;
+ } else {
+ seqstr->cigar1 = sub_cigar(seq->cigar,model->ncycle);
+ seqstr->cigar2 = null_CIGLIST;
+ }
// Pick copula
struct pair_double lambda = correlated_distribution(simopt->threshold,simopt->corr,model->dist1,model->dist2);
seqstr->lambda1 = lambda.x1;
@@ -1091,7 +1111,7 @@
// Generate intensities
seqstr->int1 = generate_pure_intensities(simopt->sdfact,lambda.x1,seqstr->seq,simopt->adapter1,model->ncycle,model->chol1_cycle,simopt->dustProb,simopt->invA,simopt->N,NULL);
- if ( model->paired ){
+ if ( model->paired){
seqstr->rcseq = reverse_complement(seqstr->seq);
CIGLIST revcig = reverse_cigar(seq->cigar);
seqstr->cigar2 = sub_cigar(revcig,model->ncycle);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment