Skip to content

Instantly share code, notes, and snippets.

@guillermo-carrasco
Created May 13, 2013 16:13
Show Gist options
  • Save guillermo-carrasco/5569506 to your computer and use it in GitHub Desktop.
Save guillermo-carrasco/5569506 to your computer and use it in GitHub Desktop.
Unique read IDs for simLibrary
diff -ru simNGS.orig/src/simlibrary.c simNGS/src/simlibrary.c
--- simNGS.orig/src/simlibrary.c 2013-05-13 17:45:17.000000000 +0200
+++ simNGS/src/simlibrary.c 2013-05-13 18:06:41.000000000 +0200
@@ -33,12 +33,13 @@
#define QUOTE(A) Q_(A)
#define DEFAULT_COV 0.055
#define DEFAULT_OUT "fasta"
+#define DEFAULT_UNIQUE "not"
#define DEFAULT_INSERT 400
#define DEFAULT_NCYCLE 45
#define DEFAULT_COVERAGE 2.0
#define DEFAULT_BIAS 0.5
#define PROGNAME "simLibrary"
-#define PROGVERSION "1.4"
+#define PROGVERSION "1.5"
uint32_t nfragment_from_coverage(const uint32_t genlen, const real_t coverage, const uint32_t readlen, const bool paired){
const uint32_t bases_per_read = paired?(2*readlen):readlen;
@@ -71,7 +72,7 @@
"Split sequence into a simulated library of fragments\n"
"\n"
"Usage:\n"
-"\t" PROGNAME " [-b bias] [-c cov] [-g lower:upper] [-i insertlen]\n"
+"\t" PROGNAME " [-b bias] [-c cov] [-g lower:upper] [-i insertlen] [-u unique]\n"
"\t [-m multiplier_file] [-n nfragments] -p [-r readlen] [-s strand]\n"
"\t [-v variance] [-x coverage] [-o output] [--seed seed] seq1.fa ...\n"
"\t" PROGNAME " --help\n"
@@ -124,6 +125,9 @@
"and \"casava\", for a more standard (CASAVA 1.8) format as shown in:\n"
"http://en.wikipedia.org/wiki/FASTQ_format.\n"
"\n"
+"-u, --unique [default: " QUOTE(DEFAULT_UNIQUE) "]\n"
+"\tGenerate unique identifiers for the reads, independently if the reference fasta has\n"
+"more than one chromosome or if the input are several files.\n"
"-g, --gel_cut lower:upper [default: no cut]\n"
"\tStrict lower and upper boundaries for fragment length, representing a\n"
"\"cut\" of a gel. The default is no boundaries.\n"
@@ -191,6 +195,7 @@
{ "bias", required_argument, NULL, 'b'},
{ "cov", required_argument, NULL, 'c'},
{ "output", required_argument, NULL, 'o'},
+ { "unique", required_argument, NULL, 'u'},
{ "gel_cut", required_argument, NULL, 'g'},
{ "insert", required_argument, NULL, 'i'},
{ "mutate", optional_argument, NULL, 3},
@@ -213,7 +218,7 @@
bool paired;
uint32_t seed;
real_t variance,cov,strand_bias,coverage;
- CSTRING output;
+ CSTRING output, unique;
enum strand_opt strand;
FILE * multiplier_fp;
real_t cut_lower, cut_upper;
@@ -231,6 +236,7 @@
opt->cov = DEFAULT_COV;
opt->coverage = DEFAULT_COVERAGE;
opt->output = DEFAULT_OUT;
+ opt->unique = DEFAULT_UNIQUE;
opt->nfragment = 0;
opt->paired = true;
opt->strand_bias = DEFAULT_BIAS;
@@ -276,7 +282,7 @@
OPT opt = new_OPT();
validate(NULL!=opt,NULL);
- while ((ch = getopt_long(argc, argv, "b:c:o:g:i:m:n:pr:s:v:x:h", longopts, NULL)) != -1){
+ while ((ch = getopt_long(argc, argv, "b:c:o:u:g:i:m:n:pr:s:v:x:h", longopts, NULL)) != -1){
switch(ch){
case 'b':
opt->strand_bias = parse_real(optarg);
@@ -290,6 +296,8 @@
if(!strncasecmp(optarg,"fasta",5)){ opt->output = "fasta"; break;}
if(!strncasecmp(optarg,"casava",6)){ opt->output = "casava"; break;}
errx(EXIT_FAILURE,"Unrecognised choice \"%s\" for --output format",optarg); break;
+ case 'u':
+ if(!strncasecmp(optarg,"unique",6)){ opt->unique = "unique"; break;}
case 'g':
ret = sscanf(optarg, real_format_str ":" real_format_str ,&opt->cut_lower,&opt->cut_upper);
if(ret<1){
@@ -421,6 +429,9 @@
warnx("Failed to open file \"%s\" for input",argv[0]);
}
}
+ //Keep a counter with the number of chromosomes (or files)
+ //to get unique read IDs
+ uint32_t chromosomes = 0;
while (NULL!=fp && (seq=sequence_from_fasta(fp))!=NULL){
// Read multiplier from file, if available
double multiplier = 1.;
@@ -440,6 +451,8 @@
// Number of fragments
uint32_t nfragment = multiplier * ( (opt->nfragment)?opt->nfragment:nfragment_from_coverage(seq->length,opt->coverage,opt->ncycle,opt->paired) );
+ uint32_t seq_id = 0;
+ if(!strcmp(opt->unique, "unique")) seq_id = chromosomes*nfragment;
for ( uint32_t i=0 ; i<nfragment ; i++,tot_fragments++){
const uint32_t fraglen = (opt->paired)?(uint32_t)(rlognorm_with_cuts(log_mean,log_sd,opt->cut_lower,opt->cut_upper)):opt->ncycle;
if(fraglen>seq->length){
@@ -458,7 +471,7 @@
SEQ sampseq = (strand=='+')? copy_SEQ(mutseq) : reverse_complement_SEQ(mutseq,false);
free_SEQ(mutseq);
- CSTRING sampname = fragname(seq->name,i+1,strand,loc,fraglen, opt->output);
+ CSTRING sampname = fragname(seq->name,seq_id+i+1,strand,loc,fraglen, opt->output);
free_CSTRING(sampseq->name);
sampseq->name = sampname;
@@ -467,6 +480,7 @@
if( (tot_fragments%100000)==99999 ){ fprintf(stderr,"\rDone: %8u",tot_fragments+1); }
}
free_SEQ(seq);
+ chromosomes++;
}
fclose(fp);
argc--;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment