-
-
Save pb-cdunn/a71046d8520c195e74e4 to your computer and use it in GitHub Desktop.
Split dazzler-db into smaller blocks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/******************************************************************************************* | |
* | |
* Split a .db into a set of sub-database blocks for use by the Dazzler: | |
* Divide the database <path>.db conceptually into a series of blocks referable to on the | |
* command line as <path>.1.db, <path>.2.db, ... If the -x option is set then all reads | |
* less than the given length are ignored, and if the -a option is not set then secondary | |
* reads from a given well are also ignored. The remaining reads are split amongst the | |
* blocks so that each block is of size -s * 1Mbp except for the last which necessarily | |
* contains a smaller residual. The default value for -s is 400Mbp because blocks of this | |
* size can be compared by our "overlapper" dalign in roughly 16Gb of memory. The blocks | |
* are very space efficient in that their sub-index of the master .idx is computed on the | |
* fly when loaded, and the .bps file of base pairs is shared with the master DB. Any | |
* tracks associated with the DB are also computed on the fly when loading a database block. | |
* | |
* Author: Gene Myers | |
* Date : September 2013 | |
* Mod : New splitting definition to support incrementality, and new stub file format | |
* Date : April 2014 | |
* | |
********************************************************************************************/ | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <string.h> | |
#include <unistd.h> | |
#include "DB.h" | |
#ifdef HIDE_FILES | |
#define PATHSEP "/." | |
#else | |
#define PATHSEP "/" | |
#endif | |
static char *Usage = "[-a] [-x<int>] [-s<float(200.)>] <path:db|dam>"; | |
int main(int argc, char *argv[]) | |
{ HITS_DB db, dbs; | |
int64 dbpos; | |
FILE *dbfile, *ixfile; | |
int status; | |
int ALL; | |
int CUTOFF; | |
int64 SIZE; | |
{ int i, j, k; | |
int flags[128]; | |
char *eptr; | |
float size; | |
ARG_INIT("DBsplit") | |
CUTOFF = 0; | |
size = 200; | |
j = 1; | |
for (i = 1; i < argc; i++) | |
if (argv[i][0] == '-') | |
switch (argv[i][1]) | |
{ default: | |
ARG_FLAGS("a") | |
break; | |
case 'x': | |
ARG_NON_NEGATIVE(CUTOFF,"Min read length cutoff") | |
break; | |
case 's': | |
ARG_REAL(size) | |
if (size <= 0.) | |
{ fprintf(stderr,"%s: Block size must be a positive number\n",Prog_Name); | |
exit (1); | |
} | |
break; | |
} | |
else | |
argv[j++] = argv[i]; | |
argc = j; | |
SIZE = size*1000000ll; | |
ALL = flags['a']; | |
if (argc != 2) | |
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); | |
exit (1); | |
} | |
} | |
// Open db | |
status = Open_DB(argv[1],&db); | |
if (status < 0) | |
exit (1); | |
if (db.part > 0) | |
{ fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]); | |
exit (1); | |
} | |
{ char *pwd, *root; | |
char buffer[2*MAX_NAME+100]; | |
int nfiles; | |
int i; | |
pwd = PathTo(argv[1]); | |
if (status) | |
{ root = Root(argv[1],".dam"); | |
dbfile = Fopen(Catenate(pwd,"/",root,".dam"),"r+"); | |
} | |
else | |
{ root = Root(argv[1],".db"); | |
dbfile = Fopen(Catenate(pwd,"/",root,".db"),"r+"); | |
} | |
ixfile = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+"); | |
if (dbfile == NULL || ixfile == NULL) | |
exit (1); | |
free(pwd); | |
free(root); | |
if (fscanf(dbfile,DB_NFILE,&nfiles) != 1) | |
SYSTEM_ERROR | |
for (i = 0; i < nfiles; i++) | |
if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL) | |
SYSTEM_ERROR | |
if (fread(&dbs,sizeof(HITS_DB),1,ixfile) != 1) | |
SYSTEM_ERROR | |
if (dbs.cutoff >= 0) | |
{ printf("You are about to overwrite the current partition settings. This\n"); | |
printf("will invalidate any tracks, overlaps, and other derivative files.\n"); | |
printf("Are you sure you want to proceed? [Y/N] "); | |
fflush(stdout); | |
if (fgets(buffer,100,stdin) == NULL) | |
SYSTEM_ERROR | |
if (index(buffer,'n') != NULL || index(buffer,'N') != NULL) | |
{ printf("Aborted\n"); | |
fflush(stdout); | |
fclose(dbfile); | |
exit (1); | |
} | |
} | |
dbpos = ftello(dbfile); | |
fseeko(dbfile,dbpos,SEEK_SET); | |
fprintf(dbfile,DB_NBLOCK,0); | |
fprintf(dbfile,DB_PARAMS,SIZE,CUTOFF,ALL); | |
} | |
{ HITS_READ *reads = db.reads; | |
int nreads = db.ureads; | |
int64 totlen; | |
int nblock, ireads, treads, rlen, fno; | |
int i; | |
nblock = 0; | |
totlen = 0; | |
ireads = 0; | |
treads = 0; | |
fprintf(dbfile,DB_BDATA,0,0); | |
if (ALL) | |
for (i = 0; i < nreads; i++) | |
{ rlen = reads[i].rlen; | |
if (rlen >= CUTOFF) | |
{ ireads += 1; | |
treads += 1; | |
totlen += rlen; | |
if (totlen >= SIZE) | |
{ fprintf(dbfile,DB_BDATA,i+1,treads); | |
totlen = 0; | |
ireads = 0; | |
nblock += 1; | |
} | |
} | |
} | |
else | |
for (i = 0; i < nreads; i++) | |
{ rlen = reads[i].rlen; | |
if (rlen >= CUTOFF && (reads[i].flags & DB_BEST) != 0) | |
{ ireads += 1; | |
treads += 1; | |
totlen += rlen; | |
if (totlen >= SIZE) | |
{ fprintf(dbfile,DB_BDATA,i+1,treads); | |
totlen = 0; | |
ireads = 0; | |
nblock += 1; | |
} | |
} | |
} | |
if (ireads > 0) | |
{ fprintf(dbfile,DB_BDATA,nreads,treads); | |
nblock += 1; | |
} | |
fno = fileno(dbfile); | |
if (ftruncate(fno,ftello(dbfile)) < 0) | |
SYSTEM_ERROR | |
fseeko(dbfile,dbpos,SEEK_SET); | |
fprintf(dbfile,DB_NBLOCK,nblock); | |
dbs.cutoff = CUTOFF; | |
dbs.all = ALL; | |
dbs.treads = treads; | |
rewind(ixfile); | |
fwrite(&dbs,sizeof(HITS_DB),1,ixfile); | |
} | |
fclose(ixfile); | |
fclose(dbfile); | |
Close_DB(&db); | |
exit (0); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment