Skip to content

Instantly share code, notes, and snippets.

@pb-cdunn
Created December 14, 2015 14:33
Show Gist options
  • Save pb-cdunn/a71046d8520c195e74e4 to your computer and use it in GitHub Desktop.
Save pb-cdunn/a71046d8520c195e74e4 to your computer and use it in GitHub Desktop.
Split dazzler-db into smaller blocks
/*******************************************************************************************
*
* Split a .db into a set of sub-database blocks for use by the Dazzler:
* Divide the database <path>.db conceptually into a series of blocks referable to on the
* command line as <path>.1.db, <path>.2.db, ... If the -x option is set then all reads
* less than the given length are ignored, and if the -a option is not set then secondary
* reads from a given well are also ignored. The remaining reads are split amongst the
* blocks so that each block is of size -s * 1Mbp except for the last which necessarily
* contains a smaller residual. The default value for -s is 400Mbp because blocks of this
* size can be compared by our "overlapper" dalign in roughly 16Gb of memory. The blocks
* are very space efficient in that their sub-index of the master .idx is computed on the
* fly when loaded, and the .bps file of base pairs is shared with the master DB. Any
* tracks associated with the DB are also computed on the fly when loading a database block.
*
* Author: Gene Myers
* Date : September 2013
* Mod : New splitting definition to support incrementality, and new stub file format
* Date : April 2014
*
********************************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include "DB.h"
#ifdef HIDE_FILES
#define PATHSEP "/."
#else
#define PATHSEP "/"
#endif
static char *Usage = "[-a] [-x<int>] [-s<float(200.)>] <path:db|dam>";
int main(int argc, char *argv[])
{ HITS_DB db, dbs;
int64 dbpos;
FILE *dbfile, *ixfile;
int status;
int ALL;
int CUTOFF;
int64 SIZE;
{ int i, j, k;
int flags[128];
char *eptr;
float size;
ARG_INIT("DBsplit")
CUTOFF = 0;
size = 200;
j = 1;
for (i = 1; i < argc; i++)
if (argv[i][0] == '-')
switch (argv[i][1])
{ default:
ARG_FLAGS("a")
break;
case 'x':
ARG_NON_NEGATIVE(CUTOFF,"Min read length cutoff")
break;
case 's':
ARG_REAL(size)
if (size <= 0.)
{ fprintf(stderr,"%s: Block size must be a positive number\n",Prog_Name);
exit (1);
}
break;
}
else
argv[j++] = argv[i];
argc = j;
SIZE = size*1000000ll;
ALL = flags['a'];
if (argc != 2)
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage);
exit (1);
}
}
// Open db
status = Open_DB(argv[1],&db);
if (status < 0)
exit (1);
if (db.part > 0)
{ fprintf(stderr,"%s: Cannot be called on a block: %s\n",Prog_Name,argv[1]);
exit (1);
}
{ char *pwd, *root;
char buffer[2*MAX_NAME+100];
int nfiles;
int i;
pwd = PathTo(argv[1]);
if (status)
{ root = Root(argv[1],".dam");
dbfile = Fopen(Catenate(pwd,"/",root,".dam"),"r+");
}
else
{ root = Root(argv[1],".db");
dbfile = Fopen(Catenate(pwd,"/",root,".db"),"r+");
}
ixfile = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+");
if (dbfile == NULL || ixfile == NULL)
exit (1);
free(pwd);
free(root);
if (fscanf(dbfile,DB_NFILE,&nfiles) != 1)
SYSTEM_ERROR
for (i = 0; i < nfiles; i++)
if (fgets(buffer,2*MAX_NAME+100,dbfile) == NULL)
SYSTEM_ERROR
if (fread(&dbs,sizeof(HITS_DB),1,ixfile) != 1)
SYSTEM_ERROR
if (dbs.cutoff >= 0)
{ printf("You are about to overwrite the current partition settings. This\n");
printf("will invalidate any tracks, overlaps, and other derivative files.\n");
printf("Are you sure you want to proceed? [Y/N] ");
fflush(stdout);
if (fgets(buffer,100,stdin) == NULL)
SYSTEM_ERROR
if (index(buffer,'n') != NULL || index(buffer,'N') != NULL)
{ printf("Aborted\n");
fflush(stdout);
fclose(dbfile);
exit (1);
}
}
dbpos = ftello(dbfile);
fseeko(dbfile,dbpos,SEEK_SET);
fprintf(dbfile,DB_NBLOCK,0);
fprintf(dbfile,DB_PARAMS,SIZE,CUTOFF,ALL);
}
{ HITS_READ *reads = db.reads;
int nreads = db.ureads;
int64 totlen;
int nblock, ireads, treads, rlen, fno;
int i;
nblock = 0;
totlen = 0;
ireads = 0;
treads = 0;
fprintf(dbfile,DB_BDATA,0,0);
if (ALL)
for (i = 0; i < nreads; i++)
{ rlen = reads[i].rlen;
if (rlen >= CUTOFF)
{ ireads += 1;
treads += 1;
totlen += rlen;
if (totlen >= SIZE)
{ fprintf(dbfile,DB_BDATA,i+1,treads);
totlen = 0;
ireads = 0;
nblock += 1;
}
}
}
else
for (i = 0; i < nreads; i++)
{ rlen = reads[i].rlen;
if (rlen >= CUTOFF && (reads[i].flags & DB_BEST) != 0)
{ ireads += 1;
treads += 1;
totlen += rlen;
if (totlen >= SIZE)
{ fprintf(dbfile,DB_BDATA,i+1,treads);
totlen = 0;
ireads = 0;
nblock += 1;
}
}
}
if (ireads > 0)
{ fprintf(dbfile,DB_BDATA,nreads,treads);
nblock += 1;
}
fno = fileno(dbfile);
if (ftruncate(fno,ftello(dbfile)) < 0)
SYSTEM_ERROR
fseeko(dbfile,dbpos,SEEK_SET);
fprintf(dbfile,DB_NBLOCK,nblock);
dbs.cutoff = CUTOFF;
dbs.all = ALL;
dbs.treads = treads;
rewind(ixfile);
fwrite(&dbs,sizeof(HITS_DB),1,ixfile);
}
fclose(ixfile);
fclose(dbfile);
Close_DB(&db);
exit (0);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment