-
-
Save pb-cdunn/41da6632b0cc4ff8b059 to your computer and use it in GitHub Desktop.
Accept long fasta lines
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/******************************************************************************************* | |
* | |
* Add .fasta files to a DB: | |
* Adds the given fasta files in the given order to <path>.db. If the db does not exist | |
* then it is created. All .fasta files added to a given data base must have the same | |
* header format and follow Pacbio's convention. A file cannot be added twice and this | |
* is enforced. The command either builds or appends to the .<path>.idx and .<path>.bps | |
* files, where the index file (.idx) contains information about each read and their offsets | |
* in the base-pair file (.bps) that holds the sequences where each base is compessed | |
* into 2-bits. The two files are hidden by virtue of their names beginning with a '.'. | |
* <path>.db is effectively a stub file with given name that contains an ASCII listing | |
* of the files added to the DB and possibly the block partitioning for the DB if DBsplit | |
* has been called upon it. | |
* | |
* Author: Gene Myers | |
* Date : May 2013 | |
* Modify: DB upgrade: now *add to* or create a DB depending on whether it exists, read | |
* multiple .fasta files (no longer a stdin pipe). | |
* Date : April 2014 | |
* | |
********************************************************************************************/ | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <string.h> | |
#include <strings.h> | |
#include <sys/stat.h> | |
#include <unistd.h> | |
#include "DB.h" | |
#ifdef HIDE_FILES | |
#define PATHSEP "/." | |
#else | |
#define PATHSEP "/" | |
#endif | |
static char *Usage = "[-v] [-p<string>] <path:string> ( -f<file> | <input:fasta> ... )"; | |
static char number[128] = | |
{ 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 1, 0, 0, 0, 2, | |
0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 3, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 1, 0, 0, 0, 2, | |
0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 3, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, | |
}; | |
typedef struct | |
{ int argc; | |
char **argv; | |
FILE *input; | |
int count; | |
char *name; | |
} File_Iterator; | |
File_Iterator *init_file_iterator(int argc, char **argv, FILE *input, int first) | |
{ File_Iterator *it; | |
it = Malloc(sizeof(File_Iterator),"Allocating file iterator"); | |
it->argc = argc; | |
it->argv = argv; | |
it->input = input; | |
if (input == NULL) | |
it->count = first; | |
else | |
{ it->count = 1; | |
rewind(input); | |
} | |
return (it); | |
} | |
int next_file(File_Iterator *it) | |
{ static char nbuffer[MAX_NAME+8]; | |
if (it->input == NULL) | |
{ if (it->count >= it->argc) | |
return (0); | |
it->name = it->argv[it->count++]; | |
} | |
else | |
{ char *eol; | |
if (fgets(nbuffer,MAX_NAME+8,it->input) == NULL) | |
{ if (feof(it->input)) | |
return (0); | |
SYSTEM_ERROR; | |
} | |
if ((eol = index(nbuffer,'\n')) == NULL) | |
{ fprintf(stderr,"%s: Line %d in file list is longer than %d chars!\n", | |
Prog_Name,it->count,MAX_NAME+7); | |
it->name = NULL; | |
} | |
*eol = '\0'; | |
it->count += 1; | |
it->name = nbuffer; | |
} | |
return (1); | |
} | |
int main(int argc, char *argv[]) | |
{ FILE *istub, *ostub; | |
char *dbname; | |
char *root, *pwd; | |
FILE *bases, *indx; | |
int64 boff, ioff; | |
int ifiles, ofiles; | |
char **flist; | |
HITS_DB db; | |
int ureads; | |
int64 offset; | |
FILE *IFILE; | |
char *PNAME; | |
int VERBOSE; | |
// Usage: [-v] <path:string> ( -f<file> | <input:fasta> ... ) | |
{ int i, j, k; | |
int flags[128]; | |
ARG_INIT("fasta2DB") | |
IFILE = NULL; | |
PNAME = NULL; | |
j = 1; | |
for (i = 1; i < argc; i++) | |
if (argv[i][0] == '-') | |
switch (argv[i][1]) | |
{ default: | |
ARG_FLAGS("v") | |
break; | |
case 'f': | |
IFILE = fopen(argv[i]+2,"r"); | |
if (IFILE == NULL) | |
{ fprintf(stderr,"%s: Cannot open file of inputs '%s'\n",Prog_Name,argv[i]+2); | |
exit (1); | |
} | |
break; | |
case 'p': | |
PNAME = argv[i]+2; | |
break; | |
} | |
else | |
argv[j++] = argv[i]; | |
argc = j; | |
VERBOSE = flags['v']; | |
if ((IFILE == NULL && argc <= 2) || (IFILE != NULL && argc != 2)) | |
{ fprintf(stderr,"Usage: %s %s\n",Prog_Name,Usage); | |
exit (1); | |
} | |
} | |
// Try to open DB file, if present then adding to DB, otherwise creating new DB. Set up | |
// variables as follows: | |
// dbname = full name of db = <pwd>/<root>.db | |
// istub = open db file (if adding) or NULL (if creating) | |
// ostub = new image of db file (will overwrite old image at end) | |
// bases = .bps file positioned for appending | |
// indx = .idx file positioned for appending | |
// ureads = # of reads currently in db | |
// offset = offset in .bps at which to place next sequence | |
// ioff = offset in .idx file to truncate to if command fails | |
// boff = offset in .bps file to truncate to if command fails | |
// ifiles = # of .fasta files to add | |
// ofiles = # of .fasta files already in db | |
// flist = [0..ifiles+ofiles] list of file names (root only) added to db so far | |
{ int i; | |
root = Root(argv[1],".db"); | |
pwd = PathTo(argv[1]); | |
dbname = Strdup(Catenate(pwd,"/",root,".db"),"Allocating db name"); | |
if (dbname == NULL) | |
exit (1); | |
if (IFILE == NULL) | |
ifiles = argc-2; | |
else | |
{ File_Iterator *ng; | |
ifiles = 0; | |
ng = init_file_iterator(argc,argv,IFILE,2); | |
while (next_file(ng)) | |
ifiles += 1; | |
free(ng); | |
} | |
istub = fopen(dbname,"r"); | |
if (istub == NULL) | |
{ ofiles = 0; | |
bases = Fopen(Catenate(pwd,PATHSEP,root,".bps"),"w+"); | |
indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"w+"); | |
if (bases == NULL || indx == NULL) | |
exit (1); | |
fwrite(&db,sizeof(HITS_DB),1,indx); | |
ureads = 0; | |
offset = 0; | |
boff = 0; | |
ioff = 0; | |
} | |
else | |
{ if (fscanf(istub,DB_NFILE,&ofiles) != 1) | |
SYSTEM_ERROR | |
bases = Fopen(Catenate(pwd,PATHSEP,root,".bps"),"r+"); | |
indx = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r+"); | |
if (bases == NULL || indx == NULL) | |
exit (1); | |
if (fread(&db,sizeof(HITS_DB),1,indx) != 1) | |
SYSTEM_ERROR | |
fseeko(bases,0,SEEK_END); | |
fseeko(indx, 0,SEEK_END); | |
ureads = db.ureads; | |
offset = ftello(bases); | |
boff = offset; | |
ioff = ftello(indx); | |
} | |
flist = (char **) Malloc(sizeof(char *)*(ofiles+ifiles),"Allocating file list"); | |
ostub = Fopen(Catenate(pwd,"/",root,".dbx"),"w+"); | |
if (ostub == NULL || flist == NULL) | |
exit (1); | |
fprintf(ostub,DB_NFILE,ofiles+ifiles); | |
for (i = 0; i < ofiles; i++) | |
{ int last; | |
char prolog[MAX_NAME], fname[MAX_NAME]; | |
if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) | |
SYSTEM_ERROR | |
if ((flist[i] = Strdup(fname,"Adding to file list")) == NULL) | |
goto error; | |
fprintf(ostub,DB_FDATA,last,fname,prolog); | |
} | |
} | |
{ int maxlen; | |
int64 totlen, count[4]; | |
int pmax, rmax; | |
HITS_READ *prec; | |
char *read; | |
int c; | |
File_Iterator *ng; | |
// Buffer for reads all in the same well | |
pmax = 100; | |
prec = (HITS_READ *) Malloc(sizeof(HITS_READ)*pmax,"Allocating record buffer"); | |
if (prec == NULL) | |
goto error; | |
// Buffer for accumulating .fasta sequence over multiple lines | |
rmax = MAX_NAME + 60000; | |
read = (char *) Malloc(rmax+1,"Allocating line buffer"); | |
if (read == NULL) | |
goto error; | |
totlen = 0; // total # of bases in new .fasta files | |
maxlen = 0; // longest read in new .fasta files | |
for (c = 0; c < 4; c++) // count of acgt in new .fasta files | |
count[c] = 0; | |
// For each new .fasta file do: | |
ng = init_file_iterator(argc,argv,IFILE,2); | |
while (next_file(ng)) | |
{ FILE *input; | |
char *path, *core, *prolog; | |
int nline, eof, rlen, pcnt; | |
int pwell; | |
if (ng->name == NULL) goto error; | |
// Open it: <path>/<core>.fasta, check that core is not too long, | |
// and checking that it is not already in flist. | |
path = PathTo(ng->name); | |
core = Root(ng->name,".fasta"); | |
if ((input = Fopen(Catenate(path,"/",core,".fasta"),"r")) == NULL) | |
goto error; | |
free(path); | |
if (strlen(core) >= MAX_NAME) | |
{ fprintf(stderr,"%s: File name over %d chars: '%.200s'\n", | |
Prog_Name,MAX_NAME,core); | |
goto error; | |
} | |
{ int j; | |
for (j = 0; j < ofiles; j++) | |
if (strcmp(core,flist[j]) == 0) | |
{ fprintf(stderr,"%s: File %s.fasta is already in database %s.db\n", | |
Prog_Name,core,Root(argv[1],".db")); | |
goto error; | |
} | |
} | |
// Get the header of the first line. If the file is empty skip. | |
pcnt = 0; | |
rlen = 0; | |
nline = 1; | |
eof = (fgets(read,MAX_NAME,input) == NULL); | |
if (eof || strlen(read) < 1) | |
{ fprintf(stderr,"Skipping '%s', file is empty!\n",core); | |
fclose(input); | |
free(core); | |
continue; | |
} | |
// Add the file name to flist | |
if (VERBOSE) | |
{ fprintf(stderr,"Adding '%s' ...\n",core); | |
fflush(stderr); | |
} | |
flist[ofiles++] = core; | |
// Check that the first line has PACBIO format, and record prolog in 'prolog'. | |
if (read[strlen(read)-1] != '\n') | |
{ fprintf(stderr,"File %s.fasta, Line 1: Fasta line is too long (> %d chars)\n", | |
core,MAX_NAME-2); | |
goto error; | |
} | |
if (!eof && read[0] != '>') | |
{ fprintf(stderr,"File %s.fasta, Line 1: First header in fasta file is missing\n",core); | |
goto error; | |
} | |
{ char *find; | |
int well, beg, end, qv; | |
find = index(read+1,'/'); | |
if (find != NULL && sscanf(find+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv) >= 3) | |
{ *find = '\0'; | |
if (PNAME != NULL) | |
prolog = Strdup(PNAME,"Extracting prolog"); | |
else | |
prolog = Strdup(read+1,"Extracting prolog"); | |
*find = '/'; | |
if (prolog == NULL) goto error; | |
} | |
else | |
{ fprintf(stderr,"File %s.fasta, Line %d: Pacbio header line format error\n", | |
core,nline); | |
goto error; | |
} | |
} | |
// Read in all the sequences until end-of-file | |
{ int i, x; | |
pwell = -1; | |
while (!eof) | |
{ int beg, end, clen; | |
int well, qv; | |
char *find; | |
find = index(read+(rlen+1),'/'); | |
if (find == NULL) | |
{ fprintf(stderr,"File %s.fasta, Line %d: Pacbio header line format error\n", | |
core,nline); | |
goto error; | |
} | |
if (PNAME == NULL) | |
{ *find = '\0'; | |
if (strcmp(read+(rlen+1),prolog) != 0) | |
{ fprintf(stderr, | |
"File %s.fasta, Line %d: Pacbio header line name inconsistent\n", | |
core,nline); | |
goto error; | |
} | |
*find = '/'; | |
} | |
x = sscanf(find+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv); | |
if (x < 3) | |
{ fprintf(stderr,"File %s.fasta, Line %d: Pacbio header line format error\n", | |
core,nline); | |
goto error; | |
} | |
else if (x == 3) | |
qv = 0; | |
rlen = 0; | |
while (1) | |
{ eof = (fgets(read+rlen,MAX_NAME,input) == NULL); | |
nline += 1; | |
x = strlen(read+rlen)-1; | |
if (read[rlen+x] != '\n') | |
{ if (read[rlen] == '>') | |
{ fprintf(stderr,"File %s.fasta, Line %d:",core,nline); | |
fprintf(stderr," Fasta header line is too long (> %d chars)\n", | |
MAX_NAME-2); | |
goto error; | |
} | |
else | |
x += 1; | |
} | |
if (eof || read[rlen] == '>') | |
break; | |
rlen += x; | |
if (rlen + MAX_NAME > rmax) | |
{ rmax = ((int) (1.2 * rmax)) + 1000 + MAX_NAME; | |
read = (char *) realloc(read,rmax+1); | |
if (read == NULL) | |
{ fprintf(stderr,"File %s.fasta, Line %d:",core,nline); | |
fprintf(stderr," Out of memory (Allocating line buffer)\n"); | |
goto error; | |
} | |
} | |
} | |
read[rlen] = '\0'; | |
for (i = 0; i < rlen; i++) | |
{ x = number[(int) read[i]]; | |
count[x] += 1; | |
read[i] = (char) x; | |
} | |
ureads += 1; | |
totlen += rlen; | |
if (rlen > maxlen) | |
maxlen = rlen; | |
prec[pcnt].origin = well; | |
prec[pcnt].fpulse = beg; | |
prec[pcnt].rlen = rlen; | |
prec[pcnt].boff = offset; | |
prec[pcnt].coff = -1; | |
prec[pcnt].flags = qv; | |
Compress_Read(rlen,read); | |
clen = COMPRESSED_LEN(rlen); | |
fwrite(read,1,clen,bases); | |
offset += clen; | |
if (pwell == well) | |
{ prec[pcnt].flags |= DB_CSS; | |
pcnt += 1; | |
if (pcnt >= pmax) | |
{ pmax = ((int) (pcnt*1.2)) + 100; | |
prec = (HITS_READ *) realloc(prec,sizeof(HITS_READ)*pmax); | |
if (prec == NULL) | |
{ fprintf(stderr,"File %s.fasta, Line %d: Out of memory",core,nline); | |
fprintf(stderr," (Allocating read records)\n"); | |
goto error; | |
} | |
} | |
} | |
else if (pcnt == 0) | |
pcnt += 1; | |
else | |
{ x = 0; | |
for (i = 1; i < pcnt; i++) | |
if (prec[i].rlen > prec[x].rlen) | |
x = i; | |
prec[x].flags |= DB_BEST; | |
fwrite(prec,sizeof(HITS_READ),pcnt,indx); | |
prec[0] = prec[pcnt]; | |
pcnt = 1; | |
} | |
pwell = well; | |
} | |
// Complete processing of .fasta file: flush last well group, write file line | |
// in db image, free prolog, and close file | |
x = 0; | |
for (i = 1; i < pcnt; i++) | |
if (prec[i].rlen > prec[x].rlen) | |
x = i; | |
prec[x].flags |= DB_BEST; | |
fwrite(prec,sizeof(HITS_READ),pcnt,indx); | |
fprintf(ostub,DB_FDATA,ureads,core,prolog); | |
} | |
free(prolog); | |
fclose(input); | |
} | |
// Finished loading all sequences: update relevant fields in db record | |
db.ureads = ureads; | |
if (istub == NULL) | |
{ for (c = 0; c < 4; c++) | |
db.freq[c] = (float) ((1.*count[c])/totlen); | |
db.totlen = totlen; | |
db.maxlen = maxlen; | |
db.cutoff = -1; | |
} | |
else | |
{ for (c = 0; c < 4; c++) | |
db.freq[c] = (float) ((db.freq[c]*db.totlen + (1.*count[c]))/(db.totlen + totlen)); | |
db.totlen += totlen; | |
if (maxlen > db.maxlen) | |
db.maxlen = maxlen; | |
} | |
} | |
// If db has been previously partitioned then calculate additional partition points and | |
// write to new db file image | |
if (db.cutoff >= 0) | |
{ int64 totlen, dbpos, size; | |
int nblock, ireads, tfirst, rlen; | |
int ufirst, cutoff, allflag; | |
HITS_READ record; | |
int i; | |
if (VERBOSE) | |
{ fprintf(stderr,"Updating block partition ...\n"); | |
fflush(stderr); | |
} | |
// Read the block portion of the existing db image getting the indices of the first | |
// read in the last block of the exisiting db as well as the partition parameters. | |
// Copy the old image block information to the new block information (except for | |
// the indices of the last partial block) | |
if (fscanf(istub,DB_NBLOCK,&nblock) != 1) | |
SYSTEM_ERROR | |
dbpos = ftello(ostub); | |
fprintf(ostub,DB_NBLOCK,0); | |
if (fscanf(istub,DB_PARAMS,&size,&cutoff,&allflag) != 3) | |
SYSTEM_ERROR | |
fprintf(ostub,DB_PARAMS,size,cutoff,allflag); | |
if (allflag) | |
allflag = 0; | |
else | |
allflag = DB_BEST; | |
nblock -= 1; | |
for (i = 0; i <= nblock; i++) | |
{ if (fscanf(istub,DB_BDATA,&ufirst,&tfirst) != 2) | |
SYSTEM_ERROR | |
fprintf(ostub,DB_BDATA,ufirst,tfirst); | |
} | |
// Seek the first record of the last block of the existing db in .idx, and then | |
// compute and record partition indices for the rest of the db from this point | |
// forward. | |
fseeko(indx,sizeof(HITS_DB)+sizeof(HITS_READ)*ufirst,SEEK_SET); | |
totlen = 0; | |
ireads = 0; | |
for (i = ufirst; i < ureads; i++) | |
{ if (fread(&record,sizeof(HITS_READ),1,indx) != 1) | |
SYSTEM_ERROR | |
rlen = record.rlen; | |
if (rlen >= cutoff && (record.flags & DB_BEST) >= allflag) | |
{ ireads += 1; | |
tfirst += 1; | |
totlen += rlen; | |
if (totlen >= size) | |
{ fprintf(ostub," %9d %9d\n",i+1,tfirst); | |
totlen = 0; | |
ireads = 0; | |
nblock += 1; | |
} | |
} | |
} | |
if (ireads > 0) | |
{ fprintf(ostub,DB_BDATA,ureads,tfirst); | |
nblock += 1; | |
} | |
db.treads = tfirst; | |
fseeko(ostub,dbpos,SEEK_SET); | |
fprintf(ostub,DB_NBLOCK,nblock); // Rewind and record the new number of blocks | |
} | |
else | |
db.treads = ureads; | |
rewind(indx); | |
fwrite(&db,sizeof(HITS_DB),1,indx); // Write the finalized db record into .idx | |
rewind(ostub); // Rewrite the number of files actually added | |
fprintf(ostub,DB_NFILE,ofiles); | |
if (istub != NULL) | |
fclose(istub); | |
fclose(ostub); | |
fclose(indx); | |
fclose(bases); | |
rename(Catenate(pwd,"/",root,".dbx"),dbname); // New image replaces old image | |
exit (0); | |
// Error exit: Either truncate or remove the .idx and .bps files as appropriate. | |
// Remove the new image file <pwd>/<root>.dbx | |
error: | |
if (ioff != 0) | |
{ fseeko(indx,0,SEEK_SET); | |
if (ftruncate(fileno(indx),ioff) < 0) | |
SYSTEM_ERROR | |
} | |
if (boff != 0) | |
{ fseeko(bases,0,SEEK_SET); | |
if (ftruncate(fileno(bases),boff) < 0) | |
SYSTEM_ERROR | |
} | |
fclose(indx); | |
fclose(bases); | |
if (ioff == 0) | |
unlink(Catenate(pwd,PATHSEP,root,".idx")); | |
if (boff == 0) | |
unlink(Catenate(pwd,PATHSEP,root,".bps")); | |
if (istub != NULL) | |
fclose(istub); | |
fclose(ostub); | |
unlink(Catenate(pwd,"/",root,".dbx")); | |
exit (1); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment