Created
May 11, 2010 19:39
-
-
Save lindenb/397760 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* A Test for the sqlite C API | |
* Pierre Lindenbaum http://plindenbaum.blogspot.com | |
* reads a HAPMAP file and put it in a relational sqlite DB | |
*/ | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <sqlite3.h> | |
#include <zutil.h> | |
/** | |
* parameters for the program | |
*/ | |
typedef struct hapMap2SQLite_t | |
{ | |
/** name of the sqlite file */ | |
char *fileout; | |
/** hapmap file in */ | |
char *filein; | |
/** input stream*/ | |
gzFile in; | |
/** connection to sqlite */ | |
sqlite3* connection; | |
}HapMap2SQLite,*HapMap2SQLitePtr; | |
/** quick'n dirty (=slow) version of readline . returns a string that should destroyed with 'free(ptr)' */ | |
static char* readline(gzFile in,int *length) | |
{ | |
int c; | |
char* p=NULL; | |
*length=0; | |
while((c=gzgetc(in))!=EOF) | |
{ | |
p=(char*)realloc(p,sizeof(char)*(*length+2)); | |
if(p==NULL) | |
{ | |
fprintf(stderr,"Out of memory\n"); | |
exit(EXIT_FAILURE); | |
} | |
if(c=='\n') | |
{ | |
p[*length]=0; | |
break; | |
} | |
p[*length]=c; | |
(*length)++; | |
p[*length]=0; | |
} | |
return p; | |
} | |
/** call back for the custom sqlite function 'homozygous' */ | |
void is_homozygous( | |
sqlite3_context* context, /* context */ | |
int argc, /* number of arguments */ | |
sqlite3_value** argv /* arguments */ | |
) | |
{ | |
/* one text arg */ | |
if(argc==1 && | |
sqlite3_value_type(argv[0])==SQLITE_TEXT) | |
{ | |
const char *alleles=(const char*)sqlite3_value_text(argv[0]); | |
if( alleles!=NULL && | |
strlen(alleles)==2 && | |
alleles[0]==alleles[1] && | |
alleles[0]!='N') | |
{ | |
/* set result to TRUE */ | |
sqlite3_result_int(context,1); | |
return; | |
} | |
} | |
/* set result to FALSE */ | |
sqlite3_result_int(context,0); | |
} | |
/** callback called by the 'select_homozygous' query */ | |
static int callback_select_homozygous( | |
void* notUsed, | |
int argc,/* number of args */ | |
char** argv,/* arguments as string */ | |
char** columns /* labels for the columns */ | |
) | |
{ | |
int i; | |
/* prints all the columns to stdout*/ | |
for(i=0; i<argc; i++) | |
{ | |
printf( "\"%s\": \"%s\"\n", | |
columns[i], | |
argv[i]!=NULL? argv[i] : "NULL" | |
); | |
} | |
printf("\n"); | |
return 0; | |
} | |
/** | |
* this is a test query. | |
* it selects all the markers/individuals /genotypes homozygous | |
*/ | |
static void select_homozygous(HapMap2SQLitePtr env) | |
{ | |
char* errormsg=NULL; | |
/* open sqlite database */ | |
if(sqlite3_open(env->fileout,&(env->connection))!=SQLITE_OK) | |
{ | |
fprintf(stderr,"Cannot open sqlite file %s.\n",env->fileout); | |
exit(EXIT_FAILURE); | |
} | |
/* create custom function homozygous */ | |
sqlite3_create_function(env->connection, | |
"homozygous",/* function name */ | |
1,/* number of args */ | |
SQLITE_UTF8,/* encoding */ | |
NULL, | |
is_homozygous,/* the implementation callback */ | |
NULL, | |
NULL | |
); | |
/* execute the select query */ | |
if(sqlite3_exec( | |
env->connection, /* An open database */ | |
"select individuals.name," | |
"markers.name," | |
"markers.chrom," | |
"markers.position," | |
"genotypes.alleles " | |
"from " | |
"genotypes," | |
"markers," | |
"individuals " | |
"where " | |
"markers.id=genotypes.marker_id and " | |
"individuals.id=genotypes.individual_id and " | |
"homozygous(genotypes.alleles)=1", /* SQL to be evaluated */ | |
callback_select_homozygous, /* Callback function */ | |
NULL, /* 1st argument to callback */ | |
&errormsg /* Error msg written here */ | |
)!=SQLITE_OK) | |
{ | |
fprintf(stderr,"Cannot select.\n"); | |
exit(EXIT_FAILURE); | |
} | |
/* close connection */ | |
sqlite3_close(env->connection); | |
} | |
/* parse an hapmap genotype file and put it in a sqlite db */ | |
static int hapmap2sqlite(HapMap2SQLitePtr env) | |
{ | |
const int MANDATORY_COLS_COUNT=11; | |
char* line; | |
int length; | |
char *error=NULL; | |
/* prepated statement for insertion */ | |
sqlite3_stmt *pstmt_insert_indi=NULL; | |
sqlite3_stmt *pstmt_insert_marker=NULL; | |
sqlite3_stmt *pstmt_insert_genotype=NULL; | |
/* current line */ | |
int nLine=0; | |
/* number of individual */ | |
int n_individuals=0; | |
/* primary key of individuals */ | |
int *individuals_id=NULL; | |
/* open (gzipped) hapmap file */ | |
if(env->filein==NULL) | |
{ | |
env->in = gzdopen(fileno(stdin),"rb"); | |
} | |
else | |
{ | |
env->in = gzopen(env->filein,"rb"); | |
} | |
if(env->in==NULL) | |
{ | |
fprintf(stderr,"Cannot open input file.\n"); | |
exit(EXIT_FAILURE); | |
} | |
/* open sqlite database */ | |
if(sqlite3_open(env->fileout,&(env->connection))!=SQLITE_OK) | |
{ | |
fprintf(stderr,"Cannot open sqlite file %s.\n",env->fileout); | |
exit(EXIT_FAILURE); | |
} | |
/* create tables for individuals */ | |
if(sqlite3_exec(env->connection, | |
"create table individuals(id INTEGER PRIMARY KEY,name varchar(15) unique)", | |
NULL,NULL,&error | |
)!=SQLITE_OK) | |
{ | |
fprintf(stderr,"Cannot create table individuals: %s\n",error); | |
exit(EXIT_FAILURE); | |
} | |
/* create tables for markers */ | |
if(sqlite3_exec(env->connection, | |
"create table markers(id INTEGER PRIMARY KEY,name varchar(20) unique,chrom varchar(10),position integer)", | |
NULL,NULL,&error | |
)!=SQLITE_OK) | |
{ | |
fprintf(stderr,"Cannot create table markers: %s\n",error); | |
exit(EXIT_FAILURE); | |
} | |
/* create tables for genotypes */ | |
if(sqlite3_exec(env->connection, | |
"create table genotypes(id INTEGER PRIMARY KEY," | |
"individual_id int references individuals(id)," | |
"marker_id int references markers(id)," | |
"alleles varchar(2) )", | |
NULL,NULL,&error | |
)!=SQLITE_OK) | |
{ | |
fprintf(stderr,"Cannot create table genotypes: %s\n",error); | |
exit(EXIT_FAILURE); | |
} | |
/* create prepared statement for individuals */ | |
if(sqlite3_prepare(env->connection, | |
"insert into individuals(name) values(?)", | |
-1,&pstmt_insert_indi,NULL | |
)!=SQLITE_OK) | |
{ | |
fprintf(stderr,"Cannot compile insert into individuals.\n"); | |
exit(EXIT_FAILURE); | |
} | |
/* create prepared statement for markers */ | |
if(sqlite3_prepare(env->connection, | |
"insert into markers(name,chrom,position) values(?,?,?)", | |
-1,&pstmt_insert_marker,NULL | |
)!=SQLITE_OK) | |
{ | |
fprintf(stderr,"Cannot compile insert into markers.\n"); | |
exit(EXIT_FAILURE); | |
} | |
/* create prepared statement and genotypes*/ | |
if(sqlite3_prepare(env->connection, | |
"insert into genotypes(individual_id,marker_id,alleles) values(?,?,?)", | |
-1,&pstmt_insert_genotype,NULL | |
)!=SQLITE_OK) | |
{ | |
fprintf(stderr,"Cannot compile insert into genotypes.\n"); | |
exit(EXIT_FAILURE); | |
} | |
/* loop over each lines */ | |
while((line=readline(env->in,&length))!=NULL) | |
{ | |
/* current marker id */ | |
int marker_id=0; | |
/* number of spaces */ | |
int n_tokens=0; | |
/* position of the spaces */ | |
int *tokens=NULL; | |
int i; | |
++nLine; | |
/* split the line. get the positions of the spaces */ | |
for(i=0;i<=length;++i) | |
{ | |
if(!(i==length || line[i]==' ')) continue; | |
tokens=(int*)realloc(tokens,sizeof(int)*(n_tokens+1)); | |
if(tokens==NULL) | |
{ | |
fprintf(stderr,"Out of memory\n"); | |
exit(EXIT_FAILURE); | |
} | |
tokens[n_tokens]=i; | |
n_tokens++; | |
} | |
if(n_tokens<=MANDATORY_COLS_COUNT) | |
{ | |
fprintf(stderr,"In %s expected at least %d columns but found %d\n", | |
line,(MANDATORY_COLS_COUNT+1),n_tokens); | |
exit(EXIT_FAILURE); | |
} | |
/* this is the header, alloc and insert the individuals */ | |
if(nLine==1) | |
{ | |
n_individuals=n_tokens-MANDATORY_COLS_COUNT; | |
individuals_id=(int*)malloc(sizeof(int)*n_individuals); | |
if(individuals_id==NULL) | |
{ | |
fprintf(stderr,"Cannot malloc individuals_id.\n"); | |
exit(EXIT_FAILURE); | |
} | |
for(i=MANDATORY_COLS_COUNT;i< n_tokens;++i) | |
{ | |
/* bind the parameters of the prepared statement */ | |
if(sqlite3_bind_text( | |
pstmt_insert_indi,1, | |
&line[tokens[i-1]+1], | |
tokens[i]-tokens[i-1]-1, | |
NULL)!=SQLITE_OK) | |
{ | |
fprintf(stderr,"Cannot bind individual's name.\n"); | |
exit(EXIT_FAILURE); | |
} | |
/* execute the prepared statement */ | |
if (sqlite3_step(pstmt_insert_indi) != SQLITE_DONE) { | |
fprintf(stderr,"Could not step (execute) stmt.\n"); | |
exit(EXIT_FAILURE); | |
} | |
/* get the last_insert_id */ | |
individuals_id[i-MANDATORY_COLS_COUNT]=sqlite3_last_insert_rowid(env->connection); | |
/* reset the prepared statement */ | |
sqlite3_reset(pstmt_insert_indi); | |
} | |
continue; | |
} | |
if(n_tokens-MANDATORY_COLS_COUNT!=n_individuals) | |
{ | |
fprintf(stderr,"expected %d genotypes but got %d.\n", | |
n_individuals,n_tokens-MANDATORY_COLS_COUNT); | |
exit(EXIT_FAILURE); | |
} | |
/* insert marker */ | |
if(sqlite3_bind_text( | |
pstmt_insert_marker,1, | |
&line[0], | |
tokens[0], | |
NULL)!=SQLITE_OK) | |
{ | |
fprintf(stderr,"Cannot bind markers's name.\n"); | |
exit(EXIT_FAILURE); | |
} | |
if(sqlite3_bind_text( | |
pstmt_insert_marker,2, | |
&line[tokens[1]+1], | |
tokens[2]-tokens[1]-1, | |
NULL)!=SQLITE_OK) | |
{ | |
fprintf(stderr,"Cannot bind markers's chrom.\n"); | |
exit(EXIT_FAILURE); | |
} | |
if( sqlite3_bind_int( | |
pstmt_insert_marker,3, | |
atoi(&line[tokens[2]+1]) | |
)!=SQLITE_OK) | |
{ | |
fprintf(stderr,"Cannot bind markers's position.\n"); | |
exit(EXIT_FAILURE); | |
} | |
if (sqlite3_step(pstmt_insert_marker) != SQLITE_DONE) { | |
fprintf(stderr,"Could not step insert marker.\n"); | |
exit(EXIT_FAILURE); | |
} | |
marker_id=sqlite3_last_insert_rowid(env->connection); | |
fprintf(stderr,"[LOG]inserted marker ID.%d\n",marker_id); | |
sqlite3_reset(pstmt_insert_marker); | |
/* loop over the genotypes */ | |
for(i=MANDATORY_COLS_COUNT;i< n_tokens;++i) | |
{ | |
/* bind individual-id */ | |
if( sqlite3_bind_int( | |
pstmt_insert_genotype,1, | |
individuals_id[i-MANDATORY_COLS_COUNT] | |
)!=SQLITE_OK) | |
{ | |
fprintf(stderr,"Cannot bind indi-id.\n"); | |
exit(EXIT_FAILURE); | |
} | |
/* bind marker-id */ | |
if( sqlite3_bind_int( | |
pstmt_insert_genotype,2, | |
marker_id | |
)!=SQLITE_OK) | |
{ | |
fprintf(stderr,"Cannot bind marker-id.\n"); | |
exit(EXIT_FAILURE); | |
} | |
/* bind alleles */ | |
if(sqlite3_bind_text( | |
pstmt_insert_genotype,3, | |
&line[tokens[i-1]+1], | |
tokens[i]-tokens[i-1]-1, | |
NULL)!=SQLITE_OK) | |
{ | |
fprintf(stderr,"Cannot bind alleles.\n"); | |
exit(EXIT_FAILURE); | |
} | |
/* insert the genotype */ | |
if (sqlite3_step(pstmt_insert_genotype) != SQLITE_DONE) { | |
fprintf(stderr,"Could not insert genotype.\n"); | |
exit(EXIT_FAILURE); | |
} | |
/* reset statement */ | |
sqlite3_reset(pstmt_insert_genotype); | |
} | |
free(tokens); | |
free(line); | |
} | |
/* cleanup */ | |
if(individuals_id!=NULL) | |
{ | |
free(individuals_id); | |
} | |
if(pstmt_insert_indi!=NULL) | |
{ | |
sqlite3_finalize(pstmt_insert_indi); | |
} | |
if(pstmt_insert_marker!=NULL) | |
{ | |
sqlite3_finalize(pstmt_insert_marker); | |
} | |
if(pstmt_insert_genotype!=NULL) | |
{ | |
sqlite3_finalize(pstmt_insert_genotype); | |
} | |
if(env->connection!=NULL) | |
{ | |
sqlite3_close(env->connection); | |
} | |
if(env->filein!=NULL && env->in!=NULL) | |
{ | |
gzclose(env->in); | |
} | |
return EXIT_SUCCESS; | |
} | |
/** main */ | |
int main(int argc, char **argv) | |
{ | |
HapMap2SQLite app; | |
app.fileout=NULL; | |
app.filein=NULL; | |
app.in=NULL; | |
app.connection=NULL; | |
int ret=0; | |
int optind=1; | |
/* loop over args */ | |
while(optind<argc) | |
{ | |
if(strcmp(argv[optind],"-h")==0) | |
{ | |
return EXIT_SUCCESS; | |
} | |
else if(strcmp(argv[optind],"-o")==0 && optind+1< argc) | |
{ | |
FILE* in; | |
app.fileout=argv[++optind]; | |
in= fopen(app.fileout,"rb"); | |
if(in!=NULL) | |
{ | |
fclose(in); | |
fprintf(stderr,"Error %s already exists.\n",app.fileout); | |
return EXIT_FAILURE; | |
} | |
} | |
else if(strcmp(argv[optind],"--")==0) | |
{ | |
optind++; | |
break; | |
} | |
else if(argv[optind][0]=='-') | |
{ | |
fprintf(stderr,"Illegal option %s.\n",argv[optind]); | |
return EXIT_FAILURE; | |
} | |
else | |
{ | |
break; | |
} | |
++optind; | |
} | |
if(app.fileout==NULL) | |
{ | |
fprintf(stderr,"Undefined file-out\n"); | |
return EXIT_FAILURE; | |
} | |
if(optind==argc) | |
{ | |
//read from stdin | |
app.filein=NULL; | |
ret=hapmap2sqlite(&app); | |
} | |
else if(optind+1!=argc) | |
{ | |
fprintf(stderr,"Illegal number of arguments.\n"); | |
return EXIT_FAILURE; | |
} | |
else | |
{ | |
//read from file | |
app.filein=argv[optind]; | |
ret=hapmap2sqlite(&app); | |
} | |
select_homozygous(&app); | |
return ret; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CFLAGS=-I ${SQLITE3DIR}/include -Wall | |
LDFLAGS=-L ${SQLITE3DIR}/lib | |
#export LD_LIBRARY_PATH=${SQLITE3DIR}/lib | |
test:hapmap2sqlite genotypes.txt.gz | |
rm -f database.db | |
./hapmap2sqlite -o database.db genotypes.txt.gz | |
genotypes.txt.gz: | |
wget -O $@ "http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest/forward/non-redundant/genotypes_chrY_CEU_r27_nr.b36_fwd.txt.gz" | |
hapmap2sqlite:hapmap2sqlite.c | |
gcc -o $@ $(CFLAGS) $(LDFLAGS) hapmap2sqlite.c -lsqlite3 -lz | |
clean: | |
rm -f hapmap2sqlite database.db *.o a.out |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment