Skip to content

Instantly share code, notes, and snippets.

@lindenb
Created May 11, 2010 19:39
Show Gist options
  • Save lindenb/397760 to your computer and use it in GitHub Desktop.
Save lindenb/397760 to your computer and use it in GitHub Desktop.
/* A Test for the sqlite C API
* Pierre Lindenbaum http://plindenbaum.blogspot.com
* reads a HAPMAP file and put it in a relational sqlite DB
*/
#include <stdio.h>
#include <stdlib.h>
#include <sqlite3.h>
#include <zutil.h>
/**
* parameters for the program
*/
typedef struct hapMap2SQLite_t
{
/** name of the sqlite file */
char *fileout;
/** hapmap file in */
char *filein;
/** input stream*/
gzFile in;
/** connection to sqlite */
sqlite3* connection;
}HapMap2SQLite,*HapMap2SQLitePtr;
/** quick'n dirty (=slow) version of readline . returns a string that should destroyed with 'free(ptr)' */
static char* readline(gzFile in,int *length)
{
int c;
char* p=NULL;
*length=0;
while((c=gzgetc(in))!=EOF)
{
p=(char*)realloc(p,sizeof(char)*(*length+2));
if(p==NULL)
{
fprintf(stderr,"Out of memory\n");
exit(EXIT_FAILURE);
}
if(c=='\n')
{
p[*length]=0;
break;
}
p[*length]=c;
(*length)++;
p[*length]=0;
}
return p;
}
/** call back for the custom sqlite function 'homozygous' */
void is_homozygous(
sqlite3_context* context, /* context */
int argc, /* number of arguments */
sqlite3_value** argv /* arguments */
)
{
/* one text arg */
if(argc==1 &&
sqlite3_value_type(argv[0])==SQLITE_TEXT)
{
const char *alleles=(const char*)sqlite3_value_text(argv[0]);
if( alleles!=NULL &&
strlen(alleles)==2 &&
alleles[0]==alleles[1] &&
alleles[0]!='N')
{
/* set result to TRUE */
sqlite3_result_int(context,1);
return;
}
}
/* set result to FALSE */
sqlite3_result_int(context,0);
}
/** callback called by the 'select_homozygous' query */
static int callback_select_homozygous(
void* notUsed,
int argc,/* number of args */
char** argv,/* arguments as string */
char** columns /* labels for the columns */
)
{
int i;
/* prints all the columns to stdout*/
for(i=0; i<argc; i++)
{
printf( "\"%s\": \"%s\"\n",
columns[i],
argv[i]!=NULL? argv[i] : "NULL"
);
}
printf("\n");
return 0;
}
/**
* this is a test query.
* it selects all the markers/individuals /genotypes homozygous
*/
static void select_homozygous(HapMap2SQLitePtr env)
{
char* errormsg=NULL;
/* open sqlite database */
if(sqlite3_open(env->fileout,&(env->connection))!=SQLITE_OK)
{
fprintf(stderr,"Cannot open sqlite file %s.\n",env->fileout);
exit(EXIT_FAILURE);
}
/* create custom function homozygous */
sqlite3_create_function(env->connection,
"homozygous",/* function name */
1,/* number of args */
SQLITE_UTF8,/* encoding */
NULL,
is_homozygous,/* the implementation callback */
NULL,
NULL
);
/* execute the select query */
if(sqlite3_exec(
env->connection, /* An open database */
"select individuals.name,"
"markers.name,"
"markers.chrom,"
"markers.position,"
"genotypes.alleles "
"from "
"genotypes,"
"markers,"
"individuals "
"where "
"markers.id=genotypes.marker_id and "
"individuals.id=genotypes.individual_id and "
"homozygous(genotypes.alleles)=1", /* SQL to be evaluated */
callback_select_homozygous, /* Callback function */
NULL, /* 1st argument to callback */
&errormsg /* Error msg written here */
)!=SQLITE_OK)
{
fprintf(stderr,"Cannot select.\n");
exit(EXIT_FAILURE);
}
/* close connection */
sqlite3_close(env->connection);
}
/* parse an hapmap genotype file and put it in a sqlite db */
static int hapmap2sqlite(HapMap2SQLitePtr env)
{
const int MANDATORY_COLS_COUNT=11;
char* line;
int length;
char *error=NULL;
/* prepated statement for insertion */
sqlite3_stmt *pstmt_insert_indi=NULL;
sqlite3_stmt *pstmt_insert_marker=NULL;
sqlite3_stmt *pstmt_insert_genotype=NULL;
/* current line */
int nLine=0;
/* number of individual */
int n_individuals=0;
/* primary key of individuals */
int *individuals_id=NULL;
/* open (gzipped) hapmap file */
if(env->filein==NULL)
{
env->in = gzdopen(fileno(stdin),"rb");
}
else
{
env->in = gzopen(env->filein,"rb");
}
if(env->in==NULL)
{
fprintf(stderr,"Cannot open input file.\n");
exit(EXIT_FAILURE);
}
/* open sqlite database */
if(sqlite3_open(env->fileout,&(env->connection))!=SQLITE_OK)
{
fprintf(stderr,"Cannot open sqlite file %s.\n",env->fileout);
exit(EXIT_FAILURE);
}
/* create tables for individuals */
if(sqlite3_exec(env->connection,
"create table individuals(id INTEGER PRIMARY KEY,name varchar(15) unique)",
NULL,NULL,&error
)!=SQLITE_OK)
{
fprintf(stderr,"Cannot create table individuals: %s\n",error);
exit(EXIT_FAILURE);
}
/* create tables for markers */
if(sqlite3_exec(env->connection,
"create table markers(id INTEGER PRIMARY KEY,name varchar(20) unique,chrom varchar(10),position integer)",
NULL,NULL,&error
)!=SQLITE_OK)
{
fprintf(stderr,"Cannot create table markers: %s\n",error);
exit(EXIT_FAILURE);
}
/* create tables for genotypes */
if(sqlite3_exec(env->connection,
"create table genotypes(id INTEGER PRIMARY KEY,"
"individual_id int references individuals(id),"
"marker_id int references markers(id),"
"alleles varchar(2) )",
NULL,NULL,&error
)!=SQLITE_OK)
{
fprintf(stderr,"Cannot create table genotypes: %s\n",error);
exit(EXIT_FAILURE);
}
/* create prepared statement for individuals */
if(sqlite3_prepare(env->connection,
"insert into individuals(name) values(?)",
-1,&pstmt_insert_indi,NULL
)!=SQLITE_OK)
{
fprintf(stderr,"Cannot compile insert into individuals.\n");
exit(EXIT_FAILURE);
}
/* create prepared statement for markers */
if(sqlite3_prepare(env->connection,
"insert into markers(name,chrom,position) values(?,?,?)",
-1,&pstmt_insert_marker,NULL
)!=SQLITE_OK)
{
fprintf(stderr,"Cannot compile insert into markers.\n");
exit(EXIT_FAILURE);
}
/* create prepared statement and genotypes*/
if(sqlite3_prepare(env->connection,
"insert into genotypes(individual_id,marker_id,alleles) values(?,?,?)",
-1,&pstmt_insert_genotype,NULL
)!=SQLITE_OK)
{
fprintf(stderr,"Cannot compile insert into genotypes.\n");
exit(EXIT_FAILURE);
}
/* loop over each lines */
while((line=readline(env->in,&length))!=NULL)
{
/* current marker id */
int marker_id=0;
/* number of spaces */
int n_tokens=0;
/* position of the spaces */
int *tokens=NULL;
int i;
++nLine;
/* split the line. get the positions of the spaces */
for(i=0;i<=length;++i)
{
if(!(i==length || line[i]==' ')) continue;
tokens=(int*)realloc(tokens,sizeof(int)*(n_tokens+1));
if(tokens==NULL)
{
fprintf(stderr,"Out of memory\n");
exit(EXIT_FAILURE);
}
tokens[n_tokens]=i;
n_tokens++;
}
if(n_tokens<=MANDATORY_COLS_COUNT)
{
fprintf(stderr,"In %s expected at least %d columns but found %d\n",
line,(MANDATORY_COLS_COUNT+1),n_tokens);
exit(EXIT_FAILURE);
}
/* this is the header, alloc and insert the individuals */
if(nLine==1)
{
n_individuals=n_tokens-MANDATORY_COLS_COUNT;
individuals_id=(int*)malloc(sizeof(int)*n_individuals);
if(individuals_id==NULL)
{
fprintf(stderr,"Cannot malloc individuals_id.\n");
exit(EXIT_FAILURE);
}
for(i=MANDATORY_COLS_COUNT;i< n_tokens;++i)
{
/* bind the parameters of the prepared statement */
if(sqlite3_bind_text(
pstmt_insert_indi,1,
&line[tokens[i-1]+1],
tokens[i]-tokens[i-1]-1,
NULL)!=SQLITE_OK)
{
fprintf(stderr,"Cannot bind individual's name.\n");
exit(EXIT_FAILURE);
}
/* execute the prepared statement */
if (sqlite3_step(pstmt_insert_indi) != SQLITE_DONE) {
fprintf(stderr,"Could not step (execute) stmt.\n");
exit(EXIT_FAILURE);
}
/* get the last_insert_id */
individuals_id[i-MANDATORY_COLS_COUNT]=sqlite3_last_insert_rowid(env->connection);
/* reset the prepared statement */
sqlite3_reset(pstmt_insert_indi);
}
continue;
}
if(n_tokens-MANDATORY_COLS_COUNT!=n_individuals)
{
fprintf(stderr,"expected %d genotypes but got %d.\n",
n_individuals,n_tokens-MANDATORY_COLS_COUNT);
exit(EXIT_FAILURE);
}
/* insert marker */
if(sqlite3_bind_text(
pstmt_insert_marker,1,
&line[0],
tokens[0],
NULL)!=SQLITE_OK)
{
fprintf(stderr,"Cannot bind markers's name.\n");
exit(EXIT_FAILURE);
}
if(sqlite3_bind_text(
pstmt_insert_marker,2,
&line[tokens[1]+1],
tokens[2]-tokens[1]-1,
NULL)!=SQLITE_OK)
{
fprintf(stderr,"Cannot bind markers's chrom.\n");
exit(EXIT_FAILURE);
}
if( sqlite3_bind_int(
pstmt_insert_marker,3,
atoi(&line[tokens[2]+1])
)!=SQLITE_OK)
{
fprintf(stderr,"Cannot bind markers's position.\n");
exit(EXIT_FAILURE);
}
if (sqlite3_step(pstmt_insert_marker) != SQLITE_DONE) {
fprintf(stderr,"Could not step insert marker.\n");
exit(EXIT_FAILURE);
}
marker_id=sqlite3_last_insert_rowid(env->connection);
fprintf(stderr,"[LOG]inserted marker ID.%d\n",marker_id);
sqlite3_reset(pstmt_insert_marker);
/* loop over the genotypes */
for(i=MANDATORY_COLS_COUNT;i< n_tokens;++i)
{
/* bind individual-id */
if( sqlite3_bind_int(
pstmt_insert_genotype,1,
individuals_id[i-MANDATORY_COLS_COUNT]
)!=SQLITE_OK)
{
fprintf(stderr,"Cannot bind indi-id.\n");
exit(EXIT_FAILURE);
}
/* bind marker-id */
if( sqlite3_bind_int(
pstmt_insert_genotype,2,
marker_id
)!=SQLITE_OK)
{
fprintf(stderr,"Cannot bind marker-id.\n");
exit(EXIT_FAILURE);
}
/* bind alleles */
if(sqlite3_bind_text(
pstmt_insert_genotype,3,
&line[tokens[i-1]+1],
tokens[i]-tokens[i-1]-1,
NULL)!=SQLITE_OK)
{
fprintf(stderr,"Cannot bind alleles.\n");
exit(EXIT_FAILURE);
}
/* insert the genotype */
if (sqlite3_step(pstmt_insert_genotype) != SQLITE_DONE) {
fprintf(stderr,"Could not insert genotype.\n");
exit(EXIT_FAILURE);
}
/* reset statement */
sqlite3_reset(pstmt_insert_genotype);
}
free(tokens);
free(line);
}
/* cleanup */
if(individuals_id!=NULL)
{
free(individuals_id);
}
if(pstmt_insert_indi!=NULL)
{
sqlite3_finalize(pstmt_insert_indi);
}
if(pstmt_insert_marker!=NULL)
{
sqlite3_finalize(pstmt_insert_marker);
}
if(pstmt_insert_genotype!=NULL)
{
sqlite3_finalize(pstmt_insert_genotype);
}
if(env->connection!=NULL)
{
sqlite3_close(env->connection);
}
if(env->filein!=NULL && env->in!=NULL)
{
gzclose(env->in);
}
return EXIT_SUCCESS;
}
/** main */
int main(int argc, char **argv)
{
HapMap2SQLite app;
app.fileout=NULL;
app.filein=NULL;
app.in=NULL;
app.connection=NULL;
int ret=0;
int optind=1;
/* loop over args */
while(optind<argc)
{
if(strcmp(argv[optind],"-h")==0)
{
return EXIT_SUCCESS;
}
else if(strcmp(argv[optind],"-o")==0 && optind+1< argc)
{
FILE* in;
app.fileout=argv[++optind];
in= fopen(app.fileout,"rb");
if(in!=NULL)
{
fclose(in);
fprintf(stderr,"Error %s already exists.\n",app.fileout);
return EXIT_FAILURE;
}
}
else if(strcmp(argv[optind],"--")==0)
{
optind++;
break;
}
else if(argv[optind][0]=='-')
{
fprintf(stderr,"Illegal option %s.\n",argv[optind]);
return EXIT_FAILURE;
}
else
{
break;
}
++optind;
}
if(app.fileout==NULL)
{
fprintf(stderr,"Undefined file-out\n");
return EXIT_FAILURE;
}
if(optind==argc)
{
//read from stdin
app.filein=NULL;
ret=hapmap2sqlite(&app);
}
else if(optind+1!=argc)
{
fprintf(stderr,"Illegal number of arguments.\n");
return EXIT_FAILURE;
}
else
{
//read from file
app.filein=argv[optind];
ret=hapmap2sqlite(&app);
}
select_homozygous(&app);
return ret;
}
CFLAGS=-I ${SQLITE3DIR}/include -Wall
LDFLAGS=-L ${SQLITE3DIR}/lib
#export LD_LIBRARY_PATH=${SQLITE3DIR}/lib
test:hapmap2sqlite genotypes.txt.gz
rm -f database.db
./hapmap2sqlite -o database.db genotypes.txt.gz
genotypes.txt.gz:
wget -O $@ "http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest/forward/non-redundant/genotypes_chrY_CEU_r27_nr.b36_fwd.txt.gz"
hapmap2sqlite:hapmap2sqlite.c
gcc -o $@ $(CFLAGS) $(LDFLAGS) hapmap2sqlite.c -lsqlite3 -lz
clean:
rm -f hapmap2sqlite database.db *.o a.out
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment