Skip to content

Instantly share code, notes, and snippets.

@lindenb
Created April 16, 2010 07:33
Show Gist options
  • Save lindenb/368136 to your computer and use it in GitHub Desktop.
Save lindenb/368136 to your computer and use it in GitHub Desktop.
/**
naive program showing how to manage a (large) linkage file with a binary file
Pierre Lindenbaum
*/
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
#include <iostream>
using namespace std;
#define MAX_NAME 50
#define MAX_INDIVIDUAL_NAME MAX_NAME
#define MAX_SNP_NAME MAX_NAME
/** gender */
enum gender
{
unknown=0,
male=1,
female=2
};
/** A SNP mapped on the genome */
typedef struct marker_t
{
/** rs Name */
char name[MAX_SNP_NAME];
/** Y index in the table */
int rowIndex;
/** chromosome identifier */
int chromosomeId;
/** position in the chromosome */
int position;
}Marker,*MarkerPtr;
/** A Individual genotyped */
typedef struct individual_t
{
/** X column index in the table */
int columnIndex;
/** his name */
char name[MAX_INDIVIDUAL_NAME];
/** index of the father of -1 if unknown */
int fatherIndex;
/** index of the mother of -1 if unknown */
int motherIndex;
/** George Michael wants it */
gender sex;
}Individual,*IndividualPtr;
/** Alleles , just two bases */
typedef struct alleles_t
{
char alleles[2];
}Alleles,*AllelesPtr;
/**
a class handling a 'linkage' binary file
X axis are the individuals (in memory)
Y axis are the SNP
values are the Marker
both Marker and Alleles are accessed using a random access
structure of the file
1 int = count(individuals)
1 int = count(markers)
count(individuals)*Individuals = Individuals
count(markers)*( Marker + alleles[count(individuals)]) = markers+genotypes
*/
class Linkage
{
private:
/** FILE pointer to the binary file */
FILE* io_ptr;
/** All individuals in the pedigree */
vector<IndividualPtr> individuals;
/** number of markers in the binary file */
int count_markers;
/** last SNP fetched ,in memory, avoid to call fseek each time for the same marker */
MarkerPtr current_snp;
/** last row of genotypes fetched, used in memory, avoid to call fseek each time for the same marker */
AllelesPtr current_alleles;
private:
/** fetch the rowIndex-th SNP and the rowIndex-th array of Alleles */
void _fetchRow(int rowIndex)
{
//current SNP is already in memory
if(current_snp->rowIndex==rowIndex) return;
//change the file position
fseek( io_ptr, (2*sizeof(int) + //nbr individual + nbr markers
sizeof(Individual)*individuals.size()+ //individual names
(sizeof(Alleles)*individuals.size()+sizeof(Marker))*rowIndex //previous rows (Marker+alleles[])
),
SEEK_SET
);
//read the row:
fread(current_snp,sizeof(Marker),1,io_ptr);//read the current_snp
fread(current_alleles,sizeof(Alleles),individuals.size(),io_ptr);//read the array of alleles
}
public:
Linkage():io_ptr(NULL),count_markers(-1),current_snp(NULL),current_alleles(NULL)
{
}
~Linkage()
{
close();
}
void open(const char* filename,bool read_only)
{
//close if it was already open
close();
//open as binary
io_ptr=fopen(filename,(read_only?"rb":"wb"));
if(io_ptr==NULL) throw "cannot open file";
if(read_only)
{
int n_individuals=0;
//read the number of individuals
fread(&n_individuals,sizeof(int),1,io_ptr);
//alloc the buffer for current_snp
current_snp=new Marker;
current_snp->rowIndex=-1;
//alloc the buffer for current_allele
current_alleles=new Alleles[n_individuals];
//read the number of SNP
fread(&count_markers,sizeof(int),1,io_ptr);
//read the definition of the individuals
for(int i=0;i< n_individuals;++i)
{
IndividualPtr indi=new Individual;
fread(indi,sizeof(Individual),1,io_ptr);
//add it into the pedigree
individuals.push_back(indi);
}
}
}
void close()
{
//clean individuals
while(!individuals.empty())
{
delete individuals.back();
individuals.pop_back();
}
//close file
if(io_ptr!=NULL) fclose(io_ptr);
io_ptr=NULL;
//clean current_snp and current_alleles
if(current_snp!=NULL) delete current_snp;
current_snp=NULL;
if(current_alleles!=NULL) delete [] current_alleles;
current_alleles=NULL;
}
//create a random binary file
void random()
{
//create a pedigree
while(individuals.size()<100)
{
//create a new individual
IndividualPtr newindi=new Individual;
newindi->columnIndex=individuals.size();
newindi->fatherIndex=-1;
newindi->motherIndex=-1;
snprintf(newindi->name,MAX_INDIVIDUAL_NAME,"IndiId.%03d",individuals.size());
//add this new individual
individuals.push_back(newindi);
}
int n=individuals.size();
int n_markers=100000;
//save the number of individuals
fwrite(&n,sizeof(int),1,io_ptr);
//save the number of snps
fwrite(&n_markers,sizeof(int),1,io_ptr);
//write the individuals
for(int i=0; i< (int)individuals.size();++i)
{
fwrite(individuals[i],sizeof(Individual),1,io_ptr);
}
//write all the SNPs
for(int i=0;i< n_markers;++i)
{
Marker snp;
snprintf(snp.name,MAX_SNP_NAME,"rs%d",(i+1));
snp.chromosomeId=12;
snp.position=i*2;
snp.rowIndex=i;
//write this snp
fwrite(&snp,sizeof(Marker),1,io_ptr);
//for each SNP , we add an array of Alleles[individuals.size()]
for(int j=0;j< (int)individuals.size();++j)
{
Alleles a;
a.alleles[0]=(j%3==0?'A':(i%2==0?'A':'T'));
a.alleles[1]=(i%13==0?'C':'T');;
fwrite(&a,sizeof(Alleles),1,io_ptr);
}
}
//flush
fflush(io_ptr);
}
//get the marker at a given Y index
const MarkerPtr getMarker(int marker_index)
{
_fetchRow(marker_index);
return current_snp;
}
//get the genotype for the given individual and the given snp
const AllelesPtr get(const IndividualPtr indi,const MarkerPtr snp)
{
_fetchRow(snp->rowIndex);
return &current_alleles[indi->columnIndex];
}
void test()
{
//loop over the markers
for(int i=0;i< this->count_markers;i+=7)
{
//get the i-th marker
const MarkerPtr snp=getMarker(i);
//loop over the individual
for(int j=0;j< (int)individuals.size();j+=10)
{
const IndividualPtr indi =individuals[j];
//get the genotype for this(individual/snp)
const AllelesPtr var= get(indi,snp);
cout << snp->name << " for " << indi->name << " is (" << var->alleles[0]<<"/"<< var->alleles[1] <<")" << endl;
}
}
}
};
int main(int argc,char **argv)
{
Linkage linkage;
char* filename=NULL;
char* program=NULL;
int optind=1;
while(optind<argc)
{
if(strcmp(argv[optind],"-h")==0)
{
std::cerr << "Usage: "<< argv[0] << "[options]" << endl
<<" -f <filename>" << endl
<<" -p <program= read|write>" << endl
;
return EXIT_SUCCESS;
}
else if(strcmp(argv[optind],"-f")==0 && optind+1<argc)
{
filename=argv[++optind];
}
else if(strcmp(argv[optind],"-p")==0 && optind+1<argc)
{
program=argv[++optind];
}
else if(strcmp(argv[optind],"--")==0)
{
++optind;
break;
}
else if(argv[optind][0]=='-')
{
std::cerr << "unknown option "<< argv[optind] << endl;
return EXIT_FAILURE;
}
else
{
break;
}
++optind;
}
if(optind!=argc)
{
std::cerr << "illegal number of args"<< endl;
return EXIT_FAILURE;
}
if(filename==NULL)
{
std::cerr << "filename not defined"<< endl;
return EXIT_FAILURE;
}
if(program==NULL)
{
std::cerr << "program not defined"<< endl;
return EXIT_FAILURE;
}
else if(strcmp(program,"write")==0)
{
linkage.open(filename,false);
linkage.random();
}
else if(strcmp(program,"read")==0)
{
linkage.open(filename,true);
linkage.test();
}
else
{
std::cerr << "unknown program";
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
all:
g++ -Wall binlinkage.cpp
./a.out -f linkage.bin -p write
./a.out -f linkage.bin -p read
rm linkage.bin
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment