Skip to content

Instantly share code, notes, and snippets.

@molpopgen
Last active August 29, 2015 14:01
Show Gist options
  • Save molpopgen/0541edae5f19ecd6103b to your computer and use it in GitHub Desktop.
Save molpopgen/0541edae5f19ecd6103b to your computer and use it in GitHub Desktop.
Making PLINK's permutation code "cluster-friendly". For v1.07. Replace the files in that version with these, then recompile.
//////////////////////////////////////////////////////////////////
// //
// PLINK (c) 2005-2008 Shaun Purcell //
// //
// This file is distributed under the GNU General Public //
// License, Version 2. Please see the file COPYING for more //
// details //
// //
//////////////////////////////////////////////////////////////////
#include <iostream>
#include <cmath>
#include <algorithm>
#include "perm.h"
#include "helper.h"
#include "stats.h"
#include "sets.h"
Perm::Perm(Plink & pref) : P(pref)
{
if (par::adaptive_perm)
{
adaptive = true;
replicates = par::adaptive_max;
}
else
{
adaptive = false;
replicates = par::replicates;
}
count = par::perm_count;
min = par::adaptive_min;
interval = par::adaptive_interval;
performed = 0;
dump_all = par::mperm_save_all;
dump_best = par::mperm_save_best;
permoutfilename = par::output_file_name;
if (dump_all)
{
permoutfilename += string(".mperm.dump.all.gz");
}
else if (dump_best)
{
permoutfilename += string(".mperm.dump.best.gz");
}
POUTSTREAM = gzopen( permoutfilename.c_str(), "w" );
gzclose( POUTSTREAM );
}
void Perm::setTests(int x)
{
performed = 0;
t = x;
R.clear();
N.clear();
test.clear();
snp_test.clear();
maxR.clear();
R.resize(t,0);
if (adaptive)
{
N.resize(t,0);
test.resize(t,true);
snp_test.resize(t,true);
if ( par::set_test )
snp_test.resize(P.nl_all,true);
// Given t tests, set the threshold to be
// p +/- Phi^{-1} (1 - \gamma/2t ) sqrt( p(1-p)/N )
zt = ltqnorm( 1 - par::adaptive_ci / ( 2 * t ) ) ;
}
else
{
maxR.resize(t,0);
}
// For gene-dropping, set up some family-information
if (par::perm_genedrop)
preGeneDrop();
}
// Redundant
void Perm::setAdaptiveSetSNPs(int x)
{
// snp_test.clear();
// snp_test.resize(x,true);
}
void Perm::originalOrder()
{
for (int i=0; i<P.n; i++)
P.sample[i]->pperson = P.sample[i];
}
bool Perm::finished()
{
if (performed>=replicates) return true;
else return false;
}
void Perm::permuteInCluster()
{
// Store remapped IDs
vector<vector< long int> > i(ns);
// Permute phenotypes, within cluster
for (int k=0; k<ns; k++)
{
vector<long int> p(s[k].size());
permute(p);
i[k]=p;
}
//////////////////////////
// Post-permutation:
// Iterate over clusters { s[][] }
// i[][] holds the permuted codes
// s[][] points to individuals (non-missing)
// Genotype = sample[s[j][k]];
// Matching phenotype = sample[s[j][(int)i[j][k]]];
// Create pheno[] with label-swapped codes
for (int j=0; j<s.size(); j++)
for (int k=0; k<s[j].size(); k++)
P.sample[s[j][k]]->pperson = P.sample[s[j][(int)i[j][k]]];
}
void Perm::setPermClusters(Plink & P)
{
// Permute within clusters only
// (stored in sample[i]->sol)
// Get list of non-missing individuals, and number of solutions
// These are always numbered 0,1,2,3,..
// -1 indicates do not permute this individual
// 0..ns indicate cluster numbers
// Count the number of clusters: 'ns'
ns=-1;
for (int i=0; i<P.n; i++)
if ((!P.sample[i]->missing) && P.sample[i]->sol > ns)
ns=P.sample[i]->sol;
ns++;
// store set membership is 's'
s.resize(ns);
for (int i=0; i<P.n; i++)
if (!P.sample[i]->missing && P.sample[i]->sol>=0)
s[P.sample[i]->sol].push_back(i);
pheno.resize(P.n);
if (par::permute && ! par::QTDT_test )
P.printLOG("Set to permute within "+int2str(ns)+" cluster(s)\n");
}
void Perm::setOriginalRanking(vector_t & original)
{
vector<Pair2> o;
for (int i=0; i<original.size(); i++)
{
Pair2 p;
p.p = original[i];
p.l = i;
o.push_back(p);
}
sort(o.begin(),o.end());
order.clear();
reorder.resize(original.size());
for(int i=0; i<original.size(); i++)
{
order.push_back(o[i].l);
reorder[o[i].l] = i;
}
}
bool Perm::update(vector<double> & result, vector<double> & original)
{
// Increment number of permutations performed
performed++;
// Finished all perms?
bool done = false;
//////////////////////////////
// Update number of successes
if (!adaptive)
{
for (int l=0; l<t; l++)
if (result[l] >= original[l] || !realnum(original[l]) ) R[l]++;
}
else
{
for (int l=0; l<t; l++)
if (test[l])
{
if (result[l] >= original[l] || !realnum(original[l]) ) R[l]++;
N[l]++;
}
}
// Stopping rules for adaptive permutation?
int todo = 0;
if (adaptive &&
performed > min &&
performed % interval == 0)
{
// Update interval
interval = (int)(par::adaptive_interval + performed * par::adaptive_interval2);
// Consider each test
for (int l=0; l<t; l++)
{
if (test[l])
{
// Check for at least one success
if (R[l]>0)
{
double pv = (double)(R[l]+1)/(double)(performed+1);
double sd = sqrt( pv * (1-pv) / performed );
double lower = pv - zt * sd;
double upper = pv + zt * sd;
//double cv = sd/(performed*pv);
if (lower<0) lower = 0;
if (lower>1) upper = 0;
// Is lower bound greater than threshold, or
// upper bound smaller than threshold?
if (lower > par::adaptive_alpha || upper < par::adaptive_alpha )
{
N[l] = performed;
test[l] = false;
if ( par::set_test )
{
for (int j=0;j< P.pS->snpset[l].size();j++)
snp_test[ P.pS->snpset[l][j] ] = false;
}
else
snp_test[l] = false;
}
else
todo++;
}
else todo++;
}
}
if (!par::silent)
{
if ( par::set_test )
cout << "Adaptive permutation: "
<< performed
<< " of (max) "
<< replicates
<< " : "
<< todo
<< " sets left"
<< " \r";
else
cout << "Adaptive permutation: "
<< performed
<< " of (max) "
<< replicates
<< " : "
<< todo
<< " SNPs left"
<< " \r";
}
if (todo==0) done = true;
}
///////////////////////////////////////////////////////////
// For non-adaptive permutation, keep track of the maximum
if (!adaptive)
{
if (dump_all)
{
if (performed==1)
{
PDUMP << 0 << " ";
for (int l=0; l<t; l++)
{
if( realnum( original[l] ) )
PDUMP << original[l] << " ";
else
PDUMP << "NA" << " ";
}
PDUMP << "\n";
}
PDUMP << performed << " ";
}
if (dump_best)
{
if (performed==1)
{
PDUMP << 0 << " ";
double mx=0;
for (int l=0; l<t; l++)
if (original[l]>mx)
if ( realnum(mx) )
mx=original[l];
PDUMP << mx << "\n";
}
PDUMP << performed << " ";
}
// Find maximum, or sort all results
double mx=0;
if ( par::mperm_rank )
{
// Ranked permutation
// populate mx vector
// Set any NA to 0
for (int l=0; l<t; l++)
if ( ! realnum(result[l]) )
result[l] = 0;
// Sort (ascending)
sort(result.begin(),result.end());
// Save max in any case
mx = result[result.size()-1];
}
else
{
// Standard max(T)
// populate single mx variable
for (int l=0; l<t; l++)
if (result[l]>mx)
if ( realnum(mx) )
mx=result[l];
}
if (dump_best)
PDUMP << mx << "\n";
else if (dump_all)
{
for (int l=0; l<t; l++)
{
if ( realnum( result[l] ) )
PDUMP << result[l] << " ";
else
PDUMP << "NA" << " ";
}
PDUMP << "\n";
}
//Check if buffer is large. If so, dump and clear
if( PDUMP.str().size() >= MAXBUFFERSIZE )
{
POUTSTREAM = gzopen( permoutfilename.c_str(), "a" );
gzwrite( POUTSTREAM, PDUMP.str().c_str(), PDUMP.str().size() );
gzclose(POUTSTREAM);
PDUMP.str(string());
}
// Max(T) permutation -- compare against best
if ( ! par::mperm_rank )
{
for (int l=0; l<t; l++)
if (mx >= original[l] || !realnum(original[l]) ) maxR[l]++;
}
// Rank(T) permutation -- compare against similar rank
else
{
for (int l=0; l<t; l++)
{
//cout << "P " << order[l] << " " << original[order[l]] << " , pr = " << result[l] << " ";
if (result[l] >= original[order[l]] || !realnum(original[order[l]]) )
maxR[order[l]]++;
}
}
if (!par::silent)
{
cout << "maxT permutation: "
<< performed
<< " of "
<< par::replicates
<< " \r";
cout.flush();
}
}
// Have we hit the maximum number of replicates?
if (performed>=replicates) done = true;
return done;
}
void Perm::nextSNP()
{
// Reset Perm class for next SNP when in adaptive, SNP-by-SNP mode
performed = 0;
originalOrder();
}
bool Perm::updateSNP(double result, double original, int l)
{
/////////////////////////////////////////
// Single SNP adaptive permutation update
// for QFAM -- do not allow set-based tests
// here
/////////////////////////////////////////
// Increment number of permutations performed for this SNP
performed++;
// Finished all perms for this SNP?
bool done = false;
//////////////////////////////
// Update number of successes
if (test[l])
{
if (result >= original || !realnum(original) ) R[l]++;
N[l]++;
}
// Stopping rules for adaptive permutation?
if (adaptive &&
performed > min &&
performed % interval == 0)
{
// Update interval
interval = (int)(par::adaptive_interval + performed * par::adaptive_interval2);
// Consider this specific SNP
if (test[l])
{
// Check for at least one success
if (R[l]>0)
{
double pv = (double)(R[l]+1)/(double)(performed+1);
double sd = sqrt( pv * (1-pv) / performed );
double lower = pv - zt * sd;
double upper = pv + zt * sd;
//double cv = sd/(performed*pv);
if (lower<0) lower = 0;
if (lower>1) upper = 0;
// Is lower bound greater than threshold, or
// upper bound smaller than threshold?
if (lower > par::adaptive_alpha || upper < par::adaptive_alpha )
{
N[l] = performed;
test[l] = false;
done = true;
}
}
}
}
// Have we hit the maximum number of replicates?
if (performed>=replicates) done = true;
return done;
}
int Perm::rank(int l)
{
if ( ! par::mperm_rank )
return 0;
else
return t - reorder[l];
}
double Perm::pvalue(int l)
{
if (count)
return (double)R[l];
if (adaptive)
return (double)(R[l]+1) / (double)(N[l]+1);
else
return (double)(R[l]+1) / (double)(replicates+1);
}
double Perm::max_pvalue(int l)
{
if (adaptive)
return -1;
else
return (double)(maxR[l]+1) / (double)(replicates+1);
}
int Perm::reps_done(int l)
{
if (adaptive)
return N[l];
else
return replicates;
}
//////////////////////////////////////////////////////////////////
// //
// PLINK (c) 2005-2006 Shaun Purcell //
// //
// This file is distributed under the GNU General Public //
// License, Version 2. Please see the file COPYING for more //
// details //
// //
//////////////////////////////////////////////////////////////////
#ifndef __PERM_H__
#define __PERM_H__
#include "options.h"
#include <sstream> //KRT
#include <zlib.h>
using namespace std;
class Perm {
int t; // Number of tests
long int replicates; // Basic number of replicates
long int performed; // Number of replicates actually performed
bool count; // Output counts, not p-values
ostringstream PDUMP; //KRT Verbose permutation dump
//ofstream POUTSTREAM;
gzFile POUTSTREAM;
string permoutfilename; //specific name for output file
//ofstream PDUMP; // Verbose permutation dump
bool dump_best; // Record just best permutation result
bool dump_all; // Record all permutation results
vector<int> order; // For --rank, record order of original
vector<int> reorder; // For --rank, record order of original, reverse mapping
/////////////////////////////
// Gene-dropping permutation
bool genedrop;
map<Individual*,int> idmap;
///////////////////////////////////////////
// Standard phenotype-swapping permutation
// Parameters for adaptive permutation
bool adaptive;
int min; // Minimum number of permutations
double zt; // SD CI range (based on par::adaptive_alpha)
int interval; // Prune tests every (I+N*I2) permutations
// Main storage
vector<int> R; // number of successes
vector<int> maxR; // number of genome-wide successes
vector<long int> N; // number of trials (adaptive)
// Cluster information
vector< vector<int> > s;
int ns; // number of clusters
Plink & P; // reference to Plink class
static const size_t MAXBUFFERSIZE = 10000000;
//static const size_t MAXBUFFERSIZE = 10000;
public:
Perm(Plink &);
void closeDUMP()
{
if (dump_all || dump_best)
{
if( ! PDUMP.str().empty() )
{
POUTSTREAM = gzopen( permoutfilename.c_str(), "a" );
gzwrite( POUTSTREAM, PDUMP.str().c_str(), PDUMP.str().size() );
gzclose( POUTSTREAM );
//POUTSTREAM.open( permoutfilename.c_str(), ios::app );
//POUTSTREAM << PDUMP.str();
}
//POUTSTREAM.close();
}
}
// For basic permutation
vector<int> pheno; // label-swapped phenotype
vector<int> geno; // label-swapped phenotype
vector<bool> test; // whether to stop with this test
vector<bool> snp_test; // whether to skip these SNPs in
// a set-based test
void setTests(int x);
void setAdaptiveSetSNPs(int x);
void originalOrder();
void setOriginalRanking(vector_t&);
bool finished();
bool update(vector<double>&, vector<double>&);
bool updateSNP(double,double,int);
void nextSNP();
vector<double> & report();
int current_reps() { return performed; }
int reps_done(int);
double pvalue(int);
double max_pvalue(int);
int rank(int);
void permuteInCluster();
void setPermClusters(Plink &);
void preGeneDrop();
void geneDrop();
void dropAlleles(Plink &,
Individual*,
int,
int,
vector<bool>&,
vector<bool>&,
vector<bool>&,
map<Individual*,int> &);
};
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment