Skip to content

Instantly share code, notes, and snippets.

@maehler
Last active February 13, 2019 14:21
Show Gist options
  • Save maehler/239e18e7d9f2b53c792c05f2aca5cebd to your computer and use it in GitHub Desktop.
Save maehler/239e18e7d9f2b53c792c05f2aca5cebd to your computer and use it in GitHub Desktop.
GWAS gene set enrichment analysis
#include <Rcpp.h>
#include <random>
using namespace Rcpp;
struct gsea {
float score;
NumericVector running_sum;
int N;
int N_h;
float N_r;
NumericVector gene_index;
CharacterVector geneset_subset;
};
gsea enrichment_score(NumericVector pvalues, CharacterVector genes, std::set<char*> geneset, int power) {
float N_r;
int N;
int N_h;
int gene_count;
float cumsum;
float min_score;
float max_score;
N = pvalues.length();
NumericVector running_sum(N);
N_r = 0;
N_h = 0;
for (int i = 0; i < genes.length(); i++) {
if (geneset.count(genes[i])) {
N_h++;
N_r += std::pow(-std::log10(pvalues[i]), power);
}
}
NumericVector gene_index(N_h);
CharacterVector geneset_subset(N_h);
gene_count = 0;
cumsum = 0;
min_score = std::numeric_limits<float>::max();
max_score = std::numeric_limits<float>::lowest();
for (int i = 0; i < N; i++) {
if (geneset.count(genes[i])) {
cumsum += std::pow(-std::log10(pvalues[i]), power) / (float)N_r;
gene_index[gene_count] = i + 1;
geneset_subset[gene_count++] = genes[i];
} else {
cumsum += -1 / (float)(N - N_h);
}
if (cumsum > max_score) {
max_score = cumsum;
} else if (cumsum < min_score) {
min_score = cumsum;
}
running_sum[i] = cumsum;
}
if (std::abs(min_score) > max_score) {
return {min_score, running_sum, N, N_h, N_r, gene_index, geneset_subset};
}
return {max_score, running_sum, N, N_h, N_r, gene_index, geneset_subset};
}
// [[Rcpp::export]]
List gsea(NumericVector pvalues, CharacterVector geneset,
int permutations = 1000, int power = 1) {
std::set<char*> gset;
float n_eq_gt;
float pvalue;
NumericVector pvalues_sorted;
CharacterVector genes;
CharacterVector random_genes;
std::random_device rd;
std::mt19937 g(rd());
pvalues_sorted = pvalues.sort();
genes = pvalues_sorted.names();
for (int i = 0; i < geneset.length(); ++i) {
gset.insert(geneset[i]);
}
auto base_score = enrichment_score(pvalues_sorted, genes, gset, power);
n_eq_gt = 0;
for (int i = 0; i < permutations; i++) {
random_genes = clone(genes);
std::shuffle(random_genes.begin(), random_genes.end(), g);
auto random_score = enrichment_score(pvalues_sorted, random_genes, gset, power);
if (base_score.score > 0 && random_score.score >= base_score.score) {
n_eq_gt++;
} else if (base_score.score < 0 && random_score.score <= base_score.score) {
n_eq_gt++;
}
}
pvalue = std::max(1 / (float)permutations, n_eq_gt / float(permutations));
return List::create(_["score"] = base_score.score,
_["running_sum"] = base_score.running_sum,
_["pvalue"] = pvalue,
_["geneset"] = base_score.geneset_subset,
_["geneset_pos"] = base_score.gene_index,
_["N"] = base_score.N,
_["N_h"] = base_score.N_h,
_["N_r"] = base_score.N_r,
_["power"] = power,
_["permutations"] = permutations);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment