Last active
February 13, 2019 14:21
-
-
Save maehler/239e18e7d9f2b53c792c05f2aca5cebd to your computer and use it in GitHub Desktop.
GWAS gene set enrichment analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <Rcpp.h> | |
#include <random> | |
using namespace Rcpp; | |
struct gsea { | |
float score; | |
NumericVector running_sum; | |
int N; | |
int N_h; | |
float N_r; | |
NumericVector gene_index; | |
CharacterVector geneset_subset; | |
}; | |
gsea enrichment_score(NumericVector pvalues, CharacterVector genes, std::set<char*> geneset, int power) { | |
float N_r; | |
int N; | |
int N_h; | |
int gene_count; | |
float cumsum; | |
float min_score; | |
float max_score; | |
N = pvalues.length(); | |
NumericVector running_sum(N); | |
N_r = 0; | |
N_h = 0; | |
for (int i = 0; i < genes.length(); i++) { | |
if (geneset.count(genes[i])) { | |
N_h++; | |
N_r += std::pow(-std::log10(pvalues[i]), power); | |
} | |
} | |
NumericVector gene_index(N_h); | |
CharacterVector geneset_subset(N_h); | |
gene_count = 0; | |
cumsum = 0; | |
min_score = std::numeric_limits<float>::max(); | |
max_score = std::numeric_limits<float>::lowest(); | |
for (int i = 0; i < N; i++) { | |
if (geneset.count(genes[i])) { | |
cumsum += std::pow(-std::log10(pvalues[i]), power) / (float)N_r; | |
gene_index[gene_count] = i + 1; | |
geneset_subset[gene_count++] = genes[i]; | |
} else { | |
cumsum += -1 / (float)(N - N_h); | |
} | |
if (cumsum > max_score) { | |
max_score = cumsum; | |
} else if (cumsum < min_score) { | |
min_score = cumsum; | |
} | |
running_sum[i] = cumsum; | |
} | |
if (std::abs(min_score) > max_score) { | |
return {min_score, running_sum, N, N_h, N_r, gene_index, geneset_subset}; | |
} | |
return {max_score, running_sum, N, N_h, N_r, gene_index, geneset_subset}; | |
} | |
// [[Rcpp::export]] | |
List gsea(NumericVector pvalues, CharacterVector geneset, | |
int permutations = 1000, int power = 1) { | |
std::set<char*> gset; | |
float n_eq_gt; | |
float pvalue; | |
NumericVector pvalues_sorted; | |
CharacterVector genes; | |
CharacterVector random_genes; | |
std::random_device rd; | |
std::mt19937 g(rd()); | |
pvalues_sorted = pvalues.sort(); | |
genes = pvalues_sorted.names(); | |
for (int i = 0; i < geneset.length(); ++i) { | |
gset.insert(geneset[i]); | |
} | |
auto base_score = enrichment_score(pvalues_sorted, genes, gset, power); | |
n_eq_gt = 0; | |
for (int i = 0; i < permutations; i++) { | |
random_genes = clone(genes); | |
std::shuffle(random_genes.begin(), random_genes.end(), g); | |
auto random_score = enrichment_score(pvalues_sorted, random_genes, gset, power); | |
if (base_score.score > 0 && random_score.score >= base_score.score) { | |
n_eq_gt++; | |
} else if (base_score.score < 0 && random_score.score <= base_score.score) { | |
n_eq_gt++; | |
} | |
} | |
pvalue = std::max(1 / (float)permutations, n_eq_gt / float(permutations)); | |
return List::create(_["score"] = base_score.score, | |
_["running_sum"] = base_score.running_sum, | |
_["pvalue"] = pvalue, | |
_["geneset"] = base_score.geneset_subset, | |
_["geneset_pos"] = base_score.gene_index, | |
_["N"] = base_score.N, | |
_["N_h"] = base_score.N_h, | |
_["N_r"] = base_score.N_r, | |
_["power"] = power, | |
_["permutations"] = permutations); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment