Created
June 25, 2021 17:42
Star
You must be signed in to star a gist
Data generating model for mutational signatures
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rafalib) | |
J <- 100 # number of samples | |
# X - covariates | |
X <- cbind(rep(c(0,1,0,1),each=J/4),rep(0:1,each=J/2)) | |
imagemat(X) | |
K <- 3 # number of signatures | |
# beta - associations with K = 3 signatures | |
beta <- cbind(c(1, -1), c(-1, 0), c(0, 1)) | |
sigma <- 0.1 | |
# some variation | |
eps <- matrix(rnorm(K * J,0,sigma), nrow=J) | |
# alpha - K vector for each sample used to define its contribution | |
alpha <- X %*% beta + eps | |
# now these sum to 1, so are contributions | |
sm_alpha <- t(apply(alpha, 1, function(x) exp(x)/sum(exp(x)))) | |
all.equal(rowSums(sm_alpha), rep(1,J)) | |
imagemat(alpha) | |
imagemat(sm_alpha) | |
# now map from K signature space to 96-dim COSMIC space | |
C <- matrix(runif(96*K), ncol=K) | |
C <- t(t(C)/colSums(C)) | |
# order rows for ease of visualization of the demo (wouldnt do in real analysis) | |
hc <- hclust(dist(C)) | |
C <- C[hc$order,] | |
imagemat(C) | |
# phi - the prob of each mutation type for each sample | |
phi <- C %*% t(sm_alpha) | |
all.equal(colSums(phi), rep(1,J)) | |
imagemat(phi) | |
# N - number of mutations per sample | |
N <- rnbinom(J, mu=300, size=1) | |
N[N == 0] <- 1 | |
# Y - the observed counts | |
Y <- sapply(seq_len(J), function(j) rmultinom(1, size=N[j], prob=phi[,j])) | |
all(colSums(Y) == N) | |
imagemat(Y) | |
# super naive analysis: pick the top mutation type for each signature | |
# what's wrong here: | |
# - doesn't account for the multidimensional nature of signature | |
# so big loss of signal | |
# (and dealing w/ same mut type can occur in mult signatures) | |
# - doesn't account for count variability, low information from low counts | |
top <- apply(scale(t(C)), 1, which.max) | |
hist(Y[top[1],]) # how many counts per sample | |
signal <- t(Y[top,])/N # scale by number of mutations | |
x1 <- X[,1] | |
x2 <- X[,2] | |
tstats <- apply(signal, 2, function(z) { | |
summary(lm(z ~ x1 + x2))$coefficients[2:3,3] | |
}) | |
tstats | |
beta |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment