set.seed(1)
s <- t(rmultinom(100,1,c(0.3,0.7)))
head(s)
s2 <- apply(s==1,1,which)
mu <- c(-2,2)
sigma <- c(1,1)
y <- rnorm(100,mu[s2],sigma[s2])
hist(y)
install.packages("mclust")
library(mclust)
set.seed(1)
mctest <- Mclust(y,2)
mctest$parameters$mean
mctest$parameters$pro
library(readr)
datFlow <- read_tsv("Landrigan.tsv")
head(datFlow)
flowmat <- as.matrix(datFlow[,-5])
flowmat <- scale(flowmat)
set.seed(1)
flowBIC <- mclustBIC(flowmat, G=2:9,
modelNames = "VVV")
plot(flowBIC)
which.max(flowBIC)+1
set.seed(1)
res_flow_5 <- Mclust(flowmat, G=5,
modelNames = "VVV")
muhat <- res_flow_5$parameters$mean
rownames(muhat) <- colnames(flowmat)
heatmap(muhat,margins = c(2,15),
col=cm.colors(256))
clusterdf <- data.frame(
treat=datFlow$treat,
cluster=factor(res_flow_5$classification))
library(ggplot2)
ggplot(clusterdf,aes(x=treat,fill=cluster))+
geom_bar(position = "fill")+
theme_bw(20)
関数の定義
VBDirMult <-function(Y,L=2,alpha=rep(1,ncol(Y)),beta=rep(1,L),maxit=2000,seed=1234){
set.seed(1)
N <- nrow(Y)
K <- ncol(Y)
EelW <- matrix(rgamma(N*L,beta),N,L,byrow = TRUE)
EelW <- EelW/rowSums(EelW)
EelH <- matrix(rgamma(L*K,alpha),L,K,byrow = TRUE)
EelH <- EelH/rowSums(EelH)
for(iter in 1:maxit){
Sw <- EelW * (((Y)/(EelW %*% EelH)) %*% t(EelH))
Sh <- EelH * (t(EelW) %*% (Y/(EelW %*% EelH)))
beta_W <- beta + Sw
alpha_H <- alpha + Sh
EelW <-exp(sweep(digamma(beta_W),1,digamma(rowSums(beta_W))))
EelH <-exp(sweep(digamma(alpha_H),1,digamma(rowSums(alpha_H))))
}
EW <- sweep(beta_W,1,rowSums(beta_W),"/")
EH <- sweep(alpha_H,1,rowSums(alpha_H),"/")
ELBO <- sum(lgamma(rowSums(Y)+1)) - sum(lgamma(Y+1))+
sum(-Y*(((EelW*log(EelW))%*%EelH + EelW%*%(EelH*log(EelH)))/(EelW%*%EelH)-log(EelW%*%EelH))) +
L*sum(lgamma(sum(alpha))-sum(lgamma(alpha))) +
sum(-lgamma(rowSums(alpha_H))+rowSums(lgamma(alpha_H))) +
N*sum(lgamma(sum(beta))-sum(lgamma(beta))) +
sum(-lgamma(rowSums(beta_W))+rowSums(lgamma(beta_W)))
return(list(W=EW,H=EH,ELBO=ELBO))
}
メタゲノム解析
library(curatedMetagenomicData)
dat_Zeller <- ZellerG_2014.metaphlan_bugs_list.stool()
exp_Zeller <- exprs(dat_Zeller)
dim(exp_Zeller)
namelist <- strsplit(row.names(exp_Zeller),"\\|")
namelen <- sapply(namelist, length)
genusname <- sapply(namelist, function(x)x[6])
Y <- exp_Zeller[namelen==6,]
rownames(Y) <- genusname[namelen==6]
Y <- t(Y)
count <- round(Y*dat_Zeller$number_reads/100 )
set.seed(1)
fit_multmix <- multmixEM(Y,k=3)
Pmat <- fit_multmix$theta
colnames(Pmat) <- colnames(Y)
Pdf <- as.data.frame(fit$H) %>%
mutate(cluster=factor(row_number())) %>%
gather(genus,value,-cluster) %>%
group_by(cluster) %>%
arrange(desc(value)) %>%
mutate(ord=row_number()) %>%
group_by(genus) %>%
dplyr::filter(any(ord<=5)) %>%
ungroup() %>%
arrange(genus)
ggplot(Pdf,aes(x=cluster,y=genus,fill=value))+
geom_tile()+
scale_fill_continuous(low="white",high="cornflowerblue")+
theme_bw()
ggplot(Pdf,aes(x=cluster,y=value))+
geom_col()+
facet_wrap(~genus)+
theme_bw()
disease <- dat_Zeller$disease
age <- dat_Zeller$age
gender <- dat_Zeller$gender
country <- dat_Zeller$country
fitDM <- VBDirMult(Y,L=3)
Hdf <- as.data.frame(fit$H) %>%
mutate(topic=factor(row_number())) %>%
gather(genus,value,-topic) %>%
group_by(topic) %>%
arrange(desc(value)) %>%
mutate(ord=row_number()) %>%
group_by(genus) %>%
dplyr::filter(any(ord<=5)) %>%
ungroup() %>%
arrange(genus)
ggplot(Hdf,aes(x=topic,y=genus,fill=value))+
geom_tile()
ggplot(Hdf,aes(x=topic,y=value))+
geom_col()+
facet_wrap(~genus)+
theme_bw(14)
Wdf <- as.data.frame(fit$W) %>%
arrange(V1,V1+V2) %>%
set_names(1:3) %>%
mutate(id=factor(row_number()),
disease=factor(dat_Zeller$disease,levels=c("healthy","adenoma","CRC"))) %>%
gather(topic,value,-id,-disease)
ggplot(Wdf,aes(x=id,y=value,fill=topic))+
geom_col()+
facet_grid(~disease,scales = "free",space = "free")
ggplot(Wdf,aes(x=disease,y=value))+
geom_boxplot()+
facet_wrap(~topic)
###付録
datGSE <- read_tsv("GSE60450_Lactation-GenewiseCounts.txt")
dat_info <- read.csv("SampleInfo.txt",sep = "\t")
Y <- t(datGSE[,-c(1:2)])
Ti <- (10^3)*sweep(Y,2,datGSE$Length,"/")
varTi <- apply(Ti,2,var)
Ti<- Ti[,sort.list(varTi,decreasing = TRUE)[1:8000]]
LDAout4 <- VBDirMult(Ti,4)
barplot(t(LDAout4$W))
基本
?ToothGrowth
head(ToothGrowth)
plot(ToothGrowth)
formula1 <- len~supp+dose
lm_TG <- lm(formula1,data=ToothGrowth)
summary(lm_TG)
summary(resid(lm_TG))
信頼区間
library(tidyverse)
df_lm_TG <- data.frame(confint(lm_TG)) %>%
set_names(c("upper","lower")) %>%
rownames_to_column("variable") %>%
mutate(estimate=coef(lm_TG))
ggplot(df_lm_TG,aes(x=variable,y=estimate,ymin=lower,ymax=upper))+
geom_pointrange()+
geom_hline(yintercept = 0,linetype=2)+
theme_bw(24)
説明変数の扱い
X_TG <- model.matrix(formula1,data=ToothGrowth)
head(X_TG)
X_TG[,2]
ToothGrowth$supp
X_TG[1,]%*%coef(lm_TG)
predict(lm_TG)[1]
fac1 <- factor(c("A","B","C"))
model.matrix(~fac1)
lm(len~supp+dose-1,data=ToothGrowth)
ToothGrowth2 <- ToothGrowth %>%
mutate(supp=factor(ToothGrowth$supp,levels = c("VC","OJ")))
lm_TG2 <- lm(len~supp+dose,data=ToothGrowth2)
summary(lm_TG2)
交互作用
model.matrix(len~supp*dose,data=ToothGrowth)
lm_TG_ct <- lm(len~supp*dose,data=ToothGrowth)
summary(lm_TG_ct)
対数を取る
lm_TG_log <- lm(len~supp+log2(dose),data=ToothGrowth)
summary(lm_TG_log)
log2(ToothGrowth$dose)
plot(len~dose, data=ToothGrowth)
plot(len~log2(dose), data=ToothGrowth)
curve(1/x,xlim=c(0,2))
AICとBIC
AIC(lm_TG)
AIC(lm_TG_ct)
AIC(lm_TG_log)
BIC(lm_TG)
BIC(lm_TG_ct)
BIC(lm_TG_log)
res_lm_TG <- resid(lm_TG)
lm_TG_log <- lm(log(len)~supp+dose,data=ToothGrowth)
summary(lm_TG_log)
hist(resid(lm_TG),freq = FALSE)
summary(lm_TG)
summary(resid(lm_TG))
X_TG <- model.matrix(len~supp+dose,data=ToothGrowth)
gam_TG_log <- glm(log(len)~supp+dose,data=ToothGrowth,dist=Gamma("log"))
head(X_TG)
ToothGrowth$supp
X_TG[,"suppVC"]
lm_TG_ct <- lm(len~supp:dose,data=ToothGrowth)
summary(lm_TG_ct)
AIC(lm_TG_ct)
AIC(lm_TG)
rsTG <- resid(lm_TG)
hist(rsTG,freq=FALSE,main="",xlab = "residuals")
curve(dnorm(x,0,sd(rsTG)),add=TRUE,lwd=2,col="royalblue")
library(tidyverse)
ggplot(InsectSprays,aes(x=spray,y=count))+
geom_violin()
# hist(InsectSprays$count[InsectSprays$spray=="A"],breaks=10,freq = FALSE)
# meanA <- mean(InsectSprays$count[InsectSprays$spray=="A"])
# points(5:25,dpois(5:25,meanA))
pois_IS <- glm(count~spray,data=InsectSprays,family =poisson)
summary(pois_IS)
pois_IS <- glm(count~spray-1,data=InsectSprays,family =poisson)
pois_IS_df <- as.data.frame(confint(pois_IS)) %>%
set_names(c("lower","upper")) %>%
mutate(spray=unique(InsectSprays$spray)) %>%
mutate(count=exp(coef(pois_IS))) %>%
mutate(lower_pred=qpois(0.025,exp(lower)),
upper_pred=qpois(0.975,exp(upper)))
ggplot(InsectSprays,aes(x=spray,y=count))+
geom_pointrange(data=pois_IS_df,aes(ymin=lower_pred,ymax=upper_pred),colour="royalblue")+
geom_jitter(height = 0)+
theme_bw(24)
lm_IS <- lm(count~spray-1,data=InsectSprays)
sd_lm_IS <- sd(resid(lm_IS))
lm_IS_df <- as.data.frame(confint(lm_IS)) %>%
set_names(c("lower","upper")) %>%
mutate(spray=unique(InsectSprays$spray)) %>%
mutate(count=coef(lm_IS)) %>%
mutate(lower_pred=qnorm(0.025,lower,sd_lm_IS),
upper_pred=qnorm(0.975,upper,sd_lm_IS))
ggplot(InsectSprays,aes(x=spray,y=count))+
geom_pointrange(data=lm_IS_df,aes(ymin=lower_pred,ymax=upper_pred),colour="royalblue")+
geom_jitter(height = 0)+
theme_bw(24)
fit_pois_IS <- predict(pois_IS,type="response")
set.seed(1)
sim_po_IS <- rpois(length(fit_pois_IS),fit_pois_IS)
par(mfrow=c(2,1))
hist(InsectSprays$count)
hist(sim_po_IS)
fit_lm_IS <- fitted(lm_IS)
set.seed(1)
sim_lm_IS <- rnorm(length(fit_lm_IS),fit_lm_IS,sd_IS)
par(mfrow=c(2,1))
hist(InsectSprays$count)
hist(sim_lm_IS)
sim_lm_IS
matcoef <- matrix(0,2000,6)
for (i in 1:2000) {
sim_po_IS <- rpois(length(fit_pois_IS),fit_pois_IS)
fit_sim_po_IS <- glm(sim_po_IS~spray-1,data=InsectSprays,family =poisson)
matcoef[i,] <- coef(fit_sim_po_IS)
}
apply(matcoef,2,quantile,prob=c(0.025,0.975))
confint(pois_IS)
curve(dpois(4,x,log=TRUE),1,10)
abline(v=4,lty=2)
abline(h=dpois(4,4,log=TRUE),lty=2)
summary(pois_IS)
-2*sum(dpois(InsectSprays$count,fit_pois_IS ,log=TRUE))+2*length(coef(pois_IS))
AIC(pois_IS)
-2*sum(dpois(InsectSprays$count,fit_pois_IS ,log=TRUE))+
log(nrow(InsectSprays))*length(coef(pois_IS))
BIC(pois_IS)
#####
#binomial distribution
curve(1/(1+exp(-x)),-4,4)
bi_inf <- glm(case ~ spontaneous+induced, data = infert, family = binomial)
bi_inf2 <- glm(case ~ age+parity+education+spontaneous+induced,
data = infert, family = binomial)
tab1 <- table(predict(bi_inf,type="response")>0.5,infert$case)
library(ROCR)
pred <- prediction(predict(bi_inf,type="response"), infert$case)
perf <- performance(pred, "tpr", "fpr")
plot(perf)
performance(pred, "auc")
pred2 <- prediction(predict(bi_inf2,type="response"), infert$case)
perf2 <- performance(pred2, "tpr", "fpr")
plot(perf2)
performance(pred2, "auc")
bi_inf_tmp <- glm(case ~ spontaneous+induced, data = infert[-1,], family = binomial)
p_tmp1 <- predict(bi_inf_tmp,newdata=infert[1,],type="response")
dbinom(infert$case[1],1,p_tmp1,log=TRUE)
bi_inf_tmp2 <- glm(case ~ age+parity+education+spontaneous+induced,
data = infert[-1,], family = binomial)
p_tmp2 <- predict(bi_inf_tmp2,newdata=infert[1,],type="response")
dbinom(infert$case[1],1,p_tmp2,log=TRUE)
#CV
N <- nrow(infert)
logloss1 <- numeric(N)
logloss2 <- numeric(N)
for(i in 1:N){
bi_inf_tmp <- glm(case ~ spontaneous+induced, data = infert[-i,], family = binomial)
p_tmp1 <- predict(bi_inf_tmp,newdata=infert[i,],type="response")
logloss1[i] <- dbinom(infert$case[i],1,p_tmp1,log=TRUE)
## adjusted for other potential confounders:
bi_inf_tmp2 <- glm(case ~ age+parity+education+spontaneous+induced,
data = infert[-i,], family = binomial)
p_tmp2 <- predict(bi_inf_tmp2,newdata=infert[i,],type="response")
logloss2[i] <- dbinom(infert$case[i],1,p_tmp2,log=TRUE)
}
AIC(bi_inf)
-2*sum(logloss1)
AIC(bi_inf2)
-2*sum(logloss2)
# install.packages("faraway")
data(babyfood,package="faraway")
help(babyfood,package="faraway")
bi_bodyfood <-glm(cbind(disease,nondisease)~sex+food, family=binomial,data= babyfood)
summary(bi_bodyfood)
#########
library(mixtools)
s <- sample.int(2,size = 100, replace = TRUE,prob = c(0.3,0.7))
mu <- c(-2,2)
sigma <- c(1,1)
y <- rnorm(100,mu[s],sigma[s])
hist(y)
fit_simnorm <- normalmixEM(y,k = 2)
fit_simnorm$mu
fit_simnorm$sigma
fit_simnorm$lambda
# library(flowCore)
# file.name <- system.file("extdata","0877408774.B08", package="flowCore")
# datflow <- read.FCS(file.name)
# str(datflow)
# plot(as.data.frame(datflow@exprs))
# head(datflow@exprs)
# fitflow3 <- mvnormalmixEM(datflow@exprs[,-8],k=3,arbvar = FALSE)
#
# ?mvnormalmixEM
library(curatedMetagenomicData)
dat_Zeller <- ZellerG_2014.metaphlan_bugs_list.stool()
exp_Zeller <- exprs(dat_Zeller)
dim(exp_Zeller)
namelist <- strsplit(row.names(exp_Zeller),"\\|")
namelen <- sapply(namelist, length)
genusname <- sapply(namelist, function(x)x[6])
Y <- exp_Zeller[namelen==6,]
rownames(Y) <- genusname[namelen==6]
Y <- t(Y)
count <- round(Y*dat_Zeller$number_reads/100 )
set.seed(1)
fit_multmix <- multmixEM(Y,k=3)
Pmat <- fit_multmix$theta
colnames(Pmat) <- colnames(Y)
Pdf <- as.data.frame(fit$H) %>%
mutate(cluster=factor(row_number())) %>%
gather(genus,value,-cluster) %>%
group_by(cluster) %>%
arrange(desc(value)) %>%
mutate(ord=row_number()) %>%
group_by(genus) %>%
dplyr::filter(any(ord<=5)) %>%
ungroup() %>%
arrange(genus)
ggplot(Pdf,aes(x=cluster,y=genus,fill=value))+
geom_tile()+
scale_fill_continuous(low="white",high="cornflowerblue")+
theme_bw()
ggplot(Pdf,aes(x=cluster,y=value))+
geom_col()+
facet_wrap(~genus)+
theme_bw()
disease <- dat_Zeller$disease
age <- dat_Zeller$age
gender <- dat_Zeller$gender
country <- dat_Zeller$country
VBDirMult <-function(Y,L=2,alpha=rep(1,ncol(Y)),beta=rep(1,L),maxit=1000,seed=1){
set.seed(1)
N <- nrow(Y)
K <- ncol(Y)
EelW <- gtools::rdirichlet(N,beta)
EelH <- gtools::rdirichlet(L,alpha)
for(iter in 1:maxit){
Sw <- EelW * (((Y)/(EelW %*% EelH)) %*% t(EelH))
Sh <- EelH * (t(EelW) %*% (Y/(EelW %*% EelH)))
beta_W <- beta + Sw
alpha_H <- alpha + Sh
EelW <-exp(sweep(digamma(beta_W),1,digamma(rowSums(beta_W))))
EelH <-exp(sweep(digamma(alpha_H),1,digamma(rowSums(alpha_H))))
}
EW <- sweep(beta_W,1,rowSums(beta_W),"/")
EH <- sweep(alpha_H,1,rowSums(alpha_H),"/")
return(list(W=EW,H=EH))
}
fitDM <- VBDirMult(Y,L=3)
Hdf <- as.data.frame(fit$H) %>%
mutate(topic=factor(row_number())) %>%
gather(genus,value,-topic) %>%
group_by(topic) %>%
arrange(desc(value)) %>%
mutate(ord=row_number()) %>%
group_by(genus) %>%
dplyr::filter(any(ord<=5)) %>%
ungroup() %>%
arrange(genus)
ggplot(Hdf,aes(x=topic,y=genus,fill=value))+
geom_tile()
ggplot(Hdf,aes(x=topic,y=value))+
geom_col()+
facet_wrap(~genus)+
theme_bw(14)
Wdf <- as.data.frame(fit$W) %>%
arrange(V1,V1+V2) %>%
set_names(1:3) %>%
mutate(id=factor(row_number()),
disease=factor(dat_Zeller$disease,levels=c("healthy","adenoma","CRC"))) %>%
gather(topic,value,-id,-disease)
ggplot(Wdf,aes(x=id,y=value,fill=topic))+
geom_col()+
facet_grid(~disease,scales = "free",space = "free")
ggplot(Wdf,aes(x=disease,y=value))+
geom_boxplot()+
facet_wrap(~topic)
########
X <- model.matrix(~disease+age+gender+country)
dim(count)
dim(X)
cl <- makeCluster(2,type=ifelse(.Platform$OS.type=="unix","FORK","PSOCK"))
print(cl)
fits <- dmr(cl, covars=X[,-1], counts=count)
stopCluster(cl)
B <- coef(fits)
image(as.matrix(exp(X%*%B))*dat_Zeller$number_reads)
image(count)
barplot(B["diseaseCRC",])
## Fitted probability by true response
par(mfrow=c(1,1))
P <- predict(B, X[,-1], type="response")
plot(P,Y/100)
boxplot(P[cbind(1:214,fgl$type)]~fgl$type,
ylab="fitted prob of true class")
######
lm_sleep <- lm(extra~group+ID,data=sleep)
summary(lm_sleep)
with(sleep,t.test(extra[group == 2],extra[group == 1], paired = TRUE))
library(nlme)
library(lme4)
ggplot(Orthodont,aes(x=age,y=distance,group=Subject,colour=Sex))+
geom_line()+
theme_bw(14)
ggplot(Orthodont,aes(x=age,y=distance,colour=Sex))+
stat_summary(fun.y="mean",geom="line")+
stat_summary()+
theme_bw(14)
?Orthodont
ageseq <- seq(8,14,by=2)
pvalue <- numeric(4)
for (i in 1:4) {
pvalue[i] <- t.test(distance~Sex,data=Orthodont[Orthodont$age==ageseq[i],])$p.val
}
fit_Orth <- lmer(distance~factor(age)+Sex+(1|Subject),data=Orthodont)
confint(fit_Orth,method="boot")