Last active
September 26, 2017 17:49
-
-
Save Gibbsdavidl/8a20097aaf8bece8fc586310795b54da to your computer and use it in GitHub Desktop.
ANOVA comparing expression values between samples with and without a SNP.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This shows the contents of my global.R file, that contains the BigQuery SQL, googleAuthR code, and plots. | |
# global.R | |
#http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/ | |
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) { | |
library(grid) | |
# Make a list from the ... arguments and plotlist | |
plots <- c(list(...), plotlist) | |
numPlots = length(plots) | |
# If layout is NULL, then use 'cols' to determine layout | |
if (is.null(layout)) { | |
# Make the panel | |
# ncol: Number of columns of plots | |
# nrow: Number of rows needed, calculated from # of cols | |
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)), | |
ncol = cols, nrow = ceiling(numPlots/cols)) | |
} | |
if (numPlots==1) { | |
print(plots[[1]]) | |
} else { | |
# Set up the page | |
grid.newpage() | |
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout)))) | |
# Make each plot, in the correct location | |
for (i in 1:numPlots) { | |
# Get the i,j matrix positions of the regions that contain this subplot | |
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE)) | |
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row, | |
layout.pos.col = matchidx$col)) | |
} | |
} | |
} | |
buildAndRunQuery <- function(varName, aproject, cohort) { | |
q <- paste( | |
" | |
WITH | |
-- | |
-- | |
-- | |
cohortExpr AS ( | |
SELECT | |
sample_barcode, | |
LOG10(normalized_count) AS expr | |
FROM | |
`isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM` | |
WHERE | |
project_short_name = '",cohort,"' | |
AND HGNC_gene_symbol = '",varName,"' | |
AND normalized_count IS NOT NULL | |
AND normalized_count > 0), | |
-- | |
-- | |
-- | |
cohortVar AS ( ( | |
SELECT | |
sample_barcode_tumor AS sample_barcode, | |
'Mut' AS mutation_status | |
FROM | |
`isb-cgc.TCGA_hg19_data_v0.Somatic_Mutation_MC3` | |
WHERE | |
SYMBOL = '",varName,"' | |
AND Variant_Classification <> 'Silent' | |
AND Variant_Type = 'SNP' | |
AND IMPACT <> 'LOW') | |
UNION ALL ( | |
SELECT | |
sample_barcode, | |
'WT' AS mutation_status | |
FROM | |
cohortExpr | |
WHERE | |
sample_barcode NOT IN ( | |
SELECT | |
sample_barcode_tumor AS sample_barcode | |
FROM | |
`isb-cgc.TCGA_hg19_data_v0.Somatic_Mutation_MC3` | |
WHERE | |
SYMBOL = '",varName,"') ) ), | |
-- | |
-- | |
cohort AS ( | |
SELECT | |
cohortExpr.sample_barcode AS sample_barcode, | |
mutation_status, | |
expr | |
FROM | |
cohortExpr | |
JOIN | |
cohortVar | |
ON | |
cohortExpr.sample_barcode = cohortVar.sample_barcode ), | |
-- | |
-- | |
grandMeanTable AS ( | |
SELECT | |
AVG(expr) AS grand_mean | |
FROM | |
cohort ), | |
-- | |
-- | |
groupMeans1 AS ( | |
SELECT | |
'dummy' AS dummy, | |
'WT' AS group_name, | |
AVG(expr) AS group_mean, | |
STDDEV(expr) AS group_stddev, | |
COUNT(sample_barcode) AS n | |
FROM | |
cohort | |
WHERE | |
mutation_status = 'WT' ), | |
-- | |
-- | |
groupMeans2 AS ( | |
SELECT | |
'dummy' AS dummy, | |
'Mut' AS group_name, | |
AVG(expr) AS group_mean, | |
STDDEV(expr) AS group_stddev, | |
COUNT(sample_barcode) AS n | |
FROM | |
cohort | |
WHERE | |
mutation_status = 'Mut' ), | |
-- | |
-- | |
groupMeansTable AS ( ( | |
SELECT | |
* | |
FROM | |
groupMeans1) | |
UNION ALL ( | |
SELECT | |
* | |
FROM | |
groupMeans2) ), | |
-- | |
-- | |
groupMeansTableForPlots AS ( | |
SELECT | |
a.dummy AS dummy, | |
a.group_mean AS wt_group_mean, | |
a.group_stddev AS wt_group_stddev, | |
a.n AS wt_n, | |
b.group_mean AS mut_group_mean, | |
b.group_stddev AS mut_group_stddev, | |
b.n AS mut_n | |
FROM | |
groupMeans1 AS a | |
JOIN | |
groupMeans2 AS b | |
ON | |
a.dummy = b.dummy), | |
-- | |
-- to get the variance between | |
-- we sum the n*(group_mean - grand_mean)^2 | |
-- | |
ssBetween AS ( | |
SELECT | |
group_name, | |
group_mean, | |
grand_mean, | |
n, | |
n*POW(group_mean - grand_mean,2) AS n_diff_sq | |
FROM | |
groupMeansTable | |
CROSS JOIN | |
grandMeanTable ), | |
-- | |
-- Then join the group_means with the expression values | |
-- for samples with WT status | |
-- | |
ssWithin AS ( | |
SELECT | |
* | |
FROM ( | |
SELECT | |
sample_barcode, | |
mutation_status, | |
expr, | |
group_mean, | |
POW(expr - group_mean,2) AS x_minus_xbar_sq | |
FROM ( | |
SELECT | |
sample_barcode, | |
mutation_status, | |
expr | |
FROM | |
cohort | |
WHERE | |
mutation_status = 'WT' ) | |
CROSS JOIN ( | |
SELECT | |
group_mean | |
FROM | |
ssBetween | |
WHERE | |
group_name = 'WT' ) ) | |
-- | |
-- | |
UNION ALL | |
-- | |
( | |
SELECT | |
sample_barcode, | |
mutation_status, | |
expr, | |
group_mean, | |
POW(expr - group_mean,2) AS x_minus_xbar_sq | |
FROM ( | |
SELECT | |
sample_barcode, | |
mutation_status, | |
expr | |
FROM | |
cohort | |
WHERE | |
mutation_status = 'Mut' ) | |
CROSS JOIN ( | |
SELECT | |
group_mean | |
FROM | |
ssBetween | |
WHERE | |
group_name = 'Mut' ) ) ), | |
-- | |
-- | |
-- | |
numerator AS ( | |
SELECT | |
'dummy' AS dummy, | |
SUM(n_diff_sq) AS s_sq_between | |
FROM | |
ssBetween ), | |
-- | |
-- | |
-- | |
denominator AS ( | |
SELECT | |
'dummy' AS dummy, | |
COUNT(sample_barcode) AS n, | |
'2' AS k, | |
SUM(x_minus_xbar_sq)/(COUNT(sample_barcode)-2) AS s_sq_within | |
FROM | |
ssWithin ), | |
-- | |
-- | |
-- | |
Ftable AS ( | |
SELECT | |
n, | |
k, | |
s_sq_between, | |
s_sq_within, | |
s_sq_between / s_sq_within AS F, | |
c.wt_group_mean, | |
c.wt_group_stddev, | |
c.wt_n, | |
c.mut_group_mean, | |
c.mut_group_stddev, | |
c.mut_n | |
FROM | |
numerator | |
JOIN | |
denominator | |
ON | |
numerator.dummy = denominator.dummy | |
JOIN | |
groupMeansTableForPlots AS c | |
ON | |
numerator.dummy = c.dummy ) | |
SELECT | |
* | |
FROM | |
Ftable", | |
sep="") | |
#define body for the POST request Google BigQuery API | |
body = list( | |
query=q, | |
defaultDataset.projectId=aproject, | |
useLegacySql = F | |
) | |
#create a function to interact with Google BigQuery | |
f = gar_api_generator("https://www.googleapis.com/bigquery/v2", | |
"POST", | |
path_args = list(projects=aproject, queries="")) | |
#call function with body as input argument | |
response = f(the_body=body) | |
dat <- data.frame() | |
if(!is.null(response)) | |
{ | |
dat = as.data.frame(do.call("rbind",lapply(response$content$rows$f,FUN = t)), stringsAsFactors = F) | |
} | |
if (ncol(dat) == 11) { | |
colnames(dat) <- c("n", "k", "var_between", "var_within", "F_stat", | |
"wt_group_mean","wt_group_stddev","wt_n","mut_group_mean","mut_group_stddev","mut_n") | |
dat <- lapply(dat, "as.numeric") | |
} | |
return(dat) | |
} | |
getData <- function(project, cohort, varname) { | |
dat <- buildAndRunQuery(varname, project, cohort) | |
print(dat) | |
return(dat) | |
} | |
drawPlot <- function(dat) { | |
fstat <- dat$F_stat | |
kval <- dat$k | |
nval <- dat$n | |
mval1 <- dat$wt_group_mean | |
sval1 <- dat$wt_group_stddev | |
mval2 <- dat$mut_group_mean | |
sval2 <- dat$mut_group_stddev | |
x <- seq(from=0, to=fstat+2, by=0.1) | |
y <- df(df1=(nval-kval),df2=kval, x=x) | |
pval <- 1-pf(q=fstat, df1=(kval-1), df2=(nval-kval)) | |
titlestring = paste("F(", kval, ", ", (nval-kval), "), stat = ", fstat , ", p-value = ", pval, sep="") | |
p1 <- qplot(x=x, y=y, geom="line") + geom_vline(xintercept=fstat, col="red", lwd=2) + ylab("") + xlab("F Statistic") + ggtitle(titlestring) | |
p2 <- qplot(x=factor(c("WT","Mut")), y=c(mval1, mval2), xlab="WT and Mutated", ylab="log10 RNA-seq") + | |
geom_errorbar(aes( | |
ymin=c(mval1-sval1,mval2-sval2), | |
ymax=c(mval1+sval1, mval2+sval2)), width=.2) | |
multiplot(p1, p2) | |
} | |
errorPlot <- function() { | |
plot(x=1,y=1,col=0,xlab="",ylab="", axes=FALSE); | |
text(1,1,"please log in using your google ID", cex = 2) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment