Skip to content

Instantly share code, notes, and snippets.

@Gibbsdavidl
Last active September 26, 2017 17:49
Show Gist options
  • Save Gibbsdavidl/8a20097aaf8bece8fc586310795b54da to your computer and use it in GitHub Desktop.
Save Gibbsdavidl/8a20097aaf8bece8fc586310795b54da to your computer and use it in GitHub Desktop.
ANOVA comparing expression values between samples with and without a SNP.
# This shows the contents of my global.R file, that contains the BigQuery SQL, googleAuthR code, and plots.
# global.R
#http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
library(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
buildAndRunQuery <- function(varName, aproject, cohort) {
q <- paste(
"
WITH
--
--
--
cohortExpr AS (
SELECT
sample_barcode,
LOG10(normalized_count) AS expr
FROM
`isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM`
WHERE
project_short_name = '",cohort,"'
AND HGNC_gene_symbol = '",varName,"'
AND normalized_count IS NOT NULL
AND normalized_count > 0),
--
--
--
cohortVar AS ( (
SELECT
sample_barcode_tumor AS sample_barcode,
'Mut' AS mutation_status
FROM
`isb-cgc.TCGA_hg19_data_v0.Somatic_Mutation_MC3`
WHERE
SYMBOL = '",varName,"'
AND Variant_Classification <> 'Silent'
AND Variant_Type = 'SNP'
AND IMPACT <> 'LOW')
UNION ALL (
SELECT
sample_barcode,
'WT' AS mutation_status
FROM
cohortExpr
WHERE
sample_barcode NOT IN (
SELECT
sample_barcode_tumor AS sample_barcode
FROM
`isb-cgc.TCGA_hg19_data_v0.Somatic_Mutation_MC3`
WHERE
SYMBOL = '",varName,"') ) ),
--
--
cohort AS (
SELECT
cohortExpr.sample_barcode AS sample_barcode,
mutation_status,
expr
FROM
cohortExpr
JOIN
cohortVar
ON
cohortExpr.sample_barcode = cohortVar.sample_barcode ),
--
--
grandMeanTable AS (
SELECT
AVG(expr) AS grand_mean
FROM
cohort ),
--
--
groupMeans1 AS (
SELECT
'dummy' AS dummy,
'WT' AS group_name,
AVG(expr) AS group_mean,
STDDEV(expr) AS group_stddev,
COUNT(sample_barcode) AS n
FROM
cohort
WHERE
mutation_status = 'WT' ),
--
--
groupMeans2 AS (
SELECT
'dummy' AS dummy,
'Mut' AS group_name,
AVG(expr) AS group_mean,
STDDEV(expr) AS group_stddev,
COUNT(sample_barcode) AS n
FROM
cohort
WHERE
mutation_status = 'Mut' ),
--
--
groupMeansTable AS ( (
SELECT
*
FROM
groupMeans1)
UNION ALL (
SELECT
*
FROM
groupMeans2) ),
--
--
groupMeansTableForPlots AS (
SELECT
a.dummy AS dummy,
a.group_mean AS wt_group_mean,
a.group_stddev AS wt_group_stddev,
a.n AS wt_n,
b.group_mean AS mut_group_mean,
b.group_stddev AS mut_group_stddev,
b.n AS mut_n
FROM
groupMeans1 AS a
JOIN
groupMeans2 AS b
ON
a.dummy = b.dummy),
--
-- to get the variance between
-- we sum the n*(group_mean - grand_mean)^2
--
ssBetween AS (
SELECT
group_name,
group_mean,
grand_mean,
n,
n*POW(group_mean - grand_mean,2) AS n_diff_sq
FROM
groupMeansTable
CROSS JOIN
grandMeanTable ),
--
-- Then join the group_means with the expression values
-- for samples with WT status
--
ssWithin AS (
SELECT
*
FROM (
SELECT
sample_barcode,
mutation_status,
expr,
group_mean,
POW(expr - group_mean,2) AS x_minus_xbar_sq
FROM (
SELECT
sample_barcode,
mutation_status,
expr
FROM
cohort
WHERE
mutation_status = 'WT' )
CROSS JOIN (
SELECT
group_mean
FROM
ssBetween
WHERE
group_name = 'WT' ) )
--
--
UNION ALL
--
(
SELECT
sample_barcode,
mutation_status,
expr,
group_mean,
POW(expr - group_mean,2) AS x_minus_xbar_sq
FROM (
SELECT
sample_barcode,
mutation_status,
expr
FROM
cohort
WHERE
mutation_status = 'Mut' )
CROSS JOIN (
SELECT
group_mean
FROM
ssBetween
WHERE
group_name = 'Mut' ) ) ),
--
--
--
numerator AS (
SELECT
'dummy' AS dummy,
SUM(n_diff_sq) AS s_sq_between
FROM
ssBetween ),
--
--
--
denominator AS (
SELECT
'dummy' AS dummy,
COUNT(sample_barcode) AS n,
'2' AS k,
SUM(x_minus_xbar_sq)/(COUNT(sample_barcode)-2) AS s_sq_within
FROM
ssWithin ),
--
--
--
Ftable AS (
SELECT
n,
k,
s_sq_between,
s_sq_within,
s_sq_between / s_sq_within AS F,
c.wt_group_mean,
c.wt_group_stddev,
c.wt_n,
c.mut_group_mean,
c.mut_group_stddev,
c.mut_n
FROM
numerator
JOIN
denominator
ON
numerator.dummy = denominator.dummy
JOIN
groupMeansTableForPlots AS c
ON
numerator.dummy = c.dummy )
SELECT
*
FROM
Ftable",
sep="")
#define body for the POST request Google BigQuery API
body = list(
query=q,
defaultDataset.projectId=aproject,
useLegacySql = F
)
#create a function to interact with Google BigQuery
f = gar_api_generator("https://www.googleapis.com/bigquery/v2",
"POST",
path_args = list(projects=aproject, queries=""))
#call function with body as input argument
response = f(the_body=body)
dat <- data.frame()
if(!is.null(response))
{
dat = as.data.frame(do.call("rbind",lapply(response$content$rows$f,FUN = t)), stringsAsFactors = F)
}
if (ncol(dat) == 11) {
colnames(dat) <- c("n", "k", "var_between", "var_within", "F_stat",
"wt_group_mean","wt_group_stddev","wt_n","mut_group_mean","mut_group_stddev","mut_n")
dat <- lapply(dat, "as.numeric")
}
return(dat)
}
getData <- function(project, cohort, varname) {
dat <- buildAndRunQuery(varname, project, cohort)
print(dat)
return(dat)
}
drawPlot <- function(dat) {
fstat <- dat$F_stat
kval <- dat$k
nval <- dat$n
mval1 <- dat$wt_group_mean
sval1 <- dat$wt_group_stddev
mval2 <- dat$mut_group_mean
sval2 <- dat$mut_group_stddev
x <- seq(from=0, to=fstat+2, by=0.1)
y <- df(df1=(nval-kval),df2=kval, x=x)
pval <- 1-pf(q=fstat, df1=(kval-1), df2=(nval-kval))
titlestring = paste("F(", kval, ", ", (nval-kval), "), stat = ", fstat , ", p-value = ", pval, sep="")
p1 <- qplot(x=x, y=y, geom="line") + geom_vline(xintercept=fstat, col="red", lwd=2) + ylab("") + xlab("F Statistic") + ggtitle(titlestring)
p2 <- qplot(x=factor(c("WT","Mut")), y=c(mval1, mval2), xlab="WT and Mutated", ylab="log10 RNA-seq") +
geom_errorbar(aes(
ymin=c(mval1-sval1,mval2-sval2),
ymax=c(mval1+sval1, mval2+sval2)), width=.2)
multiplot(p1, p2)
}
errorPlot <- function() {
plot(x=1,y=1,col=0,xlab="",ylab="", axes=FALSE);
text(1,1,"please log in using your google ID", cex = 2)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment