GabrielHoffman/mash_analysis.R

## mash_analysis.R

# Example code for analysis of
# 1) Trait specificity
# 2) Cell-type specificity


# 1) Trait specificity
######################

library(arrow)
library(tidyverse)
library(dreamlet)

df_prob = read_parquet('mashr_posterior.parquet')

# probabilities from first dataset
df1 = df_prob %>%
		filter( Dataset == "MSSM",
			AnnoLevel == "class",
			coef == "c02xAD - c02xControl"	) %>%
		pivot_wider(names_from = "assay", values_from="Probability")

# probabilities from second dataset
df2 = df_prob %>%
		filter( Dataset == "RUSH",
			AnnoLevel == "class",
			coef == "c03xAD - c03xControl"	) %>%
		pivot_wider(names_from = "assay", values_from="Probability")

df_join = inner_join(df1, df2, by="ID") %>%
			select( -coef.x, -AnnoLevel.x, -Dataset.x,
				 	-coef.y, -AnnoLevel.y, -Dataset.y)

# Identify genes differentially expressed in Immune in both datsets, but no other cell types
include = c('Immune.x', 'Immune.y')
exclude = setdiff(colnames(df_join)[-1], include)

res = df_join %>%
		column_to_rownames(var='ID') %>%
		filter(rowSums(is.na(.)) != ncol(.)) %>%
		compositePosteriorTest(
			include = include,
			exclude = exclude,
			test = "all")

# top genes
head(sort(res, decreasing=TRUE))


# Identify genes differentially expressed in Immune in both datsets, regardless of results in other cell types
include = c('Immune.x', 'Immune.y')
exclude = NULL

res = df_join %>%
		column_to_rownames(var='ID') %>%
		filter(rowSums(is.na(.)) != ncol(.)) %>%
		compositePosteriorTest(
			include = include,
			exclude = exclude,
			test = "all")

# top genes
head(sort(res, decreasing=TRUE))


# Identify genes differentially expressed in at least one neuron type
include = c('EN.x', 'EN.y', "IN.x", "IN.y")
exclude = setdiff(colnames(df_join)[-1], include)

res = df_join %>%
		column_to_rownames(var='ID') %>%
		filter(rowSums(is.na(.)) != ncol(.)) %>%
		compositePosteriorTest(
			include = include,
			exclude = exclude)

# top genes
head(sort(res, decreasing=TRUE))


# 2) Cell-type specificity
##########################

library(tidyverse)
library(arrow)
library(ggplot2)
library(dreamlet)

df = read_parquet("mashr_posterior.parquet")

plot_heat = function(df, ids){

	df %>%
		filter(ID %in% ids) %>%
		ggplot(aes(ID, assay, fill=Probability)) +
			geom_tile() +
			theme_classic(22) +
			scale_fill_gradient(limits=c(0, 1),low="white", high="red",na.value = "grey92", name="Probability\nnon-zero") +
			coord_equal() +
			 theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
			 xlab('') +
			 ylab('')
}


# filter
df_sub = df %>%
	filter(Dataset == 'meta',
		coef == 'm01x', # AD vs controls
		AnnoLevel == 'subclass')

# Define query
# Microglia
cellTypes = unique(df_sub$assay)
include = c("Micro")
exclude = c(cellTypes[cellTypes!=include])

# rank genes based on query
res = df_sub %>%
	pivot_wider(names_from = "assay", values_from="Probability")  %>%
		select( -coef, -AnnoLevel, -Dataset) %>%
	column_to_rownames(var='ID') %>%
	filter(rowSums(is.na(.)) != ncol(.)) %>%
	compositePosteriorTest(include, exclude) %>%
	data.frame(prob=.) %>%
	arrange(prob)

genes = rownames(tail(res))

plot_heat( df_sub, genes )


# Only EN
include = grep("^EN", cellTypes, value=TRUE)
exclude = cellTypes[!cellTypes %in% include]

# rank genes based on query
res = df_sub %>%
	pivot_wider(names_from = "assay", values_from="Probability")  %>%
		select( -coef, -AnnoLevel, -Dataset) %>%
	column_to_rownames(var='ID') %>%
	filter(rowSums(is.na(.)) != ncol(.)) %>%
	filter(rowSums(!is.na(.)) >= 7) %>% # gene must be expressed in at least 7 cell types
	compositePosteriorTest(include, exclude) %>%
	data.frame(prob=.) %>%
	arrange(prob)

genes = rownames(tail(res))

plot_heat( df_sub, genes )

	# Example code for analysis of
	# 1) Trait specificity
	# 2) Cell-type specificity


	# 1) Trait specificity
	######################

	library(arrow)
	library(tidyverse)
	library(dreamlet)

	df_prob = read_parquet('mashr_posterior.parquet')

	# probabilities from first dataset
	df1 = df_prob %>%
	filter( Dataset == "MSSM",
	AnnoLevel == "class",
	coef == "c02xAD - c02xControl" ) %>%
	pivot_wider(names_from = "assay", values_from="Probability")

	# probabilities from second dataset
	df2 = df_prob %>%
	filter( Dataset == "RUSH",
	AnnoLevel == "class",
	coef == "c03xAD - c03xControl" ) %>%
	pivot_wider(names_from = "assay", values_from="Probability")

	df_join = inner_join(df1, df2, by="ID") %>%
	select( -coef.x, -AnnoLevel.x, -Dataset.x,
	-coef.y, -AnnoLevel.y, -Dataset.y)

	# Identify genes differentially expressed in Immune in both datsets, but no other cell types
	include = c('Immune.x', 'Immune.y')
	exclude = setdiff(colnames(df_join)[-1], include)

	res = df_join %>%
	column_to_rownames(var='ID') %>%
	filter(rowSums(is.na(.)) != ncol(.)) %>%
	compositePosteriorTest(
	include = include,
	exclude = exclude,
	test = "all")

	# top genes
	head(sort(res, decreasing=TRUE))


	# Identify genes differentially expressed in Immune in both datsets, regardless of results in other cell types
	include = c('Immune.x', 'Immune.y')
	exclude = NULL

	res = df_join %>%
	column_to_rownames(var='ID') %>%
	filter(rowSums(is.na(.)) != ncol(.)) %>%
	compositePosteriorTest(
	include = include,
	exclude = exclude,
	test = "all")

	# top genes
	head(sort(res, decreasing=TRUE))


	# Identify genes differentially expressed in at least one neuron type
	include = c('EN.x', 'EN.y', "IN.x", "IN.y")
	exclude = setdiff(colnames(df_join)[-1], include)

	res = df_join %>%
	column_to_rownames(var='ID') %>%
	filter(rowSums(is.na(.)) != ncol(.)) %>%
	compositePosteriorTest(
	include = include,
	exclude = exclude)

	# top genes
	head(sort(res, decreasing=TRUE))



	# 2) Cell-type specificity
	##########################

	library(tidyverse)
	library(arrow)
	library(ggplot2)
	library(dreamlet)

	df = read_parquet("mashr_posterior.parquet")

	plot_heat = function(df, ids){

	df %>%
	filter(ID %in% ids) %>%
	ggplot(aes(ID, assay, fill=Probability)) +
	geom_tile() +
	theme_classic(22) +
	scale_fill_gradient(limits=c(0, 1),low="white", high="red",na.value = "grey92", name="Probability\nnon-zero") +
	coord_equal() +
	theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
	xlab('') +
	ylab('')
	}


	# filter
	df_sub = df %>%
	filter(Dataset == 'meta',
	coef == 'm01x', # AD vs controls
	AnnoLevel == 'subclass')

	# Define query
	# Microglia
	cellTypes = unique(df_sub$assay)
	include = c("Micro")
	exclude = c(cellTypes[cellTypes!=include])

	# rank genes based on query
	res = df_sub %>%
	pivot_wider(names_from = "assay", values_from="Probability") %>%
	select( -coef, -AnnoLevel, -Dataset) %>%
	column_to_rownames(var='ID') %>%
	filter(rowSums(is.na(.)) != ncol(.)) %>%
	compositePosteriorTest(include, exclude) %>%
	data.frame(prob=.) %>%
	arrange(prob)

	genes = rownames(tail(res))

	plot_heat( df_sub, genes )


	# Only EN
	include = grep("^EN", cellTypes, value=TRUE)
	exclude = cellTypes[!cellTypes %in% include]

	# rank genes based on query
	res = df_sub %>%
	pivot_wider(names_from = "assay", values_from="Probability") %>%
	select( -coef, -AnnoLevel, -Dataset) %>%
	column_to_rownames(var='ID') %>%
	filter(rowSums(is.na(.)) != ncol(.)) %>%
	filter(rowSums(!is.na(.)) >= 7) %>% # gene must be expressed in at least 7 cell types
	compositePosteriorTest(include, exclude) %>%
	data.frame(prob=.) %>%
	arrange(prob)

	genes = rownames(tail(res))

	plot_heat( df_sub, genes )