mbarnkob/2014-Barnkob-Frontiers_in_Genetics.R

## 2014-Barnkob-Frontiers_in_Genetics.R
#CODE FOR Barnkob et al, Frontiers in Genetics, 2014 article
#Mike Barnkob, 2014.
#www.mikebarnkob.dk

#ANALYSIS of mRNA-PROTEIN HALF-LIFE DATA
#=======================================

#Info and packages
#===========

#Data for nature10098-s5.csv file is found here: http://www-ncbi-nlm-nih-gov/pubmed/21593866
#in supp. table 5.

#Needed packages
library(ggplot2)

#Bioconductor with libraries
source("http://bioconductor.org/biocLite.R")
biocLite()
biocLite(c("AnnotationDbi","org.Hs.eg.db","hom.Hs.inp.db", "org.Mm.eg.db"))

#Setup
#=====

setwd("~/Dropbox/Forskning/2014/Projects/Biosurf/R/mRNA-Protein analyse")

df_nature <- read.csv2("nature10098-s5.csv", as.is = TRUE, header = TRUE, sep = ";", row.names = NULL)
df_nature_sub <- df_nature[df_nature$Protein < 100,] #Subset with protein values below 100

vec_gsurface <- read.csv("gsurface2.csv",header=F, strip.white=TRUE)$V1 #Load all CD-id's, with whit space trimmed
vec_gsurface <- as.character(vec_gsurface) #Converted to character vector

#EDA
#===

#Histograms for all proteins / mRNA

plot(df_nature_sub$Protein, log="x", type='h', lwd=10, lend=1)
plot(df_nature_sub$mRNA, log="x", type='h', lwd=10, lend=1)
hist(log(df_nature_sub$Protein), lwd=1, lend=1)
hist(log(df_nature_sub$mRNA), lwd=1, lend=1)

library('ggplot2')
ggplot(df_nature_sub) +
  geom_histogram(aes(x=Protein),fill = "red", alpha = 0.6, binwidth = 1) +
  geom_histogram(aes(x=mRNA),fill = "blue", alpha = 0.6, binwidth = 1) +
  theme(panel.grid.minor=element_blank())

#Span
nrow(df_nature[df_nature$Protein < 100,])
summary(df_nature$Protein)
summary(df_nature$mRNA)

#UNIPROT CD-MARKER SUBSET
#========================

#Convert CD markers from human to mouse ID's
library(AnnotationDbi)
library("org.Hs.eg.db")
vec_gsurface_mouse <- idConverter(vec_gsurface, "HOMSA", "MUSMU", srcIDType="UNIPROT", destIDType="UNIPROT")
vec_gsurface_mouse <- unlist(vec_gsurface_mouse, use.names = FALSE) #create vector instead of list

#Find CD ID's in Nature data-set using grepl
#vec_gsurface <- c("Q8CGP5", "B2RRX1", "Q64525") #test vector with values known to be in df_nature for troubleshooting
df_combined <- sapply(vec_gsurface_mouse, grepl, df_nature$Uniprot_Ids, ignore.case=TRUE)

idx <- rowSums(df_combined) > 0L #Find all rows with at least 1 TRUE column

#Subset df_nature based on logic-matrix 'idx'
df_nature_cdmarkers <- df_nature[idx,]

#Summary
df_nature_cdmarkers
summary(df_nature_cdmarkers)
t.test(df_nature_cdmarkers$Protein, df_nature_cdmarkers$mRNA)

#Save df
write.table (df_nature_cdmarkers, "Output-CD-markers.csv", sep  = ";")


#REORDERING DATA FOR BETTER VISUALISATION + ANALYSIS
#===================================================

df_nature_sub <- df_nature[df_nature$Protein < 1000,] #Subset with protein values below 100
x1 <- data.frame(cond = "All Protein's", value = df_nature_sub$Protein)
x2 <- data.frame(cond = "All mRNA", value = df_nature_sub$mRNA)
x3 <- data.frame(cond = "CD Protein's", value = df_nature_cdmarkers$Protein)
x4 <- data.frame(cond = "CD mRNA", value = df_nature_cdmarkers$mRNA)
x5 <- data.frame(cond = "All Protein's", value = df_nature$Protein)

#VISUALIZATION

library(plyr)
df_all <- rbind.fill(x1,x2,x3,x4)
summary(df_all)
df_cd <- rbind.fill(x3,x4)

#Visualization
library(ggplot2)

#All
ggplot(df_all, aes(x=value, fill=cond)) +
  geom_histogram(binwidth=0.05, alpha=.3, position="identity", colour="black") +
  scale_fill_brewer(palette="Set1") +
  scale_x_log10()

#Only CD markers
ggplot(df_cd, aes(x=value, fill=cond)) +
  geom_histogram(binwidth=0.05, alpha=.3, position="identity", colour="black") +
  scale_fill_brewer(palette="Set1") +
  scale_x_log10()

#FINALE Figure - All with facet

gg_color_hue <- function(n) {
  hues = seq(15, 375, length=n+1)
  hcl(h=hues, l=65, c=100)[1:n]
}
cols = gg_color_hue(10)
rhg_cols <- c("#F8766D", "#00b0f6", "#F8766D", "#00b0f6")

#Version 2
img_all <- ggplot(df_all, aes(x=value, fill=cond)) +
  geom_histogram(binwidth=0.05, alpha=1, position="identity", colour="grey") +
  #scale_fill_brewer(palette="Set1") +
  scale_fill_manual(values = rhg_cols) +
  scale_x_log10(name = "Mean half-life (hours)", breaks = c(1, 10, 100, 1000)) +
  scale_y_continuous(name = "Count") +
  facet_grid(cond ~ ., scales="free_y") +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black"),
        title = element_text(color = "black", size = 14, face="bold"),
        strip.text.y = element_text(size = 7
        )) +
  guides(fill=FALSE) #no legend

img_all

#Save image
tiff(filename="mRNA-Protein-ver4.tiff",
     res = 900,
     height = 85,
     width = 100,
     units = "mm"
     )
img_all
dev.off()

#Citation
citation("AnnotationDbi")

#ANALYSIS

mean(x1$value, na.rm = T)
mean(x2$value, na.rm = T)
mean(x3$value, na.rm = T)
mean(x4$value, na.rm = T)


#MONTE CARLO ANALYSIS ON mRNA-Protein data
#Used in http://www.ncbi.nlm.nih.gov/pubmed/25309582
#Mike Barnkob, 2014
#=========================================

#Setup
#=====

setwd("~/Dropbox/Forskning/2014/Projects/Biosurf/R/mRNA-Protein analyse")

#Simulation
#==========

n <- 10000
avrg <- vector("numeric", n)
for (i in 1:n)
{
  x <- sample(x5$value, 30, replace=TRUE)
  avrg[i] <- mean(x)
}

hist(avrg, main = "10.000 random samples of 30 proteins", xlab = "Averages of half-lives")
abline(v=min(avrg),col="red")
abline(v=max(avrg), col="red")

sum(avrg < 0.18)/n
min(avrg)
max(avrg)
	#CODE FOR Barnkob et al, Frontiers in Genetics, 2014 article
	#Mike Barnkob, 2014.
	#www.mikebarnkob.dk

	#ANALYSIS of mRNA-PROTEIN HALF-LIFE DATA
	#=======================================

	#Info and packages
	#===========

	#Data for nature10098-s5.csv file is found here: http://www-ncbi-nlm-nih-gov/pubmed/21593866
	#in supp. table 5.

	#Needed packages
	library(ggplot2)

	#Bioconductor with libraries
	source("http://bioconductor.org/biocLite.R")
	biocLite()
	biocLite(c("AnnotationDbi","org.Hs.eg.db","hom.Hs.inp.db", "org.Mm.eg.db"))

	#Setup
	#=====

	setwd("~/Dropbox/Forskning/2014/Projects/Biosurf/R/mRNA-Protein analyse")

	df_nature <- read.csv2("nature10098-s5.csv", as.is = TRUE, header = TRUE, sep = ";", row.names = NULL)
	df_nature_sub <- df_nature[df_nature$Protein < 100,] #Subset with protein values below 100

	vec_gsurface <- read.csv("gsurface2.csv",header=F, strip.white=TRUE)$V1 #Load all CD-id's, with whit space trimmed
	vec_gsurface <- as.character(vec_gsurface) #Converted to character vector

	#EDA
	#===

	#Histograms for all proteins / mRNA

	plot(df_nature_sub$Protein, log="x", type='h', lwd=10, lend=1)
	plot(df_nature_sub$mRNA, log="x", type='h', lwd=10, lend=1)
	hist(log(df_nature_sub$Protein), lwd=1, lend=1)
	hist(log(df_nature_sub$mRNA), lwd=1, lend=1)

	library('ggplot2')
	ggplot(df_nature_sub) +
	geom_histogram(aes(x=Protein),fill = "red", alpha = 0.6, binwidth = 1) +
	geom_histogram(aes(x=mRNA),fill = "blue", alpha = 0.6, binwidth = 1) +
	theme(panel.grid.minor=element_blank())

	#Span
	nrow(df_nature[df_nature$Protein < 100,])
	summary(df_nature$Protein)
	summary(df_nature$mRNA)

	#UNIPROT CD-MARKER SUBSET
	#========================

	#Convert CD markers from human to mouse ID's
	library(AnnotationDbi)
	library("org.Hs.eg.db")
	vec_gsurface_mouse <- idConverter(vec_gsurface, "HOMSA", "MUSMU", srcIDType="UNIPROT", destIDType="UNIPROT")
	vec_gsurface_mouse <- unlist(vec_gsurface_mouse, use.names = FALSE) #create vector instead of list

	#Find CD ID's in Nature data-set using grepl
	#vec_gsurface <- c("Q8CGP5", "B2RRX1", "Q64525") #test vector with values known to be in df_nature for troubleshooting
	df_combined <- sapply(vec_gsurface_mouse, grepl, df_nature$Uniprot_Ids, ignore.case=TRUE)

	idx <- rowSums(df_combined) > 0L #Find all rows with at least 1 TRUE column

	#Subset df_nature based on logic-matrix 'idx'
	df_nature_cdmarkers <- df_nature[idx,]

	#Summary
	df_nature_cdmarkers
	summary(df_nature_cdmarkers)
	t.test(df_nature_cdmarkers$Protein, df_nature_cdmarkers$mRNA)

	#Save df
	write.table (df_nature_cdmarkers, "Output-CD-markers.csv", sep = ";")


	#REORDERING DATA FOR BETTER VISUALISATION + ANALYSIS
	#===================================================

	df_nature_sub <- df_nature[df_nature$Protein < 1000,] #Subset with protein values below 100
	x1 <- data.frame(cond = "All Protein's", value = df_nature_sub$Protein)
	x2 <- data.frame(cond = "All mRNA", value = df_nature_sub$mRNA)
	x3 <- data.frame(cond = "CD Protein's", value = df_nature_cdmarkers$Protein)
	x4 <- data.frame(cond = "CD mRNA", value = df_nature_cdmarkers$mRNA)
	x5 <- data.frame(cond = "All Protein's", value = df_nature$Protein)

	#VISUALIZATION

	library(plyr)
	df_all <- rbind.fill(x1,x2,x3,x4)
	summary(df_all)
	df_cd <- rbind.fill(x3,x4)

	#Visualization
	library(ggplot2)

	#All
	ggplot(df_all, aes(x=value, fill=cond)) +
	geom_histogram(binwidth=0.05, alpha=.3, position="identity", colour="black") +
	scale_fill_brewer(palette="Set1") +
	scale_x_log10()

	#Only CD markers
	ggplot(df_cd, aes(x=value, fill=cond)) +
	geom_histogram(binwidth=0.05, alpha=.3, position="identity", colour="black") +
	scale_fill_brewer(palette="Set1") +
	scale_x_log10()

	#FINALE Figure - All with facet

	gg_color_hue <- function(n) {
	hues = seq(15, 375, length=n+1)
	hcl(h=hues, l=65, c=100)[1:n]
	}
	cols = gg_color_hue(10)
	rhg_cols <- c("#F8766D", "#00b0f6", "#F8766D", "#00b0f6")

	#Version 2
	img_all <- ggplot(df_all, aes(x=value, fill=cond)) +
	geom_histogram(binwidth=0.05, alpha=1, position="identity", colour="grey") +
	#scale_fill_brewer(palette="Set1") +
	scale_fill_manual(values = rhg_cols) +
	scale_x_log10(name = "Mean half-life (hours)", breaks = c(1, 10, 100, 1000)) +
	scale_y_continuous(name = "Count") +
	facet_grid(cond ~ ., scales="free_y") +
	theme(panel.grid.major = element_blank(),
	panel.grid.minor = element_blank(),
	panel.background = element_blank(),
	axis.line = element_line(colour = "black"),
	title = element_text(color = "black", size = 14, face="bold"),
	strip.text.y = element_text(size = 7
	)) +
	guides(fill=FALSE) #no legend

	img_all

	#Save image
	tiff(filename="mRNA-Protein-ver4.tiff",
	res = 900,
	height = 85,
	width = 100,
	units = "mm"
	)
	img_all
	dev.off()

	#Citation
	citation("AnnotationDbi")

	#ANALYSIS

	mean(x1$value, na.rm = T)
	mean(x2$value, na.rm = T)
	mean(x3$value, na.rm = T)
	mean(x4$value, na.rm = T)


	#MONTE CARLO ANALYSIS ON mRNA-Protein data
	#Used in http://www.ncbi.nlm.nih.gov/pubmed/25309582
	#Mike Barnkob, 2014
	#=========================================

	#Setup
	#=====

	setwd("~/Dropbox/Forskning/2014/Projects/Biosurf/R/mRNA-Protein analyse")

	#Simulation
	#==========

	n <- 10000
	avrg <- vector("numeric", n)
	for (i in 1:n)
	{
	x <- sample(x5$value, 30, replace=TRUE)
	avrg[i] <- mean(x)
	}

	hist(avrg, main = "10.000 random samples of 30 proteins", xlab = "Averages of half-lives")
	abline(v=min(avrg),col="red")
	abline(v=max(avrg), col="red")

	sum(avrg < 0.18)/n
	min(avrg)
	max(avrg)