R code for analysing half-life data and Monte Carlo analysis used in Barnkob et al, Frontiers in Genetics, 2014.
#CODE FOR Barnkob et al, Frontiers in Genetics, 2014 article | |
#Mike Barnkob, 2014. | |
#www.mikebarnkob.dk | |
#ANALYSIS of mRNA-PROTEIN HALF-LIFE DATA | |
#======================================= | |
#Info and packages | |
#=========== | |
#Data for nature10098-s5.csv file is found here: http://www-ncbi-nlm-nih-gov/pubmed/21593866 | |
#in supp. table 5. | |
#Needed packages | |
library(ggplot2) | |
#Bioconductor with libraries | |
source("http://bioconductor.org/biocLite.R") | |
biocLite() | |
biocLite(c("AnnotationDbi","org.Hs.eg.db","hom.Hs.inp.db", "org.Mm.eg.db")) | |
#Setup | |
#===== | |
setwd("~/Dropbox/Forskning/2014/Projects/Biosurf/R/mRNA-Protein analyse") | |
df_nature <- read.csv2("nature10098-s5.csv", as.is = TRUE, header = TRUE, sep = ";", row.names = NULL) | |
df_nature_sub <- df_nature[df_nature$Protein < 100,] #Subset with protein values below 100 | |
vec_gsurface <- read.csv("gsurface2.csv",header=F, strip.white=TRUE)$V1 #Load all CD-id's, with whit space trimmed | |
vec_gsurface <- as.character(vec_gsurface) #Converted to character vector | |
#EDA | |
#=== | |
#Histograms for all proteins / mRNA | |
plot(df_nature_sub$Protein, log="x", type='h', lwd=10, lend=1) | |
plot(df_nature_sub$mRNA, log="x", type='h', lwd=10, lend=1) | |
hist(log(df_nature_sub$Protein), lwd=1, lend=1) | |
hist(log(df_nature_sub$mRNA), lwd=1, lend=1) | |
library('ggplot2') | |
ggplot(df_nature_sub) + | |
geom_histogram(aes(x=Protein),fill = "red", alpha = 0.6, binwidth = 1) + | |
geom_histogram(aes(x=mRNA),fill = "blue", alpha = 0.6, binwidth = 1) + | |
theme(panel.grid.minor=element_blank()) | |
#Span | |
nrow(df_nature[df_nature$Protein < 100,]) | |
summary(df_nature$Protein) | |
summary(df_nature$mRNA) | |
#UNIPROT CD-MARKER SUBSET | |
#======================== | |
#Convert CD markers from human to mouse ID's | |
library(AnnotationDbi) | |
library("org.Hs.eg.db") | |
vec_gsurface_mouse <- idConverter(vec_gsurface, "HOMSA", "MUSMU", srcIDType="UNIPROT", destIDType="UNIPROT") | |
vec_gsurface_mouse <- unlist(vec_gsurface_mouse, use.names = FALSE) #create vector instead of list | |
#Find CD ID's in Nature data-set using grepl | |
#vec_gsurface <- c("Q8CGP5", "B2RRX1", "Q64525") #test vector with values known to be in df_nature for troubleshooting | |
df_combined <- sapply(vec_gsurface_mouse, grepl, df_nature$Uniprot_Ids, ignore.case=TRUE) | |
idx <- rowSums(df_combined) > 0L #Find all rows with at least 1 TRUE column | |
#Subset df_nature based on logic-matrix 'idx' | |
df_nature_cdmarkers <- df_nature[idx,] | |
#Summary | |
df_nature_cdmarkers | |
summary(df_nature_cdmarkers) | |
t.test(df_nature_cdmarkers$Protein, df_nature_cdmarkers$mRNA) | |
#Save df | |
write.table (df_nature_cdmarkers, "Output-CD-markers.csv", sep = ";") | |
#REORDERING DATA FOR BETTER VISUALISATION + ANALYSIS | |
#=================================================== | |
df_nature_sub <- df_nature[df_nature$Protein < 1000,] #Subset with protein values below 100 | |
x1 <- data.frame(cond = "All Protein's", value = df_nature_sub$Protein) | |
x2 <- data.frame(cond = "All mRNA", value = df_nature_sub$mRNA) | |
x3 <- data.frame(cond = "CD Protein's", value = df_nature_cdmarkers$Protein) | |
x4 <- data.frame(cond = "CD mRNA", value = df_nature_cdmarkers$mRNA) | |
x5 <- data.frame(cond = "All Protein's", value = df_nature$Protein) | |
#VISUALIZATION | |
library(plyr) | |
df_all <- rbind.fill(x1,x2,x3,x4) | |
summary(df_all) | |
df_cd <- rbind.fill(x3,x4) | |
#Visualization | |
library(ggplot2) | |
#All | |
ggplot(df_all, aes(x=value, fill=cond)) + | |
geom_histogram(binwidth=0.05, alpha=.3, position="identity", colour="black") + | |
scale_fill_brewer(palette="Set1") + | |
scale_x_log10() | |
#Only CD markers | |
ggplot(df_cd, aes(x=value, fill=cond)) + | |
geom_histogram(binwidth=0.05, alpha=.3, position="identity", colour="black") + | |
scale_fill_brewer(palette="Set1") + | |
scale_x_log10() | |
#FINALE Figure - All with facet | |
gg_color_hue <- function(n) { | |
hues = seq(15, 375, length=n+1) | |
hcl(h=hues, l=65, c=100)[1:n] | |
} | |
cols = gg_color_hue(10) | |
rhg_cols <- c("#F8766D", "#00b0f6", "#F8766D", "#00b0f6") | |
#Version 2 | |
img_all <- ggplot(df_all, aes(x=value, fill=cond)) + | |
geom_histogram(binwidth=0.05, alpha=1, position="identity", colour="grey") + | |
#scale_fill_brewer(palette="Set1") + | |
scale_fill_manual(values = rhg_cols) + | |
scale_x_log10(name = "Mean half-life (hours)", breaks = c(1, 10, 100, 1000)) + | |
scale_y_continuous(name = "Count") + | |
facet_grid(cond ~ ., scales="free_y") + | |
theme(panel.grid.major = element_blank(), | |
panel.grid.minor = element_blank(), | |
panel.background = element_blank(), | |
axis.line = element_line(colour = "black"), | |
title = element_text(color = "black", size = 14, face="bold"), | |
strip.text.y = element_text(size = 7 | |
)) + | |
guides(fill=FALSE) #no legend | |
img_all | |
#Save image | |
tiff(filename="mRNA-Protein-ver4.tiff", | |
res = 900, | |
height = 85, | |
width = 100, | |
units = "mm" | |
) | |
img_all | |
dev.off() | |
#Citation | |
citation("AnnotationDbi") | |
#ANALYSIS | |
mean(x1$value, na.rm = T) | |
mean(x2$value, na.rm = T) | |
mean(x3$value, na.rm = T) | |
mean(x4$value, na.rm = T) | |
#MONTE CARLO ANALYSIS ON mRNA-Protein data | |
#Used in http://www.ncbi.nlm.nih.gov/pubmed/25309582 | |
#Mike Barnkob, 2014 | |
#========================================= | |
#Setup | |
#===== | |
setwd("~/Dropbox/Forskning/2014/Projects/Biosurf/R/mRNA-Protein analyse") | |
#Simulation | |
#========== | |
n <- 10000 | |
avrg <- vector("numeric", n) | |
for (i in 1:n) | |
{ | |
x <- sample(x5$value, 30, replace=TRUE) | |
avrg[i] <- mean(x) | |
} | |
hist(avrg, main = "10.000 random samples of 30 proteins", xlab = "Averages of half-lives") | |
abline(v=min(avrg),col="red") | |
abline(v=max(avrg), col="red") | |
sum(avrg < 0.18)/n | |
min(avrg) | |
max(avrg) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment