R code for analysing half-life data and Monte Carlo analysis used in Barnkob et al, Frontiers in Genetics, 2014.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#CODE FOR Barnkob et al, Frontiers in Genetics, 2014 article | |
#Mike Barnkob, 2014. | |
#www.mikebarnkob.dk | |
#ANALYSIS of mRNA-PROTEIN HALF-LIFE DATA | |
#======================================= | |
#Info and packages | |
#=========== | |
#Data for nature10098-s5.csv file is found here: http://www-ncbi-nlm-nih-gov/pubmed/21593866 | |
#in supp. table 5. | |
#Needed packages | |
library(ggplot2) | |
#Bioconductor with libraries | |
source("http://bioconductor.org/biocLite.R") | |
biocLite() | |
biocLite(c("AnnotationDbi","org.Hs.eg.db","hom.Hs.inp.db", "org.Mm.eg.db")) | |
#Setup | |
#===== | |
setwd("~/Dropbox/Forskning/2014/Projects/Biosurf/R/mRNA-Protein analyse") | |
df_nature <- read.csv2("nature10098-s5.csv", as.is = TRUE, header = TRUE, sep = ";", row.names = NULL) | |
df_nature_sub <- df_nature[df_nature$Protein < 100,] #Subset with protein values below 100 | |
vec_gsurface <- read.csv("gsurface2.csv",header=F, strip.white=TRUE)$V1 #Load all CD-id's, with whit space trimmed | |
vec_gsurface <- as.character(vec_gsurface) #Converted to character vector | |
#EDA | |
#=== | |
#Histograms for all proteins / mRNA | |
plot(df_nature_sub$Protein, log="x", type='h', lwd=10, lend=1) | |
plot(df_nature_sub$mRNA, log="x", type='h', lwd=10, lend=1) | |
hist(log(df_nature_sub$Protein), lwd=1, lend=1) | |
hist(log(df_nature_sub$mRNA), lwd=1, lend=1) | |
library('ggplot2') | |
ggplot(df_nature_sub) + | |
geom_histogram(aes(x=Protein),fill = "red", alpha = 0.6, binwidth = 1) + | |
geom_histogram(aes(x=mRNA),fill = "blue", alpha = 0.6, binwidth = 1) + | |
theme(panel.grid.minor=element_blank()) | |
#Span | |
nrow(df_nature[df_nature$Protein < 100,]) | |
summary(df_nature$Protein) | |
summary(df_nature$mRNA) | |
#UNIPROT CD-MARKER SUBSET | |
#======================== | |
#Convert CD markers from human to mouse ID's | |
library(AnnotationDbi) | |
library("org.Hs.eg.db") | |
vec_gsurface_mouse <- idConverter(vec_gsurface, "HOMSA", "MUSMU", srcIDType="UNIPROT", destIDType="UNIPROT") | |
vec_gsurface_mouse <- unlist(vec_gsurface_mouse, use.names = FALSE) #create vector instead of list | |
#Find CD ID's in Nature data-set using grepl | |
#vec_gsurface <- c("Q8CGP5", "B2RRX1", "Q64525") #test vector with values known to be in df_nature for troubleshooting | |
df_combined <- sapply(vec_gsurface_mouse, grepl, df_nature$Uniprot_Ids, ignore.case=TRUE) | |
idx <- rowSums(df_combined) > 0L #Find all rows with at least 1 TRUE column | |
#Subset df_nature based on logic-matrix 'idx' | |
df_nature_cdmarkers <- df_nature[idx,] | |
#Summary | |
df_nature_cdmarkers | |
summary(df_nature_cdmarkers) | |
t.test(df_nature_cdmarkers$Protein, df_nature_cdmarkers$mRNA) | |
#Save df | |
write.table (df_nature_cdmarkers, "Output-CD-markers.csv", sep = ";") | |
#REORDERING DATA FOR BETTER VISUALISATION + ANALYSIS | |
#=================================================== | |
df_nature_sub <- df_nature[df_nature$Protein < 1000,] #Subset with protein values below 100 | |
x1 <- data.frame(cond = "All Protein's", value = df_nature_sub$Protein) | |
x2 <- data.frame(cond = "All mRNA", value = df_nature_sub$mRNA) | |
x3 <- data.frame(cond = "CD Protein's", value = df_nature_cdmarkers$Protein) | |
x4 <- data.frame(cond = "CD mRNA", value = df_nature_cdmarkers$mRNA) | |
x5 <- data.frame(cond = "All Protein's", value = df_nature$Protein) | |
#VISUALIZATION | |
library(plyr) | |
df_all <- rbind.fill(x1,x2,x3,x4) | |
summary(df_all) | |
df_cd <- rbind.fill(x3,x4) | |
#Visualization | |
library(ggplot2) | |
#All | |
ggplot(df_all, aes(x=value, fill=cond)) + | |
geom_histogram(binwidth=0.05, alpha=.3, position="identity", colour="black") + | |
scale_fill_brewer(palette="Set1") + | |
scale_x_log10() | |
#Only CD markers | |
ggplot(df_cd, aes(x=value, fill=cond)) + | |
geom_histogram(binwidth=0.05, alpha=.3, position="identity", colour="black") + | |
scale_fill_brewer(palette="Set1") + | |
scale_x_log10() | |
#FINALE Figure - All with facet | |
gg_color_hue <- function(n) { | |
hues = seq(15, 375, length=n+1) | |
hcl(h=hues, l=65, c=100)[1:n] | |
} | |
cols = gg_color_hue(10) | |
rhg_cols <- c("#F8766D", "#00b0f6", "#F8766D", "#00b0f6") | |
#Version 2 | |
img_all <- ggplot(df_all, aes(x=value, fill=cond)) + | |
geom_histogram(binwidth=0.05, alpha=1, position="identity", colour="grey") + | |
#scale_fill_brewer(palette="Set1") + | |
scale_fill_manual(values = rhg_cols) + | |
scale_x_log10(name = "Mean half-life (hours)", breaks = c(1, 10, 100, 1000)) + | |
scale_y_continuous(name = "Count") + | |
facet_grid(cond ~ ., scales="free_y") + | |
theme(panel.grid.major = element_blank(), | |
panel.grid.minor = element_blank(), | |
panel.background = element_blank(), | |
axis.line = element_line(colour = "black"), | |
title = element_text(color = "black", size = 14, face="bold"), | |
strip.text.y = element_text(size = 7 | |
)) + | |
guides(fill=FALSE) #no legend | |
img_all | |
#Save image | |
tiff(filename="mRNA-Protein-ver4.tiff", | |
res = 900, | |
height = 85, | |
width = 100, | |
units = "mm" | |
) | |
img_all | |
dev.off() | |
#Citation | |
citation("AnnotationDbi") | |
#ANALYSIS | |
mean(x1$value, na.rm = T) | |
mean(x2$value, na.rm = T) | |
mean(x3$value, na.rm = T) | |
mean(x4$value, na.rm = T) | |
#MONTE CARLO ANALYSIS ON mRNA-Protein data | |
#Used in http://www.ncbi.nlm.nih.gov/pubmed/25309582 | |
#Mike Barnkob, 2014 | |
#========================================= | |
#Setup | |
#===== | |
setwd("~/Dropbox/Forskning/2014/Projects/Biosurf/R/mRNA-Protein analyse") | |
#Simulation | |
#========== | |
n <- 10000 | |
avrg <- vector("numeric", n) | |
for (i in 1:n) | |
{ | |
x <- sample(x5$value, 30, replace=TRUE) | |
avrg[i] <- mean(x) | |
} | |
hist(avrg, main = "10.000 random samples of 30 proteins", xlab = "Averages of half-lives") | |
abline(v=min(avrg),col="red") | |
abline(v=max(avrg), col="red") | |
sum(avrg < 0.18)/n | |
min(avrg) | |
max(avrg) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment