Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@mbarnkob
Last active January 2, 2016 03:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mbarnkob/70ea02e4011c4a442b6a to your computer and use it in GitHub Desktop.
Save mbarnkob/70ea02e4011c4a442b6a to your computer and use it in GitHub Desktop.
R code for analysing half-life data and Monte Carlo analysis used in Barnkob et al, Frontiers in Genetics, 2014.
#CODE FOR Barnkob et al, Frontiers in Genetics, 2014 article
#Mike Barnkob, 2014.
#www.mikebarnkob.dk
#ANALYSIS of mRNA-PROTEIN HALF-LIFE DATA
#=======================================
#Info and packages
#===========
#Data for nature10098-s5.csv file is found here: http://www-ncbi-nlm-nih-gov/pubmed/21593866
#in supp. table 5.
#Needed packages
library(ggplot2)
#Bioconductor with libraries
source("http://bioconductor.org/biocLite.R")
biocLite()
biocLite(c("AnnotationDbi","org.Hs.eg.db","hom.Hs.inp.db", "org.Mm.eg.db"))
#Setup
#=====
setwd("~/Dropbox/Forskning/2014/Projects/Biosurf/R/mRNA-Protein analyse")
df_nature <- read.csv2("nature10098-s5.csv", as.is = TRUE, header = TRUE, sep = ";", row.names = NULL)
df_nature_sub <- df_nature[df_nature$Protein < 100,] #Subset with protein values below 100
vec_gsurface <- read.csv("gsurface2.csv",header=F, strip.white=TRUE)$V1 #Load all CD-id's, with whit space trimmed
vec_gsurface <- as.character(vec_gsurface) #Converted to character vector
#EDA
#===
#Histograms for all proteins / mRNA
plot(df_nature_sub$Protein, log="x", type='h', lwd=10, lend=1)
plot(df_nature_sub$mRNA, log="x", type='h', lwd=10, lend=1)
hist(log(df_nature_sub$Protein), lwd=1, lend=1)
hist(log(df_nature_sub$mRNA), lwd=1, lend=1)
library('ggplot2')
ggplot(df_nature_sub) +
geom_histogram(aes(x=Protein),fill = "red", alpha = 0.6, binwidth = 1) +
geom_histogram(aes(x=mRNA),fill = "blue", alpha = 0.6, binwidth = 1) +
theme(panel.grid.minor=element_blank())
#Span
nrow(df_nature[df_nature$Protein < 100,])
summary(df_nature$Protein)
summary(df_nature$mRNA)
#UNIPROT CD-MARKER SUBSET
#========================
#Convert CD markers from human to mouse ID's
library(AnnotationDbi)
library("org.Hs.eg.db")
vec_gsurface_mouse <- idConverter(vec_gsurface, "HOMSA", "MUSMU", srcIDType="UNIPROT", destIDType="UNIPROT")
vec_gsurface_mouse <- unlist(vec_gsurface_mouse, use.names = FALSE) #create vector instead of list
#Find CD ID's in Nature data-set using grepl
#vec_gsurface <- c("Q8CGP5", "B2RRX1", "Q64525") #test vector with values known to be in df_nature for troubleshooting
df_combined <- sapply(vec_gsurface_mouse, grepl, df_nature$Uniprot_Ids, ignore.case=TRUE)
idx <- rowSums(df_combined) > 0L #Find all rows with at least 1 TRUE column
#Subset df_nature based on logic-matrix 'idx'
df_nature_cdmarkers <- df_nature[idx,]
#Summary
df_nature_cdmarkers
summary(df_nature_cdmarkers)
t.test(df_nature_cdmarkers$Protein, df_nature_cdmarkers$mRNA)
#Save df
write.table (df_nature_cdmarkers, "Output-CD-markers.csv", sep = ";")
#REORDERING DATA FOR BETTER VISUALISATION + ANALYSIS
#===================================================
df_nature_sub <- df_nature[df_nature$Protein < 1000,] #Subset with protein values below 100
x1 <- data.frame(cond = "All Protein's", value = df_nature_sub$Protein)
x2 <- data.frame(cond = "All mRNA", value = df_nature_sub$mRNA)
x3 <- data.frame(cond = "CD Protein's", value = df_nature_cdmarkers$Protein)
x4 <- data.frame(cond = "CD mRNA", value = df_nature_cdmarkers$mRNA)
x5 <- data.frame(cond = "All Protein's", value = df_nature$Protein)
#VISUALIZATION
library(plyr)
df_all <- rbind.fill(x1,x2,x3,x4)
summary(df_all)
df_cd <- rbind.fill(x3,x4)
#Visualization
library(ggplot2)
#All
ggplot(df_all, aes(x=value, fill=cond)) +
geom_histogram(binwidth=0.05, alpha=.3, position="identity", colour="black") +
scale_fill_brewer(palette="Set1") +
scale_x_log10()
#Only CD markers
ggplot(df_cd, aes(x=value, fill=cond)) +
geom_histogram(binwidth=0.05, alpha=.3, position="identity", colour="black") +
scale_fill_brewer(palette="Set1") +
scale_x_log10()
#FINALE Figure - All with facet
gg_color_hue <- function(n) {
hues = seq(15, 375, length=n+1)
hcl(h=hues, l=65, c=100)[1:n]
}
cols = gg_color_hue(10)
rhg_cols <- c("#F8766D", "#00b0f6", "#F8766D", "#00b0f6")
#Version 2
img_all <- ggplot(df_all, aes(x=value, fill=cond)) +
geom_histogram(binwidth=0.05, alpha=1, position="identity", colour="grey") +
#scale_fill_brewer(palette="Set1") +
scale_fill_manual(values = rhg_cols) +
scale_x_log10(name = "Mean half-life (hours)", breaks = c(1, 10, 100, 1000)) +
scale_y_continuous(name = "Count") +
facet_grid(cond ~ ., scales="free_y") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"),
title = element_text(color = "black", size = 14, face="bold"),
strip.text.y = element_text(size = 7
)) +
guides(fill=FALSE) #no legend
img_all
#Save image
tiff(filename="mRNA-Protein-ver4.tiff",
res = 900,
height = 85,
width = 100,
units = "mm"
)
img_all
dev.off()
#Citation
citation("AnnotationDbi")
#ANALYSIS
mean(x1$value, na.rm = T)
mean(x2$value, na.rm = T)
mean(x3$value, na.rm = T)
mean(x4$value, na.rm = T)
#MONTE CARLO ANALYSIS ON mRNA-Protein data
#Used in http://www.ncbi.nlm.nih.gov/pubmed/25309582
#Mike Barnkob, 2014
#=========================================
#Setup
#=====
setwd("~/Dropbox/Forskning/2014/Projects/Biosurf/R/mRNA-Protein analyse")
#Simulation
#==========
n <- 10000
avrg <- vector("numeric", n)
for (i in 1:n)
{
x <- sample(x5$value, 30, replace=TRUE)
avrg[i] <- mean(x)
}
hist(avrg, main = "10.000 random samples of 30 proteins", xlab = "Averages of half-lives")
abline(v=min(avrg),col="red")
abline(v=max(avrg), col="red")
sum(avrg < 0.18)/n
min(avrg)
max(avrg)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment