Last active
September 22, 2017 10:22
-
-
Save ipurusho/53eab4d68d9744a3fbdc to your computer and use it in GitHub Desktop.
Impute missing values and run a t-test on proteomic data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Impute missing intensities and run a per-row two-sample t-test
#'
#' Rows (e.g. proteins) are split by how many values are observed:
#' rows with at least half of their values present are imputed with
#' `impute::impute.knn()` (default parameters); rows with more than half of
#' their values missing are imputed by drawing from a normal distribution
#' centred on the global minimum observed intensity. Because the latter is
#' random, the t-test is repeated `n_iter` times and the p-values averaged.
#'
#' @param intensities Numeric matrix or data frame, one row per feature and
#'   one column per sample. Missing values may be `NA` or `NaN`.
#' @param meta_data Vector of condition labels, one per column of
#'   `intensities`, in column order.
#' @param condA,condB Condition labels to compare. Each is wrapped in
#'   `^...$` and matched against `meta_data` as a regular expression.
#' @param n_iter Number of random min-value imputations to average over.
#' @param seed Optional seed used before the random imputations. When `NULL`
#'   (the default) a time-derived seed is used. The seed is printed.
#' @return A one-column numeric matrix of p-values; kNN-imputed rows come
#'   first, followed by min-imputed rows. Row names are taken from
#'   `intensities` when it has them.
#' @details Side effects: the global RNG state is reset via `set.seed()` and
#'   the seed is printed.
analyze.proteomic.data <- function(intensities, meta_data, condA, condB,
                                   n_iter = 10, seed = NULL) {
  # Work on a numeric matrix so row/column subsetting behaves uniformly.
  mat <- as.matrix(intensities)
  if (!is.numeric(mat)) {
    storage.mode(mat) <- "numeric"
  }
  stopifnot(length(meta_data) == ncol(mat), n_iter >= 1)
  colnames(mat) <- meta_data

  # Parse conditions to locate the replicate columns of each group.
  cond_a_idx <- grep(paste0("^", condA, "$"), colnames(mat))
  cond_b_idx <- grep(paste0("^", condB, "$"), colnames(mat))
  if (length(cond_a_idx) == 0 || length(cond_b_idx) == 0) {
    stop("condA and condB must each match at least one entry of meta_data",
         call. = FALSE)
  }

  # Welch t-test p-value for every row of an (already imputed) matrix.
  row_p_values <- function(m) {
    p <- vapply(seq_len(nrow(m)), function(i) {
      stats::t.test(x = m[i, cond_a_idx], y = m[i, cond_b_idx])$p.value
    }, numeric(1))
    names(p) <- rownames(m)
    p
  }

  # is.na() is TRUE for both NA and NaN, unlike a comparison against "NaN".
  is_missing <- is.na(mat)
  n_observed <- rowSums(!is_missing)

  # Rows with exactly half of their values observed go to the kNN group so
  # that no row is silently dropped from the result.
  knn_rows <- n_observed >= ncol(mat) / 2
  min_rows <- !knn_rows

  # --- k-nearest-neighbour imputation (default parameters) ---
  knn_p <- numeric(0)
  if (any(knn_rows)) {
    knn_imputed <- impute::impute.knn(mat[knn_rows, , drop = FALSE])$data
    knn_p <- row_p_values(knn_imputed)
  }

  # Reseed after the kNN step (the original notes that knn overwrites the
  # seed) so the random imputations below are not tied to it.
  if (is.null(seed)) {
    now <- as.numeric(Sys.time())
    seed <- (now - floor(now)) * 1e8
  }
  set.seed(seed)
  print(seed)

  # --- minimum-value imputation, averaged over n_iter random draws ---
  min_p <- numeric(0)
  if (any(min_rows)) {
    min_val <- min(mat, na.rm = TRUE)
    if (!is.finite(min_val) || min_val < 0) {
      stop("minimum observed intensity must be finite and non-negative ",
           "to derive the imputation standard deviation",
           call. = FALSE)
    }
    min_mat <- mat[min_rows, , drop = FALSE]
    min_missing <- is.na(min_mat)
    n_missing <- sum(min_missing)

    # One column per iteration, one row per feature, so that rowMeans()
    # averages the repeated p-values of the SAME feature.
    p_by_iter <- matrix(NA_real_, nrow = nrow(min_mat), ncol = n_iter)
    for (i in seq_len(n_iter)) {
      filled <- min_mat
      filled[min_missing] <- rnorm(n_missing, mean = min_val,
                                   sd = sqrt(min_val))
      p_by_iter[, i] <- row_p_values(filled)
    }
    min_p <- rowMeans(p_by_iter)
    names(min_p) <- rownames(min_mat)
  }

  # One-column matrix: kNN rows first, then min-imputed rows.
  as.matrix(c(knn_p, min_p))
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment