sgibb/findMSeEMRTs.R

## findMSeEMRTs.R
library("synapter")
library("synapterdata")

## map some private functions to global NAMESPACE
doHDMSePredictions <- synapter:::doHDMSePredictions
error.ppm <- synapter:::error.ppm
findMSeEMRTs <- synapter:::findMSeEMRTs

###########################################################################
## some boring stuff to prepare the Synapter object
###########################################################################

hdmsefile <- getHDMSeFinalPeptide()[2]
msefile <- getMSeFinalPeptide()[2]
msepep3dfile <- getMSePep3D()[2]
fas <- getFasta()

input <- list(identpeptide = hdmsefile,
              quantpeptide = msefile,
              quantpep3d = msepep3dfile,
              fasta = fas)

s <- Synapter(input)

## apply default filtering
filterUniqueDbPeptides(s, verbose=TRUE)

fdr <- fpr <- 0.05
ppm <- 20

filterIdentPepScore(s, method="BH", fdr=fdr)
filterQuantPepScore(s, method="BH", fdr=fdr)
filterIdentProtFpr(s, fpr=fpr)
filterQuantProtFpr(s, fpr=fpr)
filterIdentPpmError(s, ppm=ppm)
filterQuantPpmError(s, ppm=ppm)

mergePeptides(s)

modelRt(s, span=0.05)

## prepare some values
grid.ppm.from <- 5
grid.ppm.to <- 20
grid.ppm.by <- 2
grid.nsd.from <- 0.5
grid.nsd.to <- 5.0
grid.nsd.by <- 0.5
grid.subset <- 1
ppms <- seq(grid.ppm.from, grid.ppm.to, by=grid.ppm.by)
nsds <- seq(grid.nsd.from, grid.nsd.to, by=grid.nsd.by)

###########################################################################
## END OF preprocessing
###########################################################################

findMSeEMRTs2 <- function(identpep, pep3d, mergedpep, nsd,
                         ppmthreshold,
                         model,
                         mergedEMRTs) {
  hdmseData <- doHDMSePredictions(identpep, model, nsd)
  ## sanity checking - v 0.4.6
  stopifnot(all(hdmseData$lower <= hdmseData$upper))
  stopifnot(length(hdmseData$lower) == length(hdmseData$upper))
  n <- length(hdmseData$lower)

  ###########################################################################
  ## new part
  ###########################################################################

  sortedPep3d <- pep3d
  ## add additional index row to avoid ugly calculation for subindices in the
  ## apply call
  sortedPep3d$idx <- 1:nrow(pep3d)
  sortedPep3d <- sortedPep3d[order(pep3d$rt_min),]
  lowerIdx <- findInterval(hdmseData$lower, sortedPep3d$rt_min)+1
  upperIdx <- findInterval(hdmseData$upper, sortedPep3d$rt_min)

  ## just to be sure
  lowerIdx <- pmin(lowerIdx, upperIdx)

  ## create matrix with boundaries (col 1 and 2) and the rownumbers
  idxs <- cbind(lower=lowerIdx, upper=upperIdx, r=1:n)

  res <- apply(idxs, 1, function(i) {
    selPpm <- which((abs(error.ppm(obs = sortedPep3d$mwHPlus[i[1]:i[2]] ,
                                   theo = hdmseData$mass[i[3]])) < ppmthreshold))
    sortedPep3d$idx[i[1]:i[2]][selPpm]
  })
  k <- sapply(res, length)

  ## Those that match *1* spectumIDs will be transferred
  ## BUT there is no guarantee that with *1* unique match,
  ##     we get the correct one, even for those that were
  ##     part of the matched high confidence ident+quant
  ##     identified subset!

  ## #############################################################
  ## n <- length(k) ## already have n
  m <- ncol(pep3d)
  ## to initialise the new pep3d2 with with n rows
  ## and same nb of columns than pep3d
  pep3d2 <- matrix(NA, nrow=n, ncol=m, dimnames=list(c(), colnames(pep3d)))
  pep3d2[, 1] <- k
  k1 <- which(k == 1)
  pep3d2[k1, ] <- unlist(pep3d[unlist(res[k1]), ])

  ###########################################################################
  ## END OF new part
  ###########################################################################

  ## #############################################################

  ans <- cbind(identpep, pep3d2)

  ans$matched.quant.spectrumIDs <- sapply(res, paste0, collapse = ",")

  ans$precursor.leID.quant <- NA
  idx <- match(mergedpep$precursor.leID.ident, ans$precursor.leID)

  ans$precursor.leID.quant[idx] <- mergedpep$precursor.leID.quant
  i <- grep("precursor.leID$", names(ans))
  names(ans)[i] <- "precursor.leID.ident" ## to avoid any confusion

  ans$idSource <- "transfer"


  if (mergedEMRTs == "rescue") {
    ## these are those that were in the merged data set but that
    ## did NOT get transferred because they did NOT uniquely matched
    ## a pep3D EMRT
    lost <- ans$Function != 1 & ans$precursor.leID.ident %in% mergedpep$precursor.leID.ident
    ## rescue these by adding their quant straight from QuantPeptideData
    lostids <- ans$precursor.leID.ident[lost]
    ans[lost, "Counts"] <-
      mergedpep[match(lostids, mergedpep$precursor.leID.ident), "precursor.inten.quant"]
    ans[lost, "idSource"] <- "rescue"
  } else if (mergedEMRTs == "copy") {
    allmerged <- ans$precursor.leID.ident %in% mergedpep$precursor.leID.ident
    allmergedids <- ans$precursor.leID.ident[allmerged]
    ans[allmerged, "Counts"] <-
      mergedpep[match(allmergedids, mergedpep$precursor.leID.ident), "precursor.inten.quant"]
    ans[allmerged, "idSource"] <- "copy"
  } ## else if (fromQuant == "transfer") keep as is

  ## since v 0.5.0 - removing multiply matched EMRTs
  ## dupIDs <- ans$spectrumID[ans$Function == 1 & duplicated(ans$spectrumID)]
  ## dupRows <- ans$spectrumID %in% dupIDs
  ## ans[dupRows, (ncol(identpep)+1) : ncol(ans)] <- -1

  return(ans)
}

system.time(
  fm1 <- findMSeEMRTs(s$IdentPeptideData, s$QuantPep3DData,
                      s$MergedFeatures[TRUE, ], ppms[1], nsds[1],
                      s$RtModel, mergedEMRTs="transfer"))
#   user  system elapsed
# 31.018   0.464  31.876
system.time(
  fm2 <- findMSeEMRTs2(s$IdentPeptideData, s$QuantPep3DData,
                      s$MergedFeatures[TRUE, ], ppms[1], nsds[1],
                      s$RtModel, mergedEMRTs="transfer"))
#   user  system elapsed
#  0.440   0.008   0.456

all.equal(fm1, fm2)
# TRUE

system.time(
  fm3 <- findMSeEMRTs(s$IdentPeptideData, s$QuantPep3DData,
                      s$MergedFeatures[TRUE, ], ppms[8], nsds[10],
                      s$RtModel, mergedEMRTs="transfer"))
#   user  system elapsed
# 38.674   0.124  38.914
system.time(
  fm4 <- findMSeEMRTs2(s$IdentPeptideData, s$QuantPep3DData,
                      s$MergedFeatures[TRUE, ], ppms[8], nsds[10],
                      s$RtModel, mergedEMRTs="transfer"))
#   user  system elapsed
#  0.900   0.012   0.925

all.equal(fm3, fm4)
# TRUE
	library("synapter")
	library("synapterdata")

	## map some private functions to global NAMESPACE
	doHDMSePredictions <- synapter:::doHDMSePredictions
	error.ppm <- synapter:::error.ppm
	findMSeEMRTs <- synapter:::findMSeEMRTs

	###########################################################################
	## some boring stuff to prepare the Synapter object
	###########################################################################

	hdmsefile <- getHDMSeFinalPeptide()[2]
	msefile <- getMSeFinalPeptide()[2]
	msepep3dfile <- getMSePep3D()[2]
	fas <- getFasta()

	input <- list(identpeptide = hdmsefile,
	quantpeptide = msefile,
	quantpep3d = msepep3dfile,
	fasta = fas)

	s <- Synapter(input)

	## apply default filtering
	filterUniqueDbPeptides(s, verbose=TRUE)

	fdr <- fpr <- 0.05
	ppm <- 20

	filterIdentPepScore(s, method="BH", fdr=fdr)
	filterQuantPepScore(s, method="BH", fdr=fdr)
	filterIdentProtFpr(s, fpr=fpr)
	filterQuantProtFpr(s, fpr=fpr)
	filterIdentPpmError(s, ppm=ppm)
	filterQuantPpmError(s, ppm=ppm)

	mergePeptides(s)

	modelRt(s, span=0.05)

	## prepare some values
	grid.ppm.from <- 5
	grid.ppm.to <- 20
	grid.ppm.by <- 2
	grid.nsd.from <- 0.5
	grid.nsd.to <- 5.0
	grid.nsd.by <- 0.5
	grid.subset <- 1
	ppms <- seq(grid.ppm.from, grid.ppm.to, by=grid.ppm.by)
	nsds <- seq(grid.nsd.from, grid.nsd.to, by=grid.nsd.by)

	###########################################################################
	## END OF preprocessing
	###########################################################################

	findMSeEMRTs2 <- function(identpep, pep3d, mergedpep, nsd,
	ppmthreshold,
	model,
	mergedEMRTs) {
	hdmseData <- doHDMSePredictions(identpep, model, nsd)
	## sanity checking - v 0.4.6
	stopifnot(all(hdmseData$lower <= hdmseData$upper))
	stopifnot(length(hdmseData$lower) == length(hdmseData$upper))
	n <- length(hdmseData$lower)

	###########################################################################
	## new part
	###########################################################################

	sortedPep3d <- pep3d
	## add additional index row to avoid ugly calculation for subindices in the
	## apply call
	sortedPep3d$idx <- 1:nrow(pep3d)
	sortedPep3d <- sortedPep3d[order(pep3d$rt_min),]
	lowerIdx <- findInterval(hdmseData$lower, sortedPep3d$rt_min)+1
	upperIdx <- findInterval(hdmseData$upper, sortedPep3d$rt_min)

	## just to be sure
	lowerIdx <- pmin(lowerIdx, upperIdx)

	## create matrix with boundaries (col 1 and 2) and the rownumbers
	idxs <- cbind(lower=lowerIdx, upper=upperIdx, r=1:n)

	res <- apply(idxs, 1, function(i) {
	selPpm <- which((abs(error.ppm(obs = sortedPep3d$mwHPlus[i[1]:i[2]] ,
	theo = hdmseData$mass[i[3]])) < ppmthreshold))
	sortedPep3d$idx[i[1]:i[2]][selPpm]
	})
	k <- sapply(res, length)

	## Those that match 1 spectumIDs will be transferred
	## BUT there is no guarantee that with 1 unique match,
	## we get the correct one, even for those that were
	## part of the matched high confidence ident+quant
	## identified subset!

	## #############################################################
	## n <- length(k) ## already have n
	m <- ncol(pep3d)
	## to initialise the new pep3d2 with with n rows
	## and same nb of columns than pep3d
	pep3d2 <- matrix(NA, nrow=n, ncol=m, dimnames=list(c(), colnames(pep3d)))
	pep3d2[, 1] <- k
	k1 <- which(k == 1)
	pep3d2[k1, ] <- unlist(pep3d[unlist(res[k1]), ])

	###########################################################################
	## END OF new part
	###########################################################################

	## #############################################################

	ans <- cbind(identpep, pep3d2)

	ans$matched.quant.spectrumIDs <- sapply(res, paste0, collapse = ",")

	ans$precursor.leID.quant <- NA
	idx <- match(mergedpep$precursor.leID.ident, ans$precursor.leID)

	ans$precursor.leID.quant[idx] <- mergedpep$precursor.leID.quant
	i <- grep("precursor.leID$", names(ans))
	names(ans)[i] <- "precursor.leID.ident" ## to avoid any confusion

	ans$idSource <- "transfer"


	if (mergedEMRTs == "rescue") {
	## these are those that were in the merged data set but that
	## did NOT get transferred because they did NOT uniquely matched
	## a pep3D EMRT
	lost <- ans$Function != 1 & ans$precursor.leID.ident %in% mergedpep$precursor.leID.ident
	## rescue these by adding their quant straight from QuantPeptideData
	lostids <- ans$precursor.leID.ident[lost]
	ans[lost, "Counts"] <-
	mergedpep[match(lostids, mergedpep$precursor.leID.ident), "precursor.inten.quant"]
	ans[lost, "idSource"] <- "rescue"
	} else if (mergedEMRTs == "copy") {
	allmerged <- ans$precursor.leID.ident %in% mergedpep$precursor.leID.ident
	allmergedids <- ans$precursor.leID.ident[allmerged]
	ans[allmerged, "Counts"] <-
	mergedpep[match(allmergedids, mergedpep$precursor.leID.ident), "precursor.inten.quant"]
	ans[allmerged, "idSource"] <- "copy"
	} ## else if (fromQuant == "transfer") keep as is

	## since v 0.5.0 - removing multiply matched EMRTs
	## dupIDs <- ans$spectrumID[ans$Function == 1 & duplicated(ans$spectrumID)]
	## dupRows <- ans$spectrumID %in% dupIDs
	## ans[dupRows, (ncol(identpep)+1) : ncol(ans)] <- -1

	return(ans)
	}

	system.time(
	fm1 <- findMSeEMRTs(s$IdentPeptideData, s$QuantPep3DData,
	s$MergedFeatures[TRUE, ], ppms[1], nsds[1],
	s$RtModel, mergedEMRTs="transfer"))
	# user system elapsed
	# 31.018 0.464 31.876
	system.time(
	fm2 <- findMSeEMRTs2(s$IdentPeptideData, s$QuantPep3DData,
	s$MergedFeatures[TRUE, ], ppms[1], nsds[1],
	s$RtModel, mergedEMRTs="transfer"))
	# user system elapsed
	# 0.440 0.008 0.456

	all.equal(fm1, fm2)
	# TRUE

	system.time(
	fm3 <- findMSeEMRTs(s$IdentPeptideData, s$QuantPep3DData,
	s$MergedFeatures[TRUE, ], ppms[8], nsds[10],
	s$RtModel, mergedEMRTs="transfer"))
	# user system elapsed
	# 38.674 0.124 38.914
	system.time(
	fm4 <- findMSeEMRTs2(s$IdentPeptideData, s$QuantPep3DData,
	s$MergedFeatures[TRUE, ], ppms[8], nsds[10],
	s$RtModel, mergedEMRTs="transfer"))
	# user system elapsed
	# 0.900 0.012 0.925

	all.equal(fm3, fm4)
	# TRUE