Skip to content

Instantly share code, notes, and snippets.

@sgibb
Created January 24, 2019 13:30
Show Gist options
  • Save sgibb/54c1477d26fa8702231c447badd11776 to your computer and use it in GitHub Desktop.
Save sgibb/54c1477d26fa8702231c447badd11776 to your computer and use it in GitHub Desktop.
MSnbase3 - multiple MSnExp backends
library("mzR")
library("msdata")
library("rhdf5")
library("digest")
.sha1 <- function(x)vapply(x, digest::sha1, NA_character_, USE.NAMES=FALSE)
setClass(
"Backend",
contains="VIRTUAL"
)
setMethod(
"show",
signature="Backend",
definition=function(object) {
cat("Backend:", gsub("Backend", "", class(object)[1L]), "\n")
}
)
setGeneric(
"backendSetup",
def=function(object, files) standardGeneric("backendSetup"),
valueClass="Backend"
)
setMethod(
"backendSetup",
signature="Backend",
definition=function(object, files) {
object
}
)
setGeneric(
"backendPrefill",
def=function(object, files) standardGeneric("backendPrefill"),
valueClass="Backend"
)
setMethod(
"backendPrefill",
signature="Backend",
definition=function(object, files) {
object
}
)
setGeneric(
"backendReadSpectra",
def=function(object, files, ids) standardGeneric("backendReadSpectra"),
valueClass="list"
)
setGeneric(
"backendWriteSpectra",
def=function(object, files, ids, spectra) standardGeneric("backendWriteSpectra"),
valueClass="Backend"
)
setClass(
"BackendMemory",
contains="Backend",
slots=c(
assay="list" # or environment
)
)
BackendMemory <- function() { new("BackendMemory") }
setMethod(
"backendSetup",
signature="BackendMemory",
definition=function(object, files) {
mzMl <- openMSfile(files)
on.exit(close(mzMl))
object@assay <- vector(mode="list", length=length(mzMl))
names(object@assay) <- paste(.sha1(files), seq_along(mzMl), sep="/")
object
})
setMethod(
"backendPrefill",
signature="BackendMemory",
definition=function(object, files) {
object <- backendSetup(object, files)
mzMl <- openMSfile(files)
on.exit(close(mzMl))
pks <- peaks(mzMl)
backendWriteSpectra(object, files, seq_along(pks), pks)
})
setMethod(
"backendReadSpectra",
signature="BackendMemory",
definition=function(object, files, ids) {
nms <- paste(.sha1(files), ids, sep="/")
object@assay[nms]
})
setMethod(
"backendWriteSpectra",
signature="BackendMemory",
definition=function(object, files, ids, spectra) {
nms <- paste(.sha1(files), ids, sep="/")
object@assay[nms] <- spectra
object
})
setClass(
"BackendMzMl",
contains="Backend"
)
BackendMzMl <- function() { new("BackendMzMl") }
setMethod(
"backendReadSpectra",
signature="BackendMzMl",
definition=function(object, files, ids) {
mzMl <- openMSfile(files)
on.exit(close(mzMl))
spectra <- peaks(mzMl, scan=ids)
names(spectra) <- paste(.sha1(files), ids, sep="/")
spectra
})
setMethod(
"backendWriteSpectra",
signature="BackendMzMl",
definition=function(object, files, ids, spectra) {
stop("Not implemented yet")
})
setClass(
"BackendHdf5",
contains="Backend",
slots=c(
path="character",
h5files="character"
)
)
BackendHdf5 <- function(path=".") { new("BackendHdf5", path=path) }
setMethod(
"backendSetup",
signature="BackendHdf5",
definition=function(object, files) {
object@h5files <- file.path(object@path, paste0(.sha1(files), ".h5"))
names(object@h5files) <- files
h5 <- H5Fcreate(object@h5files[files])
h5createGroup(h5, .sha1(files))
on.exit(H5Fclose(h5))
object
})
setMethod(
"backendPrefill",
signature="BackendHdf5",
definition=function(object, files) {
object <- backendSetup(object, files)
mzMl <- openMSfile(files)
on.exit(close(mzMl))
pks <- peaks(mzMl)
backendWriteSpectra(object, files, seq_along(pks), pks)
})
setMethod(
"backendReadSpectra",
signature="BackendHdf5",
definition=function(object, files, ids) {
nms <- paste(.sha1(files), ids, sep="/")
h5 <- H5Fopen(object@h5files[files])
on.exit(H5Fclose(h5))
spectra <- lapply(nms, h5read, file=h5)
names(spectra) <- nms
spectra
})
setMethod(
"backendWriteSpectra",
signature="BackendHdf5",
definition=function(object, files, ids, spectra) {
group <- .sha1(files)
h5 <- H5Fopen(object@h5files[files])
h5g <- H5Gopen(h5, group)
on.exit(H5Gclose(h5g))
on.exit(H5Fclose(h5), add=TRUE)
for (i in seq_along(spectra)) {
h5write(spectra[[i]], h5g, as.character(i))
}
object
})
setClass(
"MSnExperiment",
slots=c(
backend="Backend",
featureData="data.frame",
files="character"
)
)
setMethod(
"show",
signature="MSnExperiment",
definition=function(object) {
cat("MSnExperiment from", basename(object@files), "\n")
show(object@backend)
}
)
setGeneric(
"spectra",
def=function(object, files, ids) standardGeneric("spectra"),
valueClass="list"
)
setMethod(
"spectra",
signature="MSnExperiment",
definition=function(object, files, ids) {
if (missing(files)) {
files <- object@files
}
if (missing(ids)) {
ids <- object@featureData$spectrumIndex
}
backendReadSpectra(object@backend, files=files, ids=ids)
}
)
setGeneric(
"spectra<-",
def=function(object, files, ids, value) standardGeneric("spectra<-"),
valueClass="MSnExperiment"
)
setReplaceMethod(
"spectra",
signature="MSnExperiment",
definition=function(object, files, ids, value) {
if (missing(files)) {
files <- object@files
}
if (missing(ids)) {
ids <- object@featureData$spectrumIndex
}
object@backend <- backendWriteSpectra(
object@backend,
files=files,
ids=ids,
spectra=value
)
object
}
)
setGeneric(
"switchBackend",
def=function(object, to) standardGeneric("switchBackend"),
valueClass="MSnExperiment"
)
setMethod(
"switchBackend",
signature="MSnExperiment",
definition=function(object, to=BackendMemory()) {
to <- backendSetup(to, object@files)
files <- object@files
ids <- object@featureData$spectrumIndex
from <- object@backend
object@backend <- backendWriteSpectra(
to,
files=files,
ids=spectrumIndex,
spectra=backendReadSpectra(from, files=files, ids=ids)
)
object
}
)
readMSData <- function(files, backend=BackendMemory()) {
fh <- openMSfile(files)
fd <- header(fh)
fd$spectrumIndex <- seq_len(nrow(fd))
close(fh)
new("MSnExperiment",
backend=backendPrefill(backend, files),
files=files,
featureData=fd
)
}
ms1 <- readMSData(msdata::proteomics("01.mzML.gz", full.names=TRUE))
ms2 <- readMSData(msdata::proteomics("01.mzML.gz", full.names=TRUE), backend=BackendMzMl())
ms3 <- readMSData(msdata::proteomics("01.mzML.gz", full.names=TRUE), backend=BackendHdf5(path=".."))
all.equal(spectra(ms1, ids=1:3), spectra(ms2, ids=1:3))
# [1] TRUE
all.equal(spectra(ms2, ids=1:3), spectra(ms3, ids=1:3))
# [1] TRUE
spectra(ms1, ids=1:2) <- list(cbind(1:3, 4:6), cbind(11:13, 14:16))
spectra(ms1, ids=3:509) <- NULL
ms1@featureData <- ms1@featureData[1:2,]
spectra(ms1)
# $`7ca27022eb61d649783dda21812088293d35f3a6/1`
# [,1] [,2]
# [1,] 1 4
# [2,] 2 5
# [3,] 3 6
#
# $`7ca27022eb61d649783dda21812088293d35f3a6/2`
# [,1] [,2]
# [1,] 11 14
# [2,] 12 15
# [3,] 13 16
#
ms4 <- switchBackend(ms1, BackendHdf5(path="."))
all.equal(spectra(ms1), spectra(ms4))
# [1] TRUE
@jorainer
Copy link

Nice indeed. Some suggestions though:

  1. I would implement methods
  • backendInitialize (reads data from input and uses
  • backendUpdate: writes changes to files (hdf5 backend), updates the SpectrumList (in-mem backend), does nothing (mzR backend).
  • backendSpectrapply: same functionality than spectrapply,OnDiskMSnExp (or better spectrapply,Hdf5MSnExp): gets all data
  1. Keep the Spectrum objects as they are (eventually just remove the extensions of the Versioned class - IMHO that's just some legacy class that not even Bioconductor developers use anymore. Also, I would lighten the validObject method, eventually implemeting a downstream function validate<class name> that has an additional argument to switch between a thourough check and a fast check (the latter used for validObject.
  2. I'm not quite happy with the spectra method returning just a list of matrices. For the OnDiskMSnExp-alike mzR backend this would be too slow, because I have to first load the data, create the spectra, apply the lazy evaluation queue and would then have to extract again the data as a matrix from the spectra... not ideal I think. I would by default return a SpectrumList.
  3. I would not write data manipulations of the hdf5 backend directly to the files. Writing hdf5 files is quite time consuming. I would also use the lazy evaluation queue as the OnDiskMSnExp has and write things to files is backupUpdate is called.

Why a backendSpectrapply? For large experiments I don't want to read all data into memory, so I need to have a function that reads spectra per file and applies a function (such as extract chromatogram, base peak intensity etc). I found this function really convenient for OnDiskMSnExp - we could use that as prototype.

Also, why not create a new branch on MSnbase? I would be more than happy if I could contribute here (especially add some of the hdf5 and the mzR backend functionality).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment