Created
November 18, 2014 20:17
-
-
Save lianos/1bba175c1cefafe8843e to your computer and use it in GitHub Desktop.
Functions for the Bioconductor seqc package to generate ExpressionSets from the data provided
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(seqc) | |
##' Creates an ExpressionSet of counts for a given subset of the samples | |
##' provided by the seqc data package. | |
##' | |
##' Currently this only works for \code{feature == 'gene'}. | |
##' | |
##' @param feature The type of features you want to build an ExpressionSet | |
##' out of (gene, junction, taqman | |
##' @param annotation If \code{feature == 'gene'}, then this determines which | |
##' set of gene features you want (refseq, aceview) | |
##' @param platform Subset the data based on the platform it was sequenced on? | |
##' (ILM, ROC, LIF) | |
##' @param center Subset the data based on the center that sequenced the | |
##' libraries? | |
##' | |
##' @return An ExpressionSet with the counts from the samples that satisfy the | |
##' critera set by the function parameters. | |
seqc.eSet <- function(feature=c('gene', 'junction', 'taqman'), | |
annotation=c('refseq', 'aceview'), | |
platform=NULL, | |
center=NULL) { | |
feature <- match.arg(feature) | |
annotation <- match.arg(annotation) | |
if (feature == 'taqman') { | |
out <- .create.SEQC.taqman.eSet() | |
} else if (feature == 'gene') { | |
out <- .create.SEQC.gene.eSet(annotation, platform, center) | |
} else { | |
stop("Not dealing with junctions for now") | |
out <- .create.SEQC.junction.eSet(platform, center) | |
} | |
out | |
} | |
##' Returns a character vector of the available SEQC samples | |
##' | |
##' @param feature Limit samples to be of "gene" or "junction" | |
##' @param annotation Limit samples to a particular annotation source (refseq or | |
##' aceview) | |
##' @param platform Limit samples to be of a particular platform (ILM, LIF, ROC) | |
##' @param center Limit samples to only include those of a particulr center | |
##' | |
##' @return a character vector of the sample names/objects in the package:seqc | |
##' namespace | |
seqc.samples <- function(feature=NULL, annotation=NULL, platform=NULL, | |
center=NULL) { | |
ls.idx <- which(search() == 'package:seqc') | |
fns.all <- ls(ls.idx) | |
fns <- fns.all | |
any.taqman <- c(feature == 'taqman', annotation == 'taqman', | |
platform == 'taqman', center == 'taqman') | |
if (any(any.taqman)) { | |
return('taqman') | |
} | |
if (is.character(feature)) { | |
features <- unique(.parse.feature(fns.all)) | |
feature <- match.arg(feature, features) | |
fns <- fns[.parse.feature(fns) == feature] | |
} | |
if (is.character(annotation)) { | |
annotation <- match.arg(annotation, c('refseq', 'aceview')) | |
## annotation plays no role in the junction files | |
annotation <- c(annotation, 'junction') | |
annotations <- .parse.annotation(fns) | |
fns <- fns[annotations %in% annotation] | |
} | |
if (is.character(platform)) { | |
platforms <- .parse.platform(fns.all) | |
platform <- match.arg(platform, platforms, several.ok=TRUE) | |
fns <- fns[.parse.platform(fns) %in% platform] | |
} | |
if (is.character(center)) { | |
centers <- setdiff(.parse.center(fns.all), 'taqman') | |
center <- match.arg(center, centers, several.ok=TRUE) | |
fns <- fns[.parse.center(fns) %in% center] | |
} | |
fns | |
} | |
## ----------------------------------------------------------------------------- | |
## eSet generation helper functions | |
##' Create eSet for all samples from given platform and site | |
.create.SEQC.gene.eSet <- function(annotation, platform, center) { | |
## Ensure that all rows are the same across every data.frame in x | |
seqc.idx <- which(search() == 'package:seqc') | |
fns <- seqc.samples(feature='gene', annotation=annotation, platform=platform, | |
center=center) | |
if (length(fns) == 0) { | |
stop("No samples fit the selection criteria to build a gene eSet") | |
} | |
## Check that the meta data for each dataset is the same | |
meta.cols <- c('EntrezID', 'Symbol', 'GeneLength', 'IsERCC') | |
meta <- lapply(fns, function(xx) get(xx, seqc.idx)[, meta.cols]) | |
kosher <- sapply(meta, function(m) all.equal(m, meta[[1]])) | |
if (!all(kosher)) { | |
stop("Features across each data gene-level data file are not the same") | |
} | |
## Build the counts matrix | |
platform <- c() | |
counts <- do.call(cbind, lapply(fns, function(xx) { | |
cnts <- get(xx, seqc.idx) | |
out <- as.matrix(cnts[, setdiff(names(cnts), meta.cols)]) | |
colnames(out) <- paste(colnames(out), .parse.center(xx), sep='_') | |
platform <<- c(platform, rep(.parse.platform(xx), ncol(out))) | |
out | |
})) | |
## Parse the names of the columns in this matrix to something useful | |
s.info <- strsplit(colnames(counts), '_') | |
pd <- data.frame( | |
platform=factor(platform), | |
sample=factor(sapply(s.info, '[[', 1L)), | |
replicate=factor(sapply(s.info, '[[', 2L)), | |
lane=factor(sapply(s.info, '[[', 3L)), ## or region for 454 | |
flowcell=factor(sapply(s.info, function(x) { | |
if (length(x) == 5) x[4L] else '454' | |
})), | |
center=factor(sapply(s.info, tail, 1L))) | |
out <- ExpressionSet(counts) | |
pData(out) <- pd | |
fData(out) <- meta[[1]] | |
out | |
} | |
.create.SEQC.junction.eSet <- function(x) { | |
stop("Junction stuff not yet implemented") | |
} | |
.create.SEQC.taqman.eSet <- function(x) { | |
stop("Taqman generation not yet implemented") | |
} | |
## ----------------------------------------------------------------------------- | |
## Utility functions to parse information out of the objects loaded up by the | |
## seqc package | |
.parse.platform <- function(x) { | |
sapply(strsplit(x, '_'), '[[', 1L) | |
} | |
## either aceview, refseq, junction, or taqman | |
.parse.annotation <- function(x) { | |
x <- sapply(strsplit(x, '_'), '[', 2L) | |
ifelse(is.na(x), 'taqman', x) | |
} | |
## either gene or junction | |
.parse.feature <- function(x) { | |
out <- .parse.annotation(x) | |
ifelse(out == 'taqman', 'gene', ifelse(out == 'junction', out, 'gene')) | |
} | |
.parse.center <- function(x) { | |
pieces <- strsplit(x, '_') | |
anno <- .parse.annotation(x) | |
ifelse(anno == 'junction', sapply(pieces, '[', 3), sapply(pieces, tail, 1)) | |
} | |
.parse.sample <- function(x) { | |
stop("This is only for junction stuff, which we ignore for now") | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment