Skip to content

Instantly share code, notes, and snippets.

@b-klaus
Created October 2, 2017 17:12
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save b-klaus/f2a502109f852702bf7efc5ad5c0bd11 to your computer and use it in GitHub Desktop.
Save b-klaus/f2a502109f852702bf7efc5ad5c0bd11 to your computer and use it in GitHub Desktop.
fixed getAE function for the Bioc ArrayExpress that allows the usage of https
# We have to change to https in `getAE`
# to https as EBI services have switched the protocol on Oct 1st
# see: https://www.ebi.ac.uk/services
# The package XML does not support https though, so we need RCurl::getURL:
# https://stackoverflow.com/questions/23584514/error-xml-content-does-not-seem-to-be-xml-r-3-1-0
https_getAE <- function (accession, path = getwd(), type = "full", extract = TRUE,
local = FALSE, sourcedir = path)
{
if (!local) {
baseURL = "https://www.ebi.ac.uk/arrayexpress/xml/v2/files"
xmlURL = getURL(paste(baseURL, accession, sep = "/"))
xml = xmlTreeParse(xmlURL, useInternalNodes = TRUE, isURL=FALSE)
sdrfURL = xpathSApply(xml, "/files/experiment/file[kind='sdrf' and extension='txt']/url",
xmlValue)
sdrfFile = xpathSApply(xml, "/files/experiment/file[kind='sdrf' and extension='txt']/name",
xmlValue)
idfURL = xpathSApply(xml, "/files/experiment/file[kind='idf' and extension='txt']/url",
xmlValue)
idfFile = xpathSApply(xml, "/files/experiment/file[kind='idf' and extension='txt']/name",
xmlValue)
adfURL = xpathApply(xml, "/files/experiment/file[kind='adf' and extension='txt']/url",
xmlValue)
adfFiles = xpathApply(xml, "/files/experiment/file[kind='adf' and extension='txt']/name",
xmlValue)
rawArchiveURL = xpathApply(xml, "/files/experiment/file[kind='raw' and extension='zip']/url",
xmlValue)
procArchiveURL = xpathApply(xml, "/files/experiment/file[kind='processed' and extension='zip']/url",
xmlValue)
}
else {
allfiles = list.files(sourcedir)
sdrfFile = allfiles[grep(paste(accession, ".sdrf.txt$",
sep = ""), allfiles)]
if (length(sdrfFile) == 0)
stop("SDRF file not found in directory ", sourcedir)
sdrfURL = paste("file:/", sourcedir, sdrfFile, sep = "/")
idfFile = allfiles[grep(paste(accession, ".idf.txt$",
sep = ""), allfiles)]
if (length(idfFile) == 0)
warning("IDF file not found in directory ", sourcedir)
idfURL = paste("file:/", sourcedir, idfFile, sep = "/")
ph = try(read.AnnotatedDataFrame(sdrfFile, path = sourcedir,
row.names = NULL, blank.lines.skip = TRUE, fill = TRUE,
varMetadata.char = "$"))
if (inherits(ph, "try-error")) {
warning("Unable to retrieve ADF reference from SDRF. Reading any ADF in directory.")
adfFiles = allfiles[grep(".adf.txt$", allfiles)]
}
else {
adr = unique(pData(ph)[, getSDRFcolumn("ArrayDesignREF",
varLabels(ph))])
adfFiles = paste(adr, ".adf.txt", sep = "")
}
if (all(file.exists(file.path(sourcedir, adfFiles)))) {
adfURL = paste("file:/", sourcedir, adfFiles, sep = "/")
downloadADF = FALSE
}
else {
filesURL = "https://www.ebi.ac.uk/arrayexpress/files"
adfURL = paste(filesURL, adr, adfFiles, sep = "/")
downloadADF = TRUE
}
rawArchiveURL = NULL
procArchiveURL = NULL
rawArchive = allfiles[grep(paste(accession, ".raw.[0-9]{1,}.zip",
sep = ""), allfiles)]
if (length(rawArchive) != 0)
rawArchiveURL = paste("file:/", sourcedir, rawArchive,
sep = "/")
else warning("No raw files found in directory ", sourcedir)
processedArchive = allfiles[grep(paste(accession, ".processed.[0-9]{1,}.zip",
sep = ""), allfiles)]
if (length(processedArchive) != 0)
procArchiveURL = paste("file:/", sourcedir, processedArchive,
sep = "/")
else warning("No processed data files found in directory ",
sourcedir)
}
if (length(sdrfURL) > 1) {
warning("Found two SDRF files: \n", paste(sdrfURL, "\n"))
hybSDRF = grep("hyb.sdrf", sdrfURL)
if (length(hybSDRF) > 0) {
message("Choosing ", sdrfURL[hybSDRF])
sdrfURL = sdrfURL[hybSDRF]
sdrfFile = sdrfFile[hybSDRF]
}
else {
warning("Unable to choose SDRF file. Please report experiment to miamexpress@ebi.ac.uk")
}
}
if (!local || path != sourcedir || downloadADF) {
adfFiles <- lapply(adfURL, function(url) {
filedest = paste(path, basename(url), sep = "/")
dnld = try(download.file(url, filedest, mode = "wb"))
if (inherits(dnld, "try-error") || file.info(filedest)$size ==
0) {
warning(paste(url, " does not exist or is empty. \n"),
sep = "")
adffile = NULL
}
else {
adffile = basename(filedest)
}
return(adffile)
})
if (!is.null(adfFiles))
adfFiles = unlist(adfFiles)
}
if (!local || path != sourcedir) {
sdrfFileDest = paste(path, sdrfFile, sep = "/")
dnld = try(download.file(sdrfURL, sdrfFileDest, mode = "wb"))
if (inherits(dnld, "try-error") || file.info(sdrfFileDest)$size ==
0) {
warning(paste(sdrfFile, " does not exist or is empty. The object will not have featureData or phenoData. \n"),
sep = "")
sdrfFile = NULL
adffile = NULL
}
idfFileDest = paste(path, idfFile, sep = "/")
dnld = try(download.file(idfURL, idfFileDest, mode = "wb"))
if (inherits(dnld, "try-error") || file.info(idfFileDest)$size ==
0) {
warning(paste(idfFile, " does not exist or is empty. \n"),
sep = "")
idfFile = NULL
}
rawArchive = NULL
processedArchive = NULL
if (type != "mageFilesOnly" && !is.null(rawArchiveURL) &&
(type == "full" || type == "raw")) {
message("Copying raw data files\n")
rawArchive <- lapply(rawArchiveURL, function(url) {
filedest = paste(path, basename(url), sep = "/")
dnld = try(download.file(url, filedest, mode = "wb"))
if (inherits(dnld, "try-error") || file.info(filedest)$size ==
0) {
warning(paste(url, " does not exist or is empty. \n"),
sep = "")
}
else {
return(filedest)
}
})
if (!is.null(rawArchive)) {
rawArchive = unlist(rawArchive)
rawArchive = basename(rawArchive)
}
}
if ((type != "mageFilesOnly" && type == "full" || type ==
"processed") && !is.null(procArchiveURL)) {
message("Copying processed data files\n")
processedArchive <- lapply(procArchiveURL, function(url) {
filedest = paste(path, basename(url), sep = "/")
dnld = try(download.file(url, filedest, mode = "wb"))
if (inherits(dnld, "try-error") || file.info(filedest)$size ==
0) {
warning(paste(url, " does not exist or is empty. \n"),
sep = "")
}
else {
return(filedest)
}
})
if (!is.null(processedArchive)) {
processedArchive = unlist(processedArchive)
processedArchive = basename(processedArchive)
}
}
}
rawFiles = NULL
processedFiles = NULL
if (extract) {
message("Unpacking data files")
if (!is.null(rawArchive))
rawFiles <- lapply(rawArchive, function(zipfile) {
rawfiles = extract.zip(file = paste(path, zipfile,
sep = "/"))
return(rawfiles)
})
if (!is.null(processedArchive))
processedFiles <- lapply(processedArchive, function(zipfile) {
procfiles = extract.zip(file = paste(path, zipfile,
sep = "/"))
return(procfiles)
})
if (!is.null(rawFiles))
rawFiles = unlist(rawFiles)
if (!is.null(processedFiles))
processedFiles = unlist(processedFiles)
}
res = list(path = path, rawFiles = rawFiles, rawArchive = rawArchive,
processedFiles = processedFiles, processedArchive = processedArchive,
sdrf = sdrfFile, idf = idfFile, adf = adfFiles)
return(res)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment