Skip to content

Instantly share code, notes, and snippets.

@memoiry
Created August 21, 2017 14:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save memoiry/30715257430b3896507996c53532fe5c to your computer and use it in GitHub Desktop.
Save memoiry/30715257430b3896507996c53532fe5c to your computer and use it in GitHub Desktop.
runMashup and mashup_runCV_featureSet functions added to the netDx package.
#' Run Mashup cross-validation with provided networks.
#'
#' @details Optionally writes one query file per cross-validation fold
#' (via \code{makeCVqueries()} and \code{GM_writeQueryFile()}), runs Mashup
#' network selection on the query directory through \code{runMashup()}, and
#' reads back the resulting network tally from
#' \code{<queries_dir>/smooth_result/top_networks.txt} (or
#' \code{no_smooth_result} when \code{smooth = FALSE}).
#' @param Mashup_db (char) path to Mashup generic database with
#' training population.
#' @param queries_dir (char) directory where the query files are stored.
#' Created (recursively) if it does not exist.
#' @param trainID_pred (char) training samples' index.
#' @param true_pheno (data.frame) pheno information. Passed on to
#' \code{runMashup()}, which reads its \code{ID} column. Its row count is
#' used as the number of training samples written to each query file.
#' @param incNets (char) vector of networks to include in this analysis
#' (features/pathway names). Useful for subset-based feature selection.
#' @param smooth (logical) perform smooth in the network or not.
#' @param cut_off (integer) cutoff to eliminate redundant network through
#' network tally.
#' @param orgName (char) organism name for Mashup generic database.
#' The default value will likely never need to be changed.
#' @param write_query (logical) write the query files from this function.
#' If FALSE, the files are expected to already exist in \code{queries_dir}.
#' @param fileSfx (char) file suffix; query files are named
#' \code{<fileSfx>_<fold>.query}.
#' @param verbose (logical) print messages.
#' @param numCores (integer) num parallel threads for cross-validation.
#' Currently not used by this function.
#' @param seed_CVqueries (integer) RNG seed for inner cross validation loop.
#' Makes deterministic samples held-out for each mashup query (see
#' makeCVqueries())
#' @param ... args for \code{makeCVqueries()}
#' @return (list) with elements \code{top_net} (char, path to the network
#' tally file) and \code{tally} (char vector, first column of that file with
#' the first occurrence of "_cont" removed from each entry).
#' @examples
#' \dontrun{
#' mashup_runCV_featureSet(Mashup_db, queries_dir, trainID_pred, true_pheno,
#'   incNets = "all", smooth = TRUE, cut_off = 9, orgName = "predictor",
#'   write_query = TRUE, fileSfx = "CV", verbose = FALSE, numCores = 2L,
#'   seed_CVqueries = 42L)
#' }
#' @export
mashup_runCV_featureSet <- function(Mashup_db, queries_dir, trainID_pred,
                                    true_pheno, incNets = "all",
                                    smooth = TRUE, cut_off = 9,
                                    orgName = "predictor",
                                    write_query = TRUE, fileSfx = "CV",
                                    verbose = FALSE, numCores = 2L,
                                    seed_CVqueries = 42L, ...) {
  # NROW() gives the row count for a data.frame and the element count for a
  # plain vector; length() on a data.frame would return the column count.
  num_train_samps <- NROW(true_pheno)

  if (!dir.exists(queries_dir)) {
    dir.create(queries_dir, recursive = TRUE)
  }

  if (write_query) {
    if (verbose) {
      cat("\tWriting GM queries: ")
    }
    qSamps <- makeCVqueries(trainID_pred, verbose = verbose,
                            setSeed = seed_CVqueries, ...)
    # seq_along() keeps the loop empty when no folds were produced.
    for (m in seq_along(qSamps)) {
      if (verbose) {
        cat(sprintf("%i ", m))
      }
      qFile <- sprintf("%s/%s_%i.query", queries_dir, fileSfx, m)
      GM_writeQueryFile(qSamps[[m]], incNets, num_train_samps,
                        qFile, orgName)
    }
  }

  # Network selection mode: the whole query directory is handed to Mashup.
  runMashup(Mashup_db, queries_dir, true_pheno, trainID_pred = trainID_pred,
            smooth = smooth, ranking = FALSE, cut_off = cut_off,
            verbose = verbose)

  result_dir <- if (smooth) "smooth_result" else "no_smooth_result"
  top_net <- sprintf("%s/%s/top_networks.txt", queries_dir, result_dir)
  if (!file.exists(top_net)) {
    stop("Mashup did not produce the network tally file: ", top_net,
         call. = FALSE)
  }

  mashupTally <- read.delim(top_net, header = FALSE,
                            stringsAsFactors = FALSE)
  mashupTally <- sub("_cont", "", mashupTally[[1]])
  list(top_net = top_net, tally = mashupTally)
}
#' Run a Mashup feature selection or patients ranking query.
#'
#' @details Writes \code{ids.txt} (the \code{ID} column of \code{true_pheno})
#' into \code{dirname(queries)}, builds a \code{julia mashup.jl} command line
#' and runs it with \code{system()}. In selection mode
#' (\code{ranking = FALSE}) it also writes \code{<queries>/labels.txt}, with
#' STATUS 1 for samples whose ID is in \code{trainID_pred} and -1 otherwise.
#' @param Mashup_db (char) path to directory with Mashup generic database
#' @param queries (char) path to a single query file when
#' \code{ranking = TRUE}, or to a directory of query files when
#' \code{ranking = FALSE}.
#' @param true_pheno (data.frame) pheno information; must contain an
#' \code{ID} column.
#' @param trainID_pred (char) training samples' index. Required when
#' \code{ranking = FALSE}; not used when \code{ranking = TRUE}.
#' @param smooth (logical) perform smooth in the network or not.
#' @param verbose (logical) print messages
#' @param ranking (logical) rank patients or not.
#' @param top_net (char) a file stores selected top networks for the
#' interested type. Only used when \code{ranking = TRUE}.
#' @param cut_off (integer) cutoff to eliminate redundant network through
#' network tally. Only used when \code{ranking = FALSE}.
#' @return if used for patients ranking, return path to Mashup PRANK file
#' (\code{<queries>_smooth_mashup_PRANK.txt} or
#' \code{<queries>_no_smooth_mashup_PRANK.txt}); the existence of the file is
#' not checked. In selection mode, returns \code{NULL} invisibly.
#' @examples
#' \dontrun{
#' runMashup(Mashup_db, queries, true_pheno, trainID_pred = NULL,
#'   smooth = TRUE, verbose = TRUE, ranking = TRUE, top_net = NULL,
#'   cut_off = 9)
#' }
#' @export
runMashup <- function(Mashup_db, queries, true_pheno, trainID_pred = NULL,
                      smooth = TRUE, verbose = TRUE, ranking = TRUE,
                      top_net = NULL, cut_off = 9) {
  # Validate the mode-specific inputs before any file is written.
  if (ranking) {
    # In ranking, the query must be a single flat query file rather than a
    # directory containing many query files.
    if (dir.exists(queries)) {
      stop("'queries' must be a query file, not a directory, ",
           "when ranking = TRUE", call. = FALSE)
    }
  } else {
    if (!dir.exists(queries)) {
      stop("'queries' must be an existing directory when ranking = FALSE",
           call. = FALSE)
    }
    if (is.null(trainID_pred)) {
      stop("'trainID_pred' is required when ranking = FALSE ",
           "(it defines the labels file)", call. = FALSE)
    }
  }

  # The labels file is only consumed by the selection command and lives
  # inside the query directory, so it is only written in selection mode.
  if (!ranking) {
    true_pheno$STATUS <- ifelse(true_pheno$ID %in% trainID_pred, 1, -1)
    labels_file <- sprintf("%s/labels.txt", queries)
    if (verbose) {
      cat(labels_file)
    }
    write.table(true_pheno[c("ID", "STATUS")], file = labels_file,
                col.names = FALSE, row.names = FALSE, quote = FALSE)
  }

  id <- sprintf("%s/ids.txt", dirname(queries))
  if (verbose) {
    cat(id)
  }
  write.table(true_pheno["ID"], file = id,
              col.names = FALSE, row.names = FALSE, quote = FALSE)

  # Flag telling Mashup whether to smooth the similarity network.
  smooth_str <- if (smooth) "true" else "false"
  mashup_julia <- sprintf("%s/julia/mashup.jl", path.package("netDx"))

  # Quote every path so spaces or shell metacharacters survive the shell;
  # expand "~" first because a quoted tilde is no longer expanded by the shell.
  quote_path <- function(p) shQuote(path.expand(p))

  if (ranking) {
    # Patients ranking; optionally restricted to previously selected networks.
    top_net_arg <- if (is.null(top_net)) {
      ""
    } else {
      sprintf("--top_net %s ", quote_path(top_net))
    }
    cmd <- sprintf(
      "julia %s ranking %s--net %s --id %s --CV_query %s --smooth %s --res_dir %s",
      quote_path(mashup_julia), top_net_arg, quote_path(Mashup_db),
      quote_path(id), quote_path(queries), smooth_str,
      quote_path(dirname(queries))
    )
  } else {
    # Network selection over all query files in the directory.
    cmd <- sprintf(
      "julia %s selection --net %s --id %s --labels %s --CV_query %s --smooth %s --cut_off %d --res_dir %s",
      quote_path(mashup_julia), quote_path(Mashup_db), quote_path(id),
      quote_path(labels_file), quote_path(queries), smooth_str, cut_off,
      quote_path(queries)
    )
  }

  print(cmd)
  cat("\nRunning command:\n")
  t0 <- proc.time()[["elapsed"]]
  status <- system(cmd, wait = TRUE,
                   ignore.stdout = !verbose, ignore.stderr = !verbose)
  if (verbose) {
    cat(sprintf("Mashup time taken: %1.1f s\n",
                proc.time()[["elapsed"]] - t0))
  }
  # Surface a failed run instead of silently continuing; a warning (not an
  # error) keeps the previous best-effort behaviour for callers.
  if (status != 0) {
    warning(sprintf("Mashup command exited with status %s", status),
            call. = FALSE)
  }

  if (!ranking) {
    return(invisible(NULL))
  }
  if (smooth) {
    sprintf("%s_smooth_mashup_PRANK.txt", queries)
  } else {
    sprintf("%s_no_smooth_mashup_PRANK.txt", queries)
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment