Last active
February 12, 2020 23:36
-
-
Save kstawiski/22d73e01c648da7c25e2968c0d50d0d6 to your computer and use it in GitHub Desktop.
miRNAhelpers - code for miRNA feature selection (publication in progress..)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(plyr) | |
library(dplyr) | |
library(edgeR) | |
library(epiDisplay) | |
library(rsq) | |
library(MASS) | |
library(Biocomb) | |
library(caret) | |
library(dplyr) | |
library(epiDisplay) | |
library(pROC) | |
library(ggplot2) | |
library(DMwR) | |
library(ROSE) | |
library(gridExtra) | |
library(gplots) | |
library(devtools) | |
library(stringr) | |
library(data.table) | |
library(tidyverse) | |
ks.wykreskorelacji = function(var1, var2, labvar1, labvar2, title, yx = T, metoda = 'pearson', gdzie_legenda = "topleft") { | |
var1 = as.numeric(as.character(var1)) | |
var2 = as.numeric(as.character(var2)) | |
plot(var1, var2, pch = 19, cex=0.5, xlab=labvar1, ylab=labvar2, main=title) | |
if (yx == T) { abline(0,1, col='gray') } | |
abline(fit <- lm(var2 ~ var1), col='black', lty = 2) | |
temp = cor.test(var1, var2, method = metoda) | |
if (metoda=='pearson') { | |
if (temp$p.value < 0.0001) { | |
legend(gdzie_legenda, bty="n", legend=paste("r =",round(cor(var1,var2,use="complete.obs"),2),", p < 0.0001\nadj. R² =", format(summary(fit)$adj.r.squared, digits=4))) | |
} else { | |
legend(gdzie_legenda, bty="n", legend=paste("r =",round(cor(var1,var2,use="complete.obs"),2),", p =",round(temp$p.value, 4),"\nadj. R² =", format(summary(fit)$adj.r.squared, digits=4))) } } | |
if (metoda=="spearman") | |
{ if (temp$p.value < 0.0001) { | |
legend(gdzie_legenda, bty="n", legend=paste("rho =",round(cor(var1,var2,use="complete.obs"),2),", p < 0.0001")) | |
} else { | |
legend(gdzie_legenda, bty="n", legend=paste("rho =",round(cor(var1,var2,use="complete.obs"),2),", p =",round(temp$p.value, 4))) } } | |
} | |
ks.heatmap.3 <- function(x, | |
Rowv = TRUE, Colv = if (symm) "Rowv" else TRUE, | |
distfun = dist, | |
hclustfun = hclust, | |
dendrogram = c("both","row", "column", "none"), | |
symm = FALSE, | |
scale = c("none","row", "column"), | |
na.rm = TRUE, | |
revC = identical(Colv,"Rowv"), | |
add.expr, | |
breaks, | |
symbreaks = max(x < 0, na.rm = TRUE) || scale != "none", | |
col = "heat.colors", | |
colsep, | |
rowsep, | |
sepcolor = "white", | |
sepwidth = c(0.05, 0.05), | |
cellnote, | |
notecex = 1, | |
notecol = "cyan", | |
na.color = par("bg"), | |
trace = c("none", "column","row", "both"), | |
tracecol = "cyan", | |
hline = median(breaks), | |
vline = median(breaks), | |
linecol = tracecol, | |
margins = c(5,5), | |
ColSideColors, | |
RowSideColors, | |
side.height.fraction=0.3, | |
cexRow = 0.2 + 1/log10(nr), | |
cexCol = 0.2 + 1/log10(nc), | |
labRow = NULL, | |
labCol = NULL, | |
key = TRUE, | |
keysize = 1.5, | |
density.info = c("none", "histogram", "density"), | |
denscol = tracecol, | |
symkey = max(x < 0, na.rm = TRUE) || symbreaks, | |
densadj = 0.25, | |
main = NULL, | |
xlab = NULL, | |
ylab = NULL, | |
lmat = NULL, | |
lhei = NULL, | |
lwid = NULL, | |
ColSideColorsSize = 1, | |
RowSideColorsSize = 1, | |
KeyValueName="Value",...){ | |
invalid <- function (x) { | |
if (missing(x) || is.null(x) || length(x) == 0) | |
return(TRUE) | |
if (is.list(x)) | |
return(all(sapply(x, invalid))) | |
else if (is.vector(x)) | |
return(all(is.na(x))) | |
else return(FALSE) | |
} | |
x <- as.matrix(x) | |
scale01 <- function(x, low = min(x), high = max(x)) { | |
x <- (x - low)/(high - low) | |
x | |
} | |
retval <- list() | |
scale <- if (symm && missing(scale)) | |
"none" | |
else match.arg(scale) | |
dendrogram <- match.arg(dendrogram) | |
trace <- match.arg(trace) | |
density.info <- match.arg(density.info) | |
if (length(col) == 1 && is.character(col)) | |
col <- get(col, mode = "function") | |
if (!missing(breaks) && (scale != "none")) | |
warning("Using scale=\"row\" or scale=\"column\" when breaks are", | |
"specified can produce unpredictable results.", "Please consider using only one or the other.") | |
if (is.null(Rowv) || is.na(Rowv)) | |
Rowv <- FALSE | |
if (is.null(Colv) || is.na(Colv)) | |
Colv <- FALSE | |
else if (Colv == "Rowv" && !isTRUE(Rowv)) | |
Colv <- FALSE | |
if (length(di <- dim(x)) != 2 || !is.numeric(x)) | |
stop("`x' must be a numeric matrix") | |
nr <- di[1] | |
nc <- di[2] | |
if (nr <= 1 || nc <= 1) | |
stop("`x' must have at least 2 rows and 2 columns") | |
if (!is.numeric(margins) || length(margins) != 2) | |
stop("`margins' must be a numeric vector of length 2") | |
if (missing(cellnote)) | |
cellnote <- matrix("", ncol = ncol(x), nrow = nrow(x)) | |
if (!inherits(Rowv, "dendrogram")) { | |
if (((!isTRUE(Rowv)) || (is.null(Rowv))) && (dendrogram %in% | |
c("both", "row"))) { | |
if (is.logical(Colv) && (Colv)) | |
dendrogram <- "column" | |
else dedrogram <- "none" | |
warning("Discrepancy: Rowv is FALSE, while dendrogram is `", | |
dendrogram, "'. Omitting row dendogram.") | |
} | |
} | |
if (!inherits(Colv, "dendrogram")) { | |
if (((!isTRUE(Colv)) || (is.null(Colv))) && (dendrogram %in% | |
c("both", "column"))) { | |
if (is.logical(Rowv) && (Rowv)) | |
dendrogram <- "row" | |
else dendrogram <- "none" | |
warning("Discrepancy: Colv is FALSE, while dendrogram is `", | |
dendrogram, "'. Omitting column dendogram.") | |
} | |
} | |
if (inherits(Rowv, "dendrogram")) { | |
ddr <- Rowv | |
rowInd <- order.dendrogram(ddr) | |
} | |
else if (is.integer(Rowv)) { | |
hcr <- hclustfun(distfun(x)) | |
ddr <- as.dendrogram(hcr) | |
ddr <- reorder(ddr, Rowv) | |
rowInd <- order.dendrogram(ddr) | |
if (nr != length(rowInd)) | |
stop("row dendrogram ordering gave index of wrong length") | |
} | |
else if (isTRUE(Rowv)) { | |
Rowv <- rowMeans(x, na.rm = na.rm) | |
hcr <- hclustfun(distfun(x)) | |
ddr <- as.dendrogram(hcr) | |
ddr <- reorder(ddr, Rowv) | |
rowInd <- order.dendrogram(ddr) | |
if (nr != length(rowInd)) | |
stop("row dendrogram ordering gave index of wrong length") | |
} | |
else { | |
rowInd <- nr:1 | |
} | |
if (inherits(Colv, "dendrogram")) { | |
ddc <- Colv | |
colInd <- order.dendrogram(ddc) | |
} | |
else if (identical(Colv, "Rowv")) { | |
if (nr != nc) | |
stop("Colv = \"Rowv\" but nrow(x) != ncol(x)") | |
if (exists("ddr")) { | |
ddc <- ddr | |
colInd <- order.dendrogram(ddc) | |
} | |
else colInd <- rowInd | |
} | |
else if (is.integer(Colv)) { | |
hcc <- hclustfun(distfun(if (symm) | |
x | |
else t(x))) | |
ddc <- as.dendrogram(hcc) | |
ddc <- reorder(ddc, Colv) | |
colInd <- order.dendrogram(ddc) | |
if (nc != length(colInd)) | |
stop("column dendrogram ordering gave index of wrong length") | |
} | |
else if (isTRUE(Colv)) { | |
Colv <- colMeans(x, na.rm = na.rm) | |
hcc <- hclustfun(distfun(if (symm) | |
x | |
else t(x))) | |
ddc <- as.dendrogram(hcc) | |
ddc <- reorder(ddc, Colv) | |
colInd <- order.dendrogram(ddc) | |
if (nc != length(colInd)) | |
stop("column dendrogram ordering gave index of wrong length") | |
} | |
else { | |
colInd <- 1:nc | |
} | |
retval$rowInd <- rowInd | |
retval$colInd <- colInd | |
retval$call <- match.call() | |
x <- x[rowInd, colInd] | |
x.unscaled <- x | |
cellnote <- cellnote[rowInd, colInd] | |
if (is.null(labRow)) | |
labRow <- if (is.null(rownames(x))) | |
(1:nr)[rowInd] | |
else rownames(x) | |
else labRow <- labRow[rowInd] | |
if (is.null(labCol)) | |
labCol <- if (is.null(colnames(x))) | |
(1:nc)[colInd] | |
else colnames(x) | |
else labCol <- labCol[colInd] | |
if (scale == "row") { | |
retval$rowMeans <- rm <- rowMeans(x, na.rm = na.rm) | |
x <- sweep(x, 1, rm) | |
retval$rowSDs <- sx <- apply(x, 1, sd, na.rm = na.rm) | |
x <- sweep(x, 1, sx, "/") | |
} | |
else if (scale == "column") { | |
retval$colMeans <- rm <- colMeans(x, na.rm = na.rm) | |
x <- sweep(x, 2, rm) | |
retval$colSDs <- sx <- apply(x, 2, sd, na.rm = na.rm) | |
x <- sweep(x, 2, sx, "/") | |
} | |
if (missing(breaks) || is.null(breaks) || length(breaks) < 1) { | |
if (missing(col) || is.function(col)) | |
breaks <- 16 | |
else breaks <- length(col) + 1 | |
} | |
if (length(breaks) == 1) { | |
if (!symbreaks) | |
breaks <- seq(min(x, na.rm = na.rm), max(x, na.rm = na.rm), | |
length = breaks) | |
else { | |
extreme <- max(abs(x), na.rm = TRUE) | |
breaks <- seq(-extreme, extreme, length = breaks) | |
} | |
} | |
nbr <- length(breaks) | |
ncol <- length(breaks) - 1 | |
if (class(col) == "function") | |
col <- col(ncol) | |
min.breaks <- min(breaks) | |
max.breaks <- max(breaks) | |
x[x < min.breaks] <- min.breaks | |
x[x > max.breaks] <- max.breaks | |
if (missing(lhei) || is.null(lhei)) | |
lhei <- c(keysize, 4) | |
if (missing(lwid) || is.null(lwid)) | |
lwid <- c(keysize, 4) | |
if (missing(lmat) || is.null(lmat)) { | |
lmat <- rbind(4:3, 2:1) | |
if (!missing(ColSideColors)) { | |
#if (!is.matrix(ColSideColors)) | |
#stop("'ColSideColors' must be a matrix") | |
if (!is.character(ColSideColors) || nrow(ColSideColors) != nc) | |
stop("'ColSideColors' must be a matrix of nrow(x) rows") | |
lmat <- rbind(lmat[1, ] + 1, c(NA, 1), lmat[2, ] + 1) | |
#lhei <- c(lhei[1], 0.2, lhei[2]) | |
lhei=c(lhei[1], side.height.fraction*ColSideColorsSize/2, lhei[2]) | |
} | |
if (!missing(RowSideColors)) { | |
#if (!is.matrix(RowSideColors)) | |
#stop("'RowSideColors' must be a matrix") | |
if (!is.character(RowSideColors) || ncol(RowSideColors) != nr) | |
stop("'RowSideColors' must be a matrix of ncol(x) columns") | |
lmat <- cbind(lmat[, 1] + 1, c(rep(NA, nrow(lmat) - 1), 1), lmat[,2] + 1) | |
#lwid <- c(lwid[1], 0.2, lwid[2]) | |
lwid <- c(lwid[1], side.height.fraction*RowSideColorsSize/2, lwid[2]) | |
} | |
lmat[is.na(lmat)] <- 0 | |
} | |
if (length(lhei) != nrow(lmat)) | |
stop("lhei must have length = nrow(lmat) = ", nrow(lmat)) | |
if (length(lwid) != ncol(lmat)) | |
stop("lwid must have length = ncol(lmat) =", ncol(lmat)) | |
op <- par(no.readonly = TRUE) | |
on.exit(par(op)) | |
layout(lmat, widths = lwid, heights = lhei, respect = FALSE) | |
if (!missing(RowSideColors)) { | |
if (!is.matrix(RowSideColors)){ | |
par(mar = c(margins[1], 0, 0, 0.5)) | |
image(rbind(1:nr), col = RowSideColors[rowInd], axes = FALSE) | |
} else { | |
par(mar = c(margins[1], 0, 0, 0.5)) | |
rsc = t(RowSideColors[,rowInd, drop=F]) | |
rsc.colors = matrix() | |
rsc.names = names(table(rsc)) | |
rsc.i = 1 | |
for (rsc.name in rsc.names) { | |
rsc.colors[rsc.i] = rsc.name | |
rsc[rsc == rsc.name] = rsc.i | |
rsc.i = rsc.i + 1 | |
} | |
rsc = matrix(as.numeric(rsc), nrow = dim(rsc)[1]) | |
image(t(rsc), col = as.vector(rsc.colors), axes = FALSE) | |
if (length(rownames(RowSideColors)) > 0) { | |
axis(1, 0:(dim(rsc)[2] - 1)/max(1,(dim(rsc)[2] - 1)), rownames(RowSideColors), las = 2, tick = FALSE) | |
} | |
} | |
} | |
if (!missing(ColSideColors)) { | |
if (!is.matrix(ColSideColors)){ | |
par(mar = c(0.5, 0, 0, margins[2])) | |
image(cbind(1:nc), col = ColSideColors[colInd], axes = FALSE) | |
} else { | |
par(mar = c(0.5, 0, 0, margins[2])) | |
csc = ColSideColors[colInd, , drop=F] | |
csc.colors = matrix() | |
csc.names = names(table(csc)) | |
csc.i = 1 | |
for (csc.name in csc.names) { | |
csc.colors[csc.i] = csc.name | |
csc[csc == csc.name] = csc.i | |
csc.i = csc.i + 1 | |
} | |
csc = matrix(as.numeric(csc), nrow = dim(csc)[1]) | |
image(csc, col = as.vector(csc.colors), axes = FALSE) | |
if (length(colnames(ColSideColors)) > 0) { | |
axis(2, 0:(dim(csc)[2] - 1)/max(1,(dim(csc)[2] - 1)), colnames(ColSideColors), las = 2, tick = FALSE) | |
} | |
} | |
} | |
par(mar = c(margins[1], 0, 0, margins[2])) | |
x <- t(x) | |
cellnote <- t(cellnote) | |
if (revC) { | |
iy <- nr:1 | |
if (exists("ddr")) | |
ddr <- rev(ddr) | |
x <- x[, iy] | |
cellnote <- cellnote[, iy] | |
} | |
else iy <- 1:nr | |
image(1:nc, 1:nr, x, xlim = 0.5 + c(0, nc), ylim = 0.5 + c(0, nr), axes = FALSE, xlab = "", ylab = "", col = col, breaks = breaks, ...) | |
retval$carpet <- x | |
if (exists("ddr")) | |
retval$rowDendrogram <- ddr | |
if (exists("ddc")) | |
retval$colDendrogram <- ddc | |
retval$breaks <- breaks | |
retval$col <- col | |
if (!invalid(na.color) & any(is.na(x))) { # load library(gplots) | |
mmat <- ifelse(is.na(x), 1, NA) | |
image(1:nc, 1:nr, mmat, axes = FALSE, xlab = "", ylab = "", | |
col = na.color, add = TRUE) | |
} | |
axis(1, 1:nc, labels = labCol, las = 2, line = -0.5, tick = 0, | |
cex.axis = cexCol) | |
if (!is.null(xlab)) | |
mtext(xlab, side = 1, line = margins[1] - 1.25) | |
axis(4, iy, labels = labRow, las = 2, line = -0.5, tick = 0, | |
cex.axis = cexRow) | |
if (!is.null(ylab)) | |
mtext(ylab, side = 4, line = margins[2] - 1.25) | |
if (!missing(add.expr)) | |
eval(substitute(add.expr)) | |
if (!missing(colsep)) | |
for (csep in colsep) rect(xleft = csep + 0.5, ybottom = rep(0, length(csep)), xright = csep + 0.5 + sepwidth[1], ytop = rep(ncol(x) + 1, csep), lty = 1, lwd = 1, col = sepcolor, border = sepcolor) | |
if (!missing(rowsep)) | |
for (rsep in rowsep) rect(xleft = 0, ybottom = (ncol(x) + 1 - rsep) - 0.5, xright = nrow(x) + 1, ytop = (ncol(x) + 1 - rsep) - 0.5 - sepwidth[2], lty = 1, lwd = 1, col = sepcolor, border = sepcolor) | |
min.scale <- min(breaks) | |
max.scale <- max(breaks) | |
x.scaled <- scale01(t(x), min.scale, max.scale) | |
if (trace %in% c("both", "column")) { | |
retval$vline <- vline | |
vline.vals <- scale01(vline, min.scale, max.scale) | |
for (i in colInd) { | |
if (!is.null(vline)) { | |
abline(v = i - 0.5 + vline.vals, col = linecol, | |
lty = 2) | |
} | |
xv <- rep(i, nrow(x.scaled)) + x.scaled[, i] - 0.5 | |
xv <- c(xv[1], xv) | |
yv <- 1:length(xv) - 0.5 | |
lines(x = xv, y = yv, lwd = 1, col = tracecol, type = "s") | |
} | |
} | |
if (trace %in% c("both", "row")) { | |
retval$hline <- hline | |
hline.vals <- scale01(hline, min.scale, max.scale) | |
for (i in rowInd) { | |
if (!is.null(hline)) { | |
abline(h = i + hline, col = linecol, lty = 2) | |
} | |
yv <- rep(i, ncol(x.scaled)) + x.scaled[i, ] - 0.5 | |
yv <- rev(c(yv[1], yv)) | |
xv <- length(yv):1 - 0.5 | |
lines(x = xv, y = yv, lwd = 1, col = tracecol, type = "s") | |
} | |
} | |
if (!missing(cellnote)) | |
text(x = c(row(cellnote)), y = c(col(cellnote)), labels = c(cellnote), | |
col = notecol, cex = notecex) | |
par(mar = c(margins[1], 0, 0, 0)) | |
if (dendrogram %in% c("both", "row")) { | |
plot(ddr, horiz = TRUE, axes = FALSE, yaxs = "i", leaflab = "none") | |
} | |
else plot.new() | |
par(mar = c(0, 0, if (!is.null(main)) 5 else 0, margins[2])) | |
if (dendrogram %in% c("both", "column")) { | |
plot(ddc, axes = FALSE, xaxs = "i", leaflab = "none") | |
} | |
else plot.new() | |
if (!is.null(main)) | |
title(main, cex.main = 1.5 * op[["cex.main"]]) | |
if (key) { | |
par(mar = c(5, 4, 2, 1), cex = 0.75) | |
tmpbreaks <- breaks | |
if (symkey) { | |
max.raw <- max(abs(c(x, breaks)), na.rm = TRUE) | |
min.raw <- -max.raw | |
tmpbreaks[1] <- -max(abs(x), na.rm = TRUE) | |
tmpbreaks[length(tmpbreaks)] <- max(abs(x), na.rm = TRUE) | |
} | |
else { | |
min.raw <- min(x, na.rm = TRUE) | |
max.raw <- max(x, na.rm = TRUE) | |
} | |
z <- seq(min.raw, max.raw, length = length(col)) | |
image(z = matrix(z, ncol = 1), col = col, breaks = tmpbreaks, | |
xaxt = "n", yaxt = "n") | |
par(usr = c(0, 1, 0, 1)) | |
lv <- pretty(breaks) | |
xv <- scale01(as.numeric(lv), min.raw, max.raw) | |
axis(1, at = xv, labels = lv) | |
if (scale == "row") | |
mtext(side = 1, "Row Z-Score", line = 2) | |
else if (scale == "column") | |
mtext(side = 1, "Column Z-Score", line = 2) | |
else mtext(side = 1, KeyValueName, line = 2) | |
if (density.info == "density") { | |
dens <- density(x, adjust = densadj, na.rm = TRUE) | |
omit <- dens$x < min(breaks) | dens$x > max(breaks) | |
dens$x <- dens$x[-omit] | |
dens$y <- dens$y[-omit] | |
dens$x <- scale01(dens$x, min.raw, max.raw) | |
lines(dens$x, dens$y/max(dens$y) * 0.95, col = denscol, | |
lwd = 1) | |
axis(2, at = pretty(dens$y)/max(dens$y) * 0.95, pretty(dens$y)) | |
title("Color Key\nand Density Plot") | |
par(cex = 0.5) | |
mtext(side = 2, "Density", line = 2) | |
} | |
else if (density.info == "histogram") { | |
h <- hist(x, plot = FALSE, breaks = breaks) | |
hx <- scale01(breaks, min.raw, max.raw) | |
hy <- c(h$counts, h$counts[length(h$counts)]) | |
lines(hx, hy/max(hy) * 0.95, lwd = 1, type = "s", | |
col = denscol) | |
axis(2, at = pretty(hy)/max(hy) * 0.95, pretty(hy)) | |
title("Color Key\nand Histogram") | |
par(cex = 0.5) | |
mtext(side = 2, "Count", line = 2) | |
} | |
else title("Color Key") | |
} | |
else plot.new() | |
retval$colorTable <- data.frame(low = retval$breaks[-length(retval$breaks)], | |
high = retval$breaks[-1], color = retval$col) | |
invisible(retval) | |
} | |
# DE funkcja | |
ks.miRNA.de = function(ttpm_pofiltrze, klasy) | |
{ | |
wyniki = data.frame(miR = colnames(ttpm_pofiltrze)) | |
library(dplyr) | |
for (i in 1:length(colnames(ttpm_pofiltrze))) | |
{ | |
# Filtry | |
#wyniki[i,"jakieszero"] = ifelse(sum(ttpm_pofiltrze[,i] == 0) > 0, "tak", "nie") | |
# Średnia i SD | |
wyniki[i,"mean logtpm"] = mean(ttpm_pofiltrze[,i]) | |
wyniki[i,"median logtpm"] = median(ttpm_pofiltrze[,i]) | |
wyniki[i,"SD logtpm"] = sd(ttpm_pofiltrze[,i]) | |
# Cancer | |
tempx = ttpm_pofiltrze[klasy == "Cancer",] | |
wyniki[i,"cancer mean"] = mean(tempx[,i]) | |
wyniki[i,"cancer median"] = median(tempx[,i]) | |
wyniki[i,"cancer SD"] = sd(tempx[,i]) | |
# Cancer | |
tempx = ttpm_pofiltrze[klasy != "Cancer",] | |
wyniki[i,"control mean"] = mean(tempx[,i]) | |
wyniki[i,"control median"] = median(tempx[,i]) | |
wyniki[i,"control SD"] = sd(tempx[,i]) | |
# DE | |
temp = t.test(ttpm_pofiltrze[,i] ~ as.factor(klasy)) | |
fc = (temp$estimate[1] - temp$estimate[2]) | |
wyniki[i,"log10FC (subtr estim)"] = fc | |
wyniki[i,"log10FC"] = wyniki[i,"cancer mean"] - wyniki[i,"control mean"] | |
wyniki[i,"log2FC"] = wyniki[i,"log10FC"] / log10(2) | |
revfc = (temp$estimate[2] - temp$estimate[1]) | |
wyniki[i,"reversed_log10FC"] = revfc | |
wyniki[i,"reverse_log2FC"] = wyniki[i,"reversed_log10FC"] / log10(2) | |
#wyniki[i,"log2FC"] = log2(fc) | |
wyniki[i,"p-value"] = temp$p.value | |
} | |
wyniki[,"p-value Bonferroni"] = p.adjust(wyniki$`p-value`, method = "bonferroni") | |
wyniki[,"p-value Holm"] = p.adjust(wyniki$`p-value`, method = "holm") | |
wyniki[,"-log10(p-value Bonferroni)"] = -log10(p.adjust(wyniki$`p-value`, method = "bonferroni")) | |
wyniki[,"p-value BH"] = p.adjust(wyniki$`p-value`, method = "BH") | |
return(wyniki %>% arrange(`p-value BH`)) | |
} | |
ks.wykreskorelacji = function(var1, var2, labvar1, labvar2, title) { | |
plot(var1, var2, pch = 19, cex=0.5, xlab=labvar1, ylab=labvar2, main=title) | |
abline(0,1, col='gray') | |
abline(fit <- lm(var2 ~ var1), col='black', lty = 2) | |
temp = cor.test(var1, var2, method = 'pearson') | |
legend("topleft", bty="n", legend=paste("r =",round(cor(var1,var2),2),", p =",round(temp$p.value, 4),"\nR² =", format(summary(fit)$adj.r.squared, digits=4))) | |
} | |
ks.miR.formula = function(wybrane_miRy) { | |
as.formula(paste0("Class ~ ",paste0(as.character(wybrane_miRy), collapse = " + "))) | |
} | |
ks.wczytajmix = function(wd = getwd(), smote_over = 10000, use_smote_not_rose = F, replace_smote = F) { | |
oldwd = getwd() | |
setwd(wd) | |
train = dplyr::select(read.csv("mixed_train.csv", stringsAsFactors = F), starts_with("hsa"), Class) | |
test = dplyr::select(read.csv("mixed_test.csv", stringsAsFactors = F), starts_with("hsa"), Class) | |
valid = dplyr::select(read.csv("mixed_valid.csv", stringsAsFactors = F), starts_with("hsa"), Class) | |
train$Class = factor(train$Class, levels = c("Control","Cancer")) | |
test$Class = factor(test$Class, levels = c("Control","Cancer")) | |
valid$Class = factor(valid$Class, levels = c("Control","Cancer")) | |
# Wywalamy miRy z zerowa wariancja | |
temp = train %>% filter(Class == "Cancer") | |
temp2 = as.numeric(which(apply(temp, 2, var) == 0)) | |
temp = train %>% filter(Class == "Control") | |
temp3 = as.numeric(which(apply(temp, 2, var) == 0)) | |
temp4 = unique(c(temp2, temp3)) | |
if (length(temp4) > 0) { | |
train = train[,-temp4] | |
test = test[,-temp4] | |
valid = valid[,-temp4] | |
} | |
if(use_smote_not_rose) { | |
train_smoted = DMwR::SMOTE(Class ~ ., data = train, perc.over = smote_over,perc.under=100, k=10) | |
train_smoted$Class = factor(train_smoted$Class, levels = c("Control","Cancer")) | |
} else { | |
rosed = ROSE(Class ~ ., data = train, N = nrow(train)*10, seed = 1) | |
train_smoted = rosed[["data"]] | |
train_smoted$Class = factor(train_smoted$Class, levels = c("Control","Cancer")) | |
} | |
if (replace_smote == T) { train = train_smoted } | |
trainx = dplyr::select(train, starts_with("hsa")) | |
trainx_smoted = dplyr::select(train_smoted, starts_with("hsa")) | |
setwd(oldwd) | |
return(list(train, test, valid, train_smoted, trainx, trainx_smoted)) | |
} | |
ks.selekcjamiRNA = function(wd = getwd(), m = c(1:70), | |
max_iterations = 10, code_path = "/home/konrad/snorlax/2019_PRELUDIUM/feature_selection/", | |
register_parallel = T, clx = NULL, stamp = as.numeric(Sys.time()), | |
prefer_no_features = 11, conda_path = "/home/konrad/anaconda3/bin/conda") { | |
oldwd = getwd() | |
setwd(wd) | |
if(!dir.exists("temp")) { dir.create("temp") } | |
run_id = stamp | |
formulas = list() | |
times = list() | |
zz <- file(paste0("temp/",stamp,"featureselection.log"), open = "wt") | |
sink(zz) | |
#sink(zz, type = "message") | |
pdf(paste0("temp/",stamp,"featureselection.pdf")) | |
dane = ks.wczytajmix(); train = dane[[1]]; test = dane[[2]]; valid = dane[[3]]; train_smoted = dane[[4]]; trainx = dane[[5]]; trainx_smoted = dane[[6]] | |
n = 1 | |
if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); | |
start_time <- Sys.time() | |
formulas[["all"]] = ks.miR.formula(colnames(trainx)) | |
end_time <- Sys.time() | |
saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")) | |
saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
cat("\n\nStandard DE...") | |
wyniki = ks.miRNA.de(trainx, train$Class) | |
istotne = filter(wyniki, `p-value BH` <= 0.05) %>% arrange(`p-value BH`) | |
istotne_top = wyniki %>% arrange(`p-value BH`) %>% head(prefer_no_features) | |
istotne_topBonf = wyniki %>% arrange(`p-value Bonferroni`) %>% head(prefer_no_features) | |
istotne_topHolm = wyniki %>% arrange(`p-value Holm`) %>% head(prefer_no_features) | |
istotne_topFC = wyniki %>% arrange(desc(abs(`log2FC`))) %>% head(prefer_no_features) | |
train_sig = dplyr::select(train, as.character(istotne$miR), Class) | |
trainx_sig = dplyr::select(train, as.character(istotne$miR)) | |
cat("\n\nPreparing for SMOTE...") | |
#train_sig_smoted = DMwR::SMOTE(Class ~ ., data = train_sig, perc.over = 10000,perc.under=100, k=10) | |
train_sig_smoted = dplyr::select(train_smoted, as.character(istotne$miR), Class) | |
train_sig_smoted$Class = factor(train_sig_smoted$Class, levels = c("Control","Cancer")) | |
trainx_sig_smoted = dplyr::select(train_sig_smoted, starts_with("hsa")) | |
wyniki_smoted = ks.miRNA.de(trainx_smoted, train_smoted$Class) | |
istotne_smoted = filter(wyniki_smoted, `p-value BH` <= 0.05) %>% arrange(`p-value BH`) | |
istotne_top_smoted = wyniki_smoted %>% arrange(`p-value BH`) %>% head(prefer_no_features) | |
istotne_topBonf_smoted = wyniki_smoted %>% arrange(`p-value Bonferroni`) %>% head(prefer_no_features) | |
istotne_topHolm_smoted = wyniki_smoted %>% arrange(`p-value Holm`) %>% head(prefer_no_features) | |
istotne_topFC_smoted = wyniki_smoted %>% arrange(desc(abs(`log2FC`))) %>% head(prefer_no_features) | |
# Caret prep | |
if (register_parallel) { | |
cat("\n\nGetting cluster ready...") | |
if(is.null(clx)) { | |
library(doParallel) | |
cl <- makePSOCKcluster(detectCores() - 2) | |
registerDoParallel(cl) } | |
else { registerDoParallel(clx) } | |
} | |
# 0. All and sig | |
cat("\n\nStarting selected methods...") | |
n = n + 1 | |
if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); | |
cat("\n\nStarting SIG") | |
start_time <- Sys.time() | |
formulas[["sig"]] = ks.miR.formula(as.character(istotne$miR)) | |
formulas[["sigtop"]] = ks.miR.formula(as.character(istotne_top$miR)) | |
formulas[["sigtopBonf"]] = ks.miR.formula(as.character(istotne_topBonf$miR)) | |
formulas[["sigtopHolm"]] = ks.miR.formula(as.character(istotne_topHolm$miR)) | |
formulas[["topFC"]] = ks.miR.formula(as.character(istotne_topFC$miR)) | |
formulas[["sigSMOTE"]] = ks.miR.formula(as.character(istotne_smoted$miR)) | |
formulas[["sigtopSMOTE"]] = ks.miR.formula(as.character(istotne_top_smoted$miR)) | |
formulas[["sigtopBonfSMOTE"]] = ks.miR.formula(as.character(istotne_topBonf_smoted$miR)) | |
formulas[["sigtopHolmSMOTE"]] = ks.miR.formula(as.character(istotne_topHolm_smoted$miR)) | |
formulas[["topFCSMOTE"]] = ks.miR.formula(as.character(istotne_topFC_smoted$miR)) | |
end_time <- Sys.time() | |
saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")) | |
saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
# 1. Fold-change and sig. filter | |
n = n + 1 | |
if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); | |
start_time <- Sys.time() | |
cat("\n\nStarting FC and SIG") | |
fcsig = as.character(istotne$miR[abs(istotne$log2FC)>1]) | |
formulas[["fcsig"]] = ks.miR.formula(fcsig) | |
fcsig = as.character(istotne_smoted$miR[abs(istotne_smoted$log2FC)>1]) | |
formulas[["fcsigSMOTE"]] = ks.miR.formula(fcsig) | |
end_time <- Sys.time() | |
saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")) | |
saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
# 2. CFS | |
n = n + 1 | |
if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); | |
start_time <- Sys.time() | |
cat("\n\nStarting CFS") | |
cfs = select.cfs(train) | |
formulas[["cfs"]] = ks.miR.formula(as.character(cfs$Biomarker)) | |
cfs = select.cfs(train_smoted) | |
formulas[["cfsSMOTE"]] = ks.miR.formula(as.character(cfs$Biomarker)) | |
cfs = select.cfs(train_sig) | |
formulas[["cfs_sig"]] = ks.miR.formula(as.character(cfs$Biomarker)) | |
cfs = select.cfs(train_sig_smoted) | |
formulas[["cfsSMOTE_sig"]] = ks.miR.formula(as.character(cfs$Biomarker)) | |
end_time <- Sys.time() | |
saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")) | |
saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
# 3. classifier.loop | |
n = n + 1 | |
if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); | |
start_time <- Sys.time() | |
cat("\n\nStarting classifier loop") | |
classloop = classifier.loop(train, feature.selection = "auc", method.cross="fold-crossval", classifiers=c("svm","lda","rf","nsc"), no.feat=prefer_no_features) | |
f_classloop = rownames(classloop$no.selected)[classloop$no.selected[,1]>0] | |
formulas[["classloop"]] = ks.miR.formula(f_classloop) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
classloop = classifier.loop(train_smoted, feature.selection = "auc", method.cross="fold-crossval", classifiers=c("svm","lda","rf","nsc"), no.feat=prefer_no_features) | |
f_classloop = rownames(classloop$no.selected)[classloop$no.selected[,1]>0] | |
formulas[["classloopSMOTE"]] = ks.miR.formula(f_classloop) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
classloop = classifier.loop(train_sig, feature.selection = "auc", method.cross="fold-crossval", classifiers=c("svm","lda","rf","nsc"), no.feat=prefer_no_features) | |
f_classloop = rownames(classloop$no.selected)[classloop$no.selected[,1]>0] | |
formulas[["classloop_sig"]] = ks.miR.formula(f_classloop) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
classloop = classifier.loop(train_sig, feature.selection = "auc", method.cross="fold-crossval", classifiers=c("svm","lda","rf","nsc"), no.feat=prefer_no_features) | |
f_classloop = rownames(classloop$no.selected)[classloop$no.selected[,1]>0] | |
formulas[["classloopSMOTE_sig"]] = ks.miR.formula(f_classloop) | |
end_time <- Sys.time() | |
saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")) | |
saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
# 4. select.forward.Corr | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
cat("\n\nStarting select.forward.Corr") | |
fcfs = select.forward.Corr(train, disc.method="MDL") | |
formulas[["fcfs"]] = ks.miR.formula(fcfs) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
fcfs = select.forward.Corr(train_smoted, disc.method="MDL") | |
formulas[["fcfsSMOTE"]] = ks.miR.formula(fcfs) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
fcfs = select.forward.Corr(train_sig, disc.method="MDL") | |
formulas[["fcfs_sig"]] = ks.miR.formula(fcfs) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
fcfs = select.forward.Corr(train_sig_smoted, disc.method="MDL") | |
formulas[["fcfsSMOTE_sig"]] = ks.miR.formula(fcfs) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
# 5. select.forward.wrapper | |
fwrap = select.forward.wrapper(train) | |
formulas[["fwrap"]] = ks.miR.formula(fwrap) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
fwrap = select.forward.wrapper(train_smoted) | |
formulas[["fwrapSMOTE"]] = ks.miR.formula(fwrap) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
fwrap = select.forward.wrapper(train_sig) | |
formulas[["fwrap_sig"]] = ks.miR.formula(fwrap) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
fwrap = select.forward.wrapper(train_sig_smoted) | |
formulas[["fwrapSMOTE_sig"]] = ks.miR.formula(fwrap) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
# 6, 7, 8. | |
#select.process(dattable,method="InformationGain",disc.method="MDL", | |
# threshold=0.2,threshold.consis=0.05,attrs.nominal=numeric(), | |
# max.no.features=10) | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
cat("\n\nStarting MDL") | |
formulas[["AUC_MDL"]] = ks.miR.formula(colnames(train)[select.process(train, method="auc", disc.method = "MDL", max.no.features = prefer_no_features)]) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
formulas[["SU_MDL"]] = ks.miR.formula(colnames(train)[select.process(train, method="symmetrical.uncertainty", disc.method = "MDL", max.no.features = prefer_no_features)]) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
formulas[["CorrSF_MDL"]] = ks.miR.formula(colnames(train)[select.process(train, method="CorrSF", disc.method = "MDL", max.no.features = prefer_no_features)]) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
formulas[["AUC_MDLSMOTE"]] = ks.miR.formula(colnames(train_smoted)[select.process(train_smoted, method="auc", disc.method = "MDL", max.no.features = prefer_no_features)]) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
formulas[["SU_MDLSMOTE"]] = ks.miR.formula(colnames(train_smoted)[select.process(train_smoted, method="symmetrical.uncertainty", disc.method = "MDL", max.no.features = prefer_no_features)]) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
formulas[["CorrSF_MDLSMOTE"]] = ks.miR.formula(colnames(train_smoted)[select.process(train_smoted, method="CorrSF", disc.method = "MDL", max.no.features = prefer_no_features)]) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
formulas[["AUC_MDL_sig"]] = ks.miR.formula(colnames(train_sig)[select.process(train_sig, method="auc", disc.method = "MDL", max.no.features = prefer_no_features)]) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
formulas[["SU_MDL_sig"]] = ks.miR.formula(colnames(train_sig)[select.process(train_sig, method="symmetrical.uncertainty", disc.method = "MDL", max.no.features = prefer_no_features)]) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
formulas[["CorrSF_MDL_sig"]] = ks.miR.formula(colnames(train_sig)[select.process(train_sig, method="CorrSF", disc.method = "MDL", max.no.features = prefer_no_features)]) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
formulas[["AUC_MDLSMOTE_sig"]] = ks.miR.formula(colnames(train_sig_smoted)[select.process(train_sig_smoted, method="auc", disc.method = "MDL", max.no.features = prefer_no_features)]) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
formulas[["SU_MDLSMOTE_sig"]] = ks.miR.formula(colnames(train_sig_smoted)[select.process(train_sig_smoted, method="symmetrical.uncertainty", disc.method = "MDL", max.no.features = prefer_no_features)]) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
formulas[["CorrSF_MDLSMOTE_sig"]] = ks.miR.formula(colnames(train_sig_smoted)[select.process(train_sig_smoted, method="CorrSF", disc.method = "MDL", max.no.features = prefer_no_features)]) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
# 9. bounceR - genetic | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
cat("\n\nStarting bounceR..") | |
library(bounceR) | |
mrmr <- bounceR::featureSelection(data = train, | |
target = "Class", | |
max_time = "15 mins", | |
selection = selectionControl(n_rounds = NULL, | |
n_mods = NULL, | |
p = prefer_no_features, | |
penalty = 0.5, | |
reward = 0.2), | |
bootstrap = "regular", | |
early_stopping = "aic", | |
boosting = boostingControl(mstop = 100, nu = 0.1), | |
cores = parallel::detectCores()-1) | |
formulas[["bounceR-full"]] = mrmr@opt_formula | |
formulas[["bounceR-stability"]] = ks.miR.formula(as.character(mrmr@stability[1:prefer_no_features,] %>% pull('feature'))) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
mrmr <- featureSelection(data = train_smoted, | |
target = "Class", | |
max_time = "15 mins", | |
selection = selectionControl(n_rounds = NULL, | |
n_mods = NULL, | |
p = prefer_no_features, | |
penalty = 0.5, | |
reward = 0.2), | |
bootstrap = "regular", | |
early_stopping = "aic", | |
boosting = boostingControl(mstop = 100, nu = 0.1), | |
cores = parallel::detectCores()-1) | |
formulas[["bounceR-full_SMOTE"]] = mrmr@opt_formula | |
formulas[["bounceR-stability_SMOTE"]] = ks.miR.formula(as.character(mrmr@stability[1:prefer_no_features,] %>% pull('feature'))) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
mrmr <- featureSelection(data = train_sig, | |
target = "Class", | |
max_time = "15 mins", | |
selection = selectionControl(n_rounds = NULL, | |
n_mods = NULL, | |
p = prefer_no_features, | |
penalty = 0.5, | |
reward = 0.2), | |
bootstrap = "regular", | |
early_stopping = "aic", | |
boosting = boostingControl(mstop = 100, nu = 0.1), | |
cores = parallel::detectCores()-1) | |
formulas[["bounceR-full_SIG"]] = mrmr@opt_formula | |
formulas[["bounceR-stability_SMOTE"]] = ks.miR.formula(as.character(mrmr@stability[1:prefer_no_features,] %>% pull('feature'))) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
mrmr <- featureSelection(data = train_sig_smoted, | |
target = "Class", | |
max_time = "15 mins", | |
selection = selectionControl(n_rounds = NULL, | |
n_mods = NULL, | |
p = prefer_no_features, | |
penalty = 0.5, | |
reward = 0.2), | |
bootstrap = "regular", | |
early_stopping = "aic", | |
boosting = boostingControl(mstop = 100, nu = 0.1), | |
cores = parallel::detectCores()-1) | |
formulas[["bounceR-full_SIGSMOTE"]] = mrmr@opt_formula | |
formulas[["bounceR-stability_SIGSMOTE"]] = ks.miR.formula(as.character(mrmr@stability[1:prefer_no_features,] %>% pull('feature'))) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
# 10. RFE RandomForest | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
cat("\n\nStarting RFE RF") | |
ctrl <- rfeControl(functions =rfFuncs, | |
method = "cv", number = 10, | |
saveDetails = TRUE, | |
allowParallel = TRUE, | |
returnResamp = "all", | |
verbose = T) | |
rfProfile <- rfe(trainx, train$Class, | |
sizes = 3:11, | |
rfeControl = ctrl) | |
plot(rfProfile, type=c("g", "o")) | |
formulas[["RandomForestRFE"]] = ks.miR.formula(predictors(rfProfile)) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
rfProfile <- rfe(trainx_smoted, train_smoted$Class, | |
sizes = 3:11, | |
rfeControl = ctrl) | |
plot(rfProfile, type=c("g", "o")) | |
formulas[["RandomForestRFESMOTE"]] = ks.miR.formula(predictors(rfProfile)) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
rfProfile <- rfe(trainx_sig, train_sig$Class, | |
sizes = 3:11, | |
rfeControl = ctrl) | |
plot(rfProfile, type=c("g", "o")) | |
formulas[["RandomForestRFE_sig"]] = ks.miR.formula(predictors(rfProfile)) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
rfProfile <- rfe(trainx_sig_smoted, train_sig_smoted$Class, | |
sizes = 3:11, | |
rfeControl = ctrl) | |
plot(rfProfile, type=c("g", "o")) | |
formulas[["RandomForestRFESMOTE_sig"]] = ks.miR.formula(predictors(rfProfile)) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
# 11. Genetic | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
cat("\n\nStarting Genetic") | |
ga_ctrl <- gafsControl(functions = rfGA, method = "repeatedcv", number=10, repeats=5, allowParallel=T) | |
rf_ga <- gafs(x = trainx, y = train$Class, | |
iters = max_iterations, | |
gafsControl = ga_ctrl) | |
plot(rf_ga) + theme_bw() | |
print(rf_ga) | |
formulas[["GeneticAlgorithmRF"]] = ks.miR.formula(rf_ga$ga$final) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
ga_ctrl <- gafsControl(functions = rfGA, method = "repeatedcv", number=10, repeats=5, allowParallel=T) | |
rf_ga <- gafs(x = trainx_smoted, y = train_smoted$Class, | |
iters = max_iterations, | |
gafsControl = ga_ctrl) | |
plot(rf_ga) + theme_bw() | |
print(rf_ga) | |
formulas[["GeneticAlgorithmRFSMOTE"]] = ks.miR.formula(rf_ga$ga$final) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
ga_ctrl <- gafsControl(functions = rfGA, method = "repeatedcv", number=10, repeats=5, allowParallel=T) | |
rf_ga <- gafs(x = trainx_sig, y = train_sig$Class, | |
iters = max_iterations, | |
gafsControl = ga_ctrl) | |
plot(rf_ga) + theme_bw() | |
print(rf_ga) | |
formulas[["GeneticAlgorithmRF_sig"]] = ks.miR.formula(rf_ga$ga$final) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
ga_ctrl <- gafsControl(functions = rfGA, method = "repeatedcv", number=10, repeats=5, allowParallel=T) | |
rf_ga <- gafs(x = trainx_sig_smoted, y = train_sig_smoted$Class, | |
iters = max_iterations, | |
gafsControl = ga_ctrl) | |
plot(rf_ga) + theme_bw() | |
print(rf_ga) | |
formulas[["GeneticAlgorithmRFSMOTE_sig"]] = ks.miR.formula(rf_ga$ga$final) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
# 12. SimulatedAnealing | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
cat("\n\nStarting SimulatedAnealing") | |
sa_ctrl <- safsControl(functions = rfSA, | |
method = "repeatedcv", | |
number=5, repeats=10, allowParallel=T, | |
improve = 50) | |
rf_sa <- safs(x = trainx, y = train$Class, | |
iters = max_iterations, | |
safsControl = sa_ctrl) | |
print(rf_sa) | |
plot(rf_sa) + theme_bw() | |
formulas[["SimulatedAnnealingRF"]] = ks.miR.formula(rf_sa$sa$final) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
rf_sa <- safs(x = trainx_smoted, y = train_smoted$Class, | |
iters = max_iterations, | |
safsControl = sa_ctrl) | |
print(rf_sa) | |
plot(rf_sa) + theme_bw() | |
formulas[["SimulatedAnnealingRFSMOTE"]] = ks.miR.formula(rf_sa$sa$final) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
rf_sa <- safs(x = trainx_sig, y = train_sig$Class, | |
iters = max_iterations, | |
safsControl = sa_ctrl) | |
print(rf_sa) | |
plot(rf_sa) + theme_bw() | |
formulas[["SimulatedAnnealingRF_sig"]] = ks.miR.formula(rf_sa$sa$final) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
rf_sa <- safs(x = trainx_sig_smoted, y = train_sig_smoted$Class, | |
iters = max_iterations, | |
safsControl = sa_ctrl) | |
print(rf_sa) | |
plot(rf_sa) + theme_bw() | |
formulas[["SimulatedAnnealingRFSMOTE_sig"]] = ks.miR.formula(rf_sa$sa$final) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
# 14. Boruta (https://www.jstatsoft.org/article/view/v036i11) | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
library(Boruta) | |
cat("\n\nStarting Boruta") | |
bor = Boruta(trainx, train$Class) | |
formulas[["Boruta"]] = ks.miR.formula(names(bor$finalDecision)[as.character(bor$finalDecision) == "Confirmed"]) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
bor = Boruta(trainx_smoted, train_smoted$Class) | |
formulas[["BorutaSMOTE"]] = ks.miR.formula(names(bor$finalDecision)[as.character(bor$finalDecision) == "Confirmed"]) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
# 15. spFSR - feature selection and ranking by simultaneous perturbation stochastic approximation | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
cat("\n\nStarting spFSR") | |
library(spFSR) | |
knnWrapper <- makeLearner("classif.knn", k = 5) | |
classifTask <- makeClassifTask(data = train, target = "Class") | |
perf.measure <- acc | |
spsaMod <- spFeatureSelection( | |
task = classifTask, | |
wrapper = knnWrapper, | |
measure = perf.measure , | |
num.features.selected = prefer_no_features, | |
iters.max = max_iterations, | |
num.cores = detectCores() - 1) | |
formulas[["spFSR"]] = ks.miR.formula(spsaMod$features) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
classifTask <- makeClassifTask(data = train_smoted, target = "Class") | |
perf.measure <- acc | |
spsaMod <- spFeatureSelection( | |
task = classifTask, | |
wrapper = knnWrapper, | |
measure = perf.measure , | |
num.features.selected = prefer_no_features, | |
iters.max = max_iterations, | |
num.cores = detectCores() - 2) | |
formulas[["spFSRSMOTE"]] = ks.miR.formula(spsaMod$features) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
# varSelRF | |
cat("\n\nStarting varSelRF") | |
library(varSelRF) | |
var.sel <- varSelRF(trainx, train$Class, ntree = 500, ntreeIterat = max_iterations, vars.drop.frac = 0.05, whole.range = T, keep.forest = T) | |
formulas[["varSelRF"]] = ks.miR.formula(var.sel$selected.vars) | |
var.sel <- varSelRF(trainx_smoted, train_smoted$Class, ntree = 500, ntreeIterat = max_iterations, vars.drop.frac = 0.05, whole.range = T, keep.forest = T) | |
formulas[["varSelRFSMOTE"]] = ks.miR.formula(var.sel$selected.vars) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1; if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); start_time <- Sys.time(); | |
# 13. WxNet (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6642261/) | |
cat("\nStarting WxNet") | |
library(plyr) | |
library(dplyr) | |
library(reticulate) | |
library(tidyverse) | |
library(data.table) | |
library(DMwR) | |
# Set the path to the Python executale file | |
#use_python("/anaconda3/bin/python", required = T) | |
conda_list() | |
use_condaenv("tensorflow", required = T) | |
py_config() | |
dane = ks.wczytajmix(replace_smote = F); train = dane[[1]]; test = dane[[2]]; valid = dane[[3]]; train_smoted = dane[[4]]; trainx = dane[[5]]; trainx_smoted = dane[[6]] | |
#train_org = train | |
#train = SMOTE(Class ~ ., data = train_org, perc.over = 10000,perc.under=100) | |
# Przygotowanie | |
trainx = dplyr::select(train, starts_with("hsa")) | |
testx = dplyr::select(test, starts_with("hsa")) | |
trainx = t(trainx) | |
testx = t(testx) | |
ids = paste0("train",1:ncol(trainx)) | |
ids2 = paste0("test",1:ncol(testx)) | |
colnames(trainx) = gsub("-", "", ids) | |
colnames(testx) = gsub("-", "", ids2) | |
traindata = cbind(data.frame(fnames = rownames(trainx), trainx)) | |
fwrite(traindata, paste0(code_path, "wx/DearWXpub/src/train-data.csv"), row.names = F, quote = F) | |
testdata = cbind(data.frame(fnames = rownames(testx), testx)) | |
fwrite(testdata, paste0(code_path, "wx/DearWXpub/src/test-data.csv"), row.names = F, quote = F) | |
trainanno = data.frame(id = colnames(trainx), label = train$Class) | |
fwrite(trainanno, paste0(code_path, "wx/DearWXpub/src/train-anno.csv"), row.names = F, quote = F) | |
testanno = data.frame(id = colnames(testx), label = test$Class) | |
fwrite(testanno, paste0(code_path, "wx/DearWXpub/src/test-anno.csv"), row.names = F, quote = F) | |
# Wywolanie WX | |
out <- tryCatch( | |
{ | |
setwd(paste0(code_path,"wx/DearWXpub/src/")) | |
system(paste0(conda_path," activate tensorflow")) | |
py_run_file("wx_konsta.py", local = T) | |
np <- import("numpy") | |
w = np$load("wyniki.npy",allow_pickle=T) | |
formulas[["Wx"]] = ks.miR.formula(w) | |
}, | |
error=function(cond) { | |
message("ERROR:") | |
message(cond) | |
# Choose a return value in case of error | |
}, | |
warning=function(cond) { | |
message("WARNING:") | |
message(cond) | |
}, | |
finally={ | |
setwd(wd) | |
} | |
) | |
# Wx with SMOTE | |
dane = ks.wczytajmix(replace_smote = T); train = dane[[1]]; test = dane[[2]]; valid = dane[[3]]; train_smoted = dane[[4]]; trainx = dane[[5]]; trainx_smoted = dane[[6]] | |
# Przygotowanie | |
trainx = dplyr::select(train, starts_with("hsa")) | |
testx = dplyr::select(test, starts_with("hsa")) | |
trainx = t(trainx) | |
testx = t(testx) | |
ids = paste0("train",1:ncol(trainx)) | |
ids2 = paste0("test",1:ncol(testx)) | |
colnames(trainx) = gsub("-", "", ids) | |
colnames(testx) = gsub("-", "", ids2) | |
traindata = cbind(data.frame(fnames = rownames(trainx), trainx)) | |
fwrite(traindata, paste0(code_path, "wx/DearWXpub/src/train-data.csv"), row.names = F, quote = F) | |
testdata = cbind(data.frame(fnames = rownames(testx), testx)) | |
fwrite(testdata, paste0(code_path, "wx/DearWXpub/src/test-data.csv"), row.names = F, quote = F) | |
trainanno = data.frame(id = colnames(trainx), label = train$Class) | |
fwrite(trainanno, paste0(code_path, "wx/DearWXpub/src/train-anno.csv"), row.names = F, quote = F) | |
testanno = data.frame(id = colnames(testx), label = test$Class) | |
fwrite(testanno, paste0(code_path, "wx/DearWXpub/src/test-anno.csv"), row.names = F, quote = F) | |
# Wywolanie WX | |
# setwd(paste0(code_path,"wx/DearWXpub/src/")) | |
# py_run_file("wx_konsta.py", local = T) | |
# | |
# np <- import("numpy") | |
# w = np$load("wyniki.npy",allow_pickle=T) | |
# print(w) | |
# setwd(wd) | |
# formulas[["WxSMOTE"]] = ks.miR.formula(w) | |
out <- tryCatch( | |
{ | |
setwd(paste0(code_path,"wx/DearWXpub/src/")) | |
py_run_file("wx_konsta.py", local = T) | |
np <- import("numpy") | |
w = np$load("wyniki.npy",allow_pickle=T) | |
formulas[["WxSMOTE"]] = ks.miR.formula(w) | |
}, | |
error=function(cond) { | |
message("ERROR:") | |
message(cond) | |
# Choose a return value in case of error | |
}, | |
warning=function(cond) { | |
message("WARNING:") | |
message(cond) | |
}, | |
finally={ | |
setwd(wd) | |
} | |
) | |
dane = ks.wczytajmix(replace_smote = F); train = dane[[1]]; test = dane[[2]]; valid = dane[[3]]; train_smoted = dane[[4]]; trainx = dane[[5]]; trainx_smoted = dane[[6]] | |
# Przygotowanie | |
trainx = dplyr::select(train, starts_with("hsa")) | |
testx = dplyr::select(test, starts_with("hsa")) | |
trainx = t(trainx) | |
testx = t(testx) | |
ids = paste0("train",1:ncol(trainx)) | |
ids2 = paste0("test",1:ncol(testx)) | |
colnames(trainx) = gsub("-", "", ids) | |
colnames(testx) = gsub("-", "", ids2) | |
traindata = cbind(data.frame(fnames = rownames(trainx), trainx)) | |
fwrite(traindata, paste0(code_path, "wx/DearWXpub/src/train-data.csv"), row.names = F, quote = F) | |
testdata = cbind(data.frame(fnames = rownames(testx), testx)) | |
fwrite(testdata, paste0(code_path, "wx/DearWXpub/src/test-data.csv"), row.names = F, quote = F) | |
trainanno = data.frame(id = colnames(trainx), label = train$Class) | |
fwrite(trainanno, paste0(code_path, "wx/DearWXpub/src/train-anno.csv"), row.names = F, quote = F) | |
testanno = data.frame(id = colnames(testx), label = test$Class) | |
fwrite(testanno, paste0(code_path, "wx/DearWXpub/src/test-anno.csv"), row.names = F, quote = F) | |
# Wywolanie WX | |
# setwd(paste0(code_path,"wx/DearWXpub/src/")) | |
# py_run_file("wx_konsta_z.py", local = T) | |
# | |
# np <- import("numpy") | |
# w = np$load("wyniki.npy",allow_pickle=T) | |
# print(w) | |
# setwd(wd) | |
# formulas[["Wx_Zscore"]] = ks.miR.formula(w) | |
out <- tryCatch( | |
{ | |
setwd(paste0(code_path,"wx/DearWXpub/src/")) | |
py_run_file("wx_konsta_z.py", local = T) | |
np <- import("numpy") | |
w = np$load("wyniki.npy",allow_pickle=T) | |
formulas[["Wx_Zscore"]] = ks.miR.formula(w) | |
}, | |
error=function(cond) { | |
message("ERROR:") | |
message(cond) | |
# Choose a return value in case of error | |
}, | |
warning=function(cond) { | |
message("WARNING:") | |
message(cond) | |
}, | |
finally={ | |
setwd(wd) | |
} | |
) | |
dane = ks.wczytajmix(replace_smote = T); train = dane[[1]]; test = dane[[2]]; valid = dane[[3]]; train_smoted = dane[[4]]; trainx = dane[[5]]; trainx_smoted = dane[[6]] | |
# Przygotowanie | |
trainx = dplyr::select(train, starts_with("hsa")) | |
testx = dplyr::select(test, starts_with("hsa")) | |
trainx = t(trainx) | |
testx = t(testx) | |
ids = paste0("train",1:ncol(trainx)) | |
ids2 = paste0("test",1:ncol(testx)) | |
colnames(trainx) = gsub("-", "", ids) | |
colnames(testx) = gsub("-", "", ids2) | |
traindata = cbind(data.frame(fnames = rownames(trainx), trainx)) | |
fwrite(traindata, paste0(code_path, "wx/DearWXpub/src/train-data.csv"), row.names = F, quote = F) | |
testdata = cbind(data.frame(fnames = rownames(testx), testx)) | |
fwrite(testdata, paste0(code_path, "wx/DearWXpub/src/test-data.csv"), row.names = F, quote = F) | |
trainanno = data.frame(id = colnames(trainx), label = train$Class) | |
fwrite(trainanno, paste0(code_path, "wx/DearWXpub/src/train-anno.csv"), row.names = F, quote = F) | |
testanno = data.frame(id = colnames(testx), label = test$Class) | |
fwrite(testanno, paste0(code_path, "wx/DearWXpub/src/test-anno.csv"), row.names = F, quote = F) | |
# Wywolanie WX | |
# setwd(paste0(code_path,"wx/DearWXpub/src/")) | |
# py_run_file("wx_konsta_z.py", local = T) | |
# | |
# np <- import("numpy") | |
# w = np$load("wyniki.npy",allow_pickle=T) | |
# print(w) | |
# setwd(wd) | |
# formulas[["Wx_ZscoreSMOTE"]] = ks.miR.formula(w) | |
out <- tryCatch( | |
{ | |
setwd(paste0(code_path,"wx/DearWXpub/src/")) | |
py_run_file("wx_konsta_z.py", local = T) | |
np <- import("numpy") | |
w = np$load("wyniki.npy",allow_pickle=T) | |
formulas[["Wx_ZscoreSMOTE"]] = ks.miR.formula(w) | |
}, | |
error=function(cond) { | |
message("ERROR:") | |
message(cond) | |
# Choose a return value in case of error | |
}, | |
warning=function(cond) { | |
message("WARNING:") | |
message(cond) | |
}, | |
finally={ | |
setwd(wd) | |
} | |
) | |
end_time <- Sys.time(); saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")); saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
# My.stepwise | |
n = n + 1 | |
if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); | |
cat("\n\nStarting My.stepwise") | |
start_time <- Sys.time() | |
dane = ks.wczytajmix(replace_smote = F); train = dane[[1]]; test = dane[[2]]; valid = dane[[3]]; train_smoted = dane[[4]]; trainx = dane[[5]]; trainx_smoted = dane[[6]] | |
library(My.stepwise) | |
temp = capture.output(My.stepwise.glm(Y = "Class", colnames(trainx), data = train, sle = 0.05, sls = 0.05, myfamily = "binomial")) | |
# temp2 = temp[length(temp)-1] | |
# temp3 = temp[length(temp)-3] | |
# temp4 = temp[length(temp)-5] | |
temp5 = paste(temp[(length(temp)-12):length(temp)], collapse = " ") | |
wybrane = FALSE | |
for(i in 1:length(colnames(trainx))) | |
{ | |
temp6 = colnames(trainx)[i] | |
wybrane[i] = grepl(temp6, temp5) | |
} | |
formulas[["Mystepwise_glm_binomial"]] = ks.miR.formula(colnames(trainx)[wybrane]) | |
wyniki = ks.miRNA.de(trainx, train$Class) | |
istotne = filter(wyniki, `p-value BH` <= 0.05) %>% arrange(`p-value BH`) | |
temp = capture.output(My.stepwise.glm(Y = "Class", as.character(istotne$miR), data = train, sle = 0.05, sls = 0.05, myfamily = "binomial")) | |
# temp2 = temp[length(temp)-1] | |
# temp3 = temp[length(temp)-3] | |
# temp4 = temp[length(temp)-5] | |
temp5 = paste(temp[(length(temp)-12):length(temp)], collapse = " ") | |
wybrane = FALSE | |
for(i in 1:length(colnames(trainx))) | |
{ | |
temp6 = colnames(trainx)[i] | |
wybrane[i] = grepl(temp6, temp5) | |
} | |
formulas[["Mystepwise_sig_glm_binomial"]] = ks.miR.formula(colnames(trainx)[wybrane]) | |
end_time <- Sys.time() | |
saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")) | |
saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1 | |
if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); | |
cat("\n\nStarting My.stepwise SMOTE") | |
start_time <- Sys.time() | |
dane = ks.wczytajmix(replace_smote = F); train = dane[[1]]; test = dane[[2]]; valid = dane[[3]]; train_smoted = dane[[4]]; trainx = dane[[5]]; trainx_smoted = dane[[6]] | |
library(My.stepwise) | |
temp = capture.output(My.stepwise.glm(Y = "Class", colnames(trainx_smoted), data = train_smoted, sle = 0.05, sls = 0.05, myfamily = "binomial")) | |
# temp2 = temp[length(temp)-1] | |
# temp3 = temp[length(temp)-3] | |
# temp4 = temp[length(temp)-5] | |
temp5 = paste(temp[(length(temp)-12):length(temp)], collapse = " ") | |
wybrane = FALSE | |
for(i in 1:length(colnames(trainx_smoted))) | |
{ | |
temp6 = colnames(trainx_smoted)[i] | |
wybrane[i] = grepl(temp6, temp5) | |
} | |
formulas[["Mystepwise_glm_binomialSMOTE"]] = ks.miR.formula(colnames(trainx_smoted)[wybrane]) | |
wyniki = ks.miRNA.de(trainx, train$Class) | |
istotne = filter(wyniki, `p-value BH` <= 0.05) %>% arrange(`p-value BH`) | |
temp = capture.output(My.stepwise.glm(Y = "Class", as.character(istotne$miR), data = train_smoted, sle = 0.05, sls = 0.05, myfamily = "binomial")) | |
# temp2 = temp[length(temp)-1] | |
# temp3 = temp[length(temp)-3] | |
# temp4 = temp[length(temp)-5] | |
temp5 = paste(temp[(length(temp)-12):length(temp)], collapse = " ") | |
wybrane = FALSE | |
for(i in 1:length(colnames(trainx_smoted))) | |
{ | |
temp6 = colnames(trainx_smoted)[i] | |
wybrane[i] = grepl(temp6, temp5) | |
} | |
formulas[["Mystepwise_sig_glm_binomialSMOTE"]] = ks.miR.formula(colnames(trainx_smoted)[wybrane]) | |
end_time <- Sys.time() | |
saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")) | |
saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1 | |
if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); | |
cat("\n\nStarting stepAIC") | |
start_time <- Sys.time() | |
dane = ks.wczytajmix(replace_smote = F); train = dane[[1]]; test = dane[[2]]; valid = dane[[3]]; train_smoted = dane[[4]]; trainx = dane[[5]]; trainx_smoted = dane[[6]] | |
temp = glm(Class ~ ., data = train, family = "binomial") | |
temp2 = stepAIC(temp) | |
formulas[["stepAIC"]] = temp2$formula | |
wyniki = ks.miRNA.de(trainx, train$Class) | |
istotne = filter(wyniki, `p-value BH` <= 0.05) %>% arrange(`p-value BH`) | |
train.sig = dplyr::select(train, as.character(istotne$miR), Class) | |
temp = glm(Class ~ ., data = train.sig, family = "binomial") | |
temp2 = stepAIC(temp) | |
formulas[["stepAICsig"]] = temp2$formula | |
end_time <- Sys.time() | |
saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")) | |
saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1 | |
if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); | |
cat("\n\nStarting stepAIC SMOTE") | |
start_time <- Sys.time() | |
dane = ks.wczytajmix(replace_smote = F); train = dane[[1]]; test = dane[[2]]; valid = dane[[3]]; train_smoted = dane[[4]]; trainx = dane[[5]]; trainx_smoted = dane[[6]] | |
temp = glm(Class ~ ., data = train_smoted, family = "binomial") | |
temp2 = stepAIC(temp) | |
formulas[["stepAIC_SMOTE"]] = temp2$formula | |
wyniki = ks.miRNA.de(trainx, train$Class) | |
istotne = filter(wyniki, `p-value BH` <= 0.05) %>% arrange(`p-value BH`) | |
train.sig = dplyr::select(train_smoted, as.character(istotne$miR), Class) | |
temp = glm(Class ~ ., data = train.sig, family = "binomial") | |
temp2 = stepAIC(temp) | |
formulas[["stepAICsig_SMOTE"]] = temp2$formula | |
end_time <- Sys.time() | |
saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")) | |
saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1 | |
if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); | |
cat("\n\nStarting MK method (iterated RFE)") | |
start_time <- Sys.time() | |
dane = ks.wczytajmix(replace_smote = F); train = dane[[1]]; test = dane[[2]]; valid = dane[[3]]; train_smoted = dane[[4]]; trainx = dane[[5]]; trainx_smoted = dane[[6]] | |
selectedMirsCV <- mk.iteratedRFE(trainSet = train, useCV = T, classLab = 'Class', checkNFeatures = prefer_no_features)$topFeaturesPerN[[prefer_no_features]] | |
selectedMirsTest <- mk.iteratedRFE(trainSet = train, testSet = test, classLab = 'Class', checkNFeatures = prefer_no_features)$topFeaturesPerN[[prefer_no_features]] | |
formulas[["iteratedRFECV"]] = ks.miR.formula(selectedMirsCV$topFeaturesPerN[[prefer_no_features]]) | |
formulas[["iteratedRFETest"]] = ks.miR.formula(selectedMirsTest$topFeaturesPerN[[prefer_no_features]]) | |
end_time <- Sys.time() | |
saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")) | |
saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
n = n + 1 | |
if (n %in% m) { cat(paste0("\n\nMatched method ", n, " with those requested.. Starting..")); | |
cat("\n\nStarting MK method (iterated RFE) with SMOTE") | |
start_time <- Sys.time() | |
dane = ks.wczytajmix(replace_smote = F); train = dane[[1]]; test = dane[[2]]; valid = dane[[3]]; train_smoted = dane[[4]]; trainx = dane[[5]]; trainx_smoted = dane[[6]] | |
selectedMirsCV <- mk.iteratedRFE(trainSet = train_smoted, useCV = T, classLab = 'Class', checkNFeatures = prefer_no_features)$topFeaturesPerN[[prefer_no_features]] | |
selectedMirsTest <- mk.iteratedRFE(trainSet = train_smoted, testSet = test, classLab = 'Class', checkNFeatures = prefer_no_features)$topFeaturesPerN[[prefer_no_features]] | |
formulas[["iteratedRFECV"]] = ks.miR.formula(selectedMirsCV$topFeaturesPerN[[prefer_no_features]]) | |
formulas[["iteratedRFETest"]] = ks.miR.formula(selectedMirsTest$topFeaturesPerN[[prefer_no_features]]) | |
end_time <- Sys.time() | |
saveRDS(end_time - start_time, paste0("temp/",n,"-",run_id,".RDS")) | |
saveRDS(formulas, paste0("temp/formulas",run_id,"-",n,".RDS")) | |
save(list = ls(), file = paste0("temp/all",run_id,".rdata")); print(formulas) | |
} | |
# End | |
cat("\n\nEnding...") | |
stopCluster(cl) | |
saveRDS(formulas, paste0("temp/featureselection_formulas",stamp,".RDS")) | |
saveRDS(formulas, paste0("featureselection_formulas.RDS")) | |
nazwa = paste0("temp/featureselection_data",stamp,".rda") | |
save(list = ls(), file = nazwa) | |
dev.off() | |
sink() | |
print(formulas) | |
setwd(oldwd) | |
return(formulas) | |
} | |
mk.iteratedRFE <- function(trainSet, testSet = NULL, initFeatures = colnames(trainSet), classLab, checkNFeatures = 25, votingIterations = 100000, useCV = F, nfolds = 10, initRandomState = 42 ) { | |
set.seed(initRandomState) | |
#prepare output data structures | |
resAcc <- c(rep(0, checkNFeatures)) | |
resVotes <- data.frame(matrix(0, nrow = length(initFeatures), ncol = checkNFeatures), row.names = initFeatures) | |
for(i in 1:checkNFeatures) colnames(resVotes)[i] <- toString(i) | |
resTop <- list() | |
for (i in 1:votingIterations) { | |
if(useCV == F) { | |
params <- rfeControl(functions = rfFuncs, saveDetails = T) | |
iter <- rfeIter(x = trainSet[, initFeatures], y = as.factor(trainSet[, classLab]), testX = testSet[, initFeatures], testY = as.factor(testSet[, classLab]), sizes = 1:checkNFeatures, | |
metric = "Accuracy", rfeControl = params) | |
for(j in 1:checkNFeatures) { | |
tmp <- iter$pred[iter$pred$Variables == j, ] | |
acc <- length(which(tmp$pred == tmp$obs)) / nrow(tmp) #calculate and add accuracy | |
resAcc[j] <- resAcc[j] + acc | |
selected <- iter$finalVariables[[j+1]]$var | |
numb <- iter$finalVariables[[j+1 ]]$Variables[1] | |
resVotes[selected, numb] <- resVotes[selected, numb] + 1 | |
} | |
} | |
else { | |
seeds <- vector(mode = "list", length = nfolds + 1) # add random seeds for cross validation | |
for(i in 1:nfolds) seeds[[i]] <- sample.int(1000000000, checkNFeatures + 1) | |
seeds[nfolds + 1] <- sample.int(1000000000, 1) | |
params <- rfeControl(functions = rfFuncs, number = nfolds, saveDetails = T) | |
iter <- rfe(x = trainSet[, initFeatures], y = as.factor(trainSet[, classLab]), sizes = 1:checkNFeatures, rfeControl = params) | |
for(j in 1:checkNFeatures) { | |
tmp <- iter$variables[iter$variables$Variables == j, ] | |
for(k in tmp$var) resVotes[k, j] <- resVotes[k, j] + 1 # increase a voting score for each fold | |
resAcc[j] <- resAcc[j] + iter$results[iter$results$Variables == j, "Accuracy"] | |
} | |
} | |
} | |
resAcc <- resAcc / votingIterations #make average accuracy | |
for(i in 1:ncol(resVotes)) resTop[[i]] <- rownames(resVotes[order(-resVotes[, i])[1:i], ]) | |
returning <- list(data.frame(resAcc), resVotes, resTop) | |
names(returning) <- c("accuracyPerNFeatures", "votesPerN", "topFeaturesPerN") | |
return(returning) | |
} | |
ks.benchmark = function(wd = getwd(), search_iters = 2000, keras_epochs = 5000, keras_threads = floor(parallel::detectCores()/2), search_iters_mxnet = 5000, | |
cores = detectCores()-1, input_formulas = readRDS("featureselection_formulas_final.RDS"), | |
output_file = "benchmark.csv", mxnet = F, gpu = F, | |
#algorithms = c("nnet","svmRadial", "svmLinear","rf","C5.0","mlp", "mlpML","xgbTree"), | |
algorithms = c("mlpKerasDropout", "mlpKerasDecay", "mlpML","xgbTree","svmRadial", "svmLinear","rf","C5.0"), | |
holdout = T, stamp = as.character(as.numeric(Sys.time()))) { | |
library(plyr) | |
library(dplyr) | |
library(edgeR) | |
library(epiDisplay) | |
library(rsq) | |
library(MASS) | |
library(Biocomb) | |
library(caret) | |
library(dplyr) | |
library(epiDisplay) | |
library(pROC) | |
library(ggplot2) | |
library(DMwR) | |
library(stringr) | |
library(psych) | |
library(C50) | |
library(randomForest) | |
library(nnet) | |
library(reticulate) | |
library(stargazer) | |
if(!dir.exists("temp")) { dir.create("temp") } | |
use_condaenv("tensorflow") | |
zz <- file(paste0("temp/benchmark",stamp,".log"), open = "wt") | |
pdf(paste0("temp/benchmark",stamp,".pdf")) | |
sink(zz) | |
sink(zz, type = "message") | |
library(doParallel) | |
oldwd = getwd() | |
setwd(wd) | |
formulas = input_formulas | |
wybrane = list() | |
ile_miRNA = numeric() | |
for (i in 1:length(formulas)) { | |
wybrane[[i]] = all.vars(as.formula(formulas[[i]]))[-1] | |
ile_miRNA[i] = length(all.vars(as.formula(formulas[[i]])))-1 | |
} | |
hist(ile_miRNA, breaks = 150) | |
psych::describe(ile_miRNA) | |
# Wywalamy formuły z więcej niż 15 miRNA | |
#formulas_old = formulas | |
#formulas = formulas[which(ile_miRNA <= 15)] | |
#print(formulas) | |
dane = ks.wczytajmix(wd = wd, replace_smote = F); train = dane[[1]]; test = dane[[2]]; valid = dane[[3]]; train_smoted = dane[[4]]; trainx = dane[[5]]; trainx_smoted = dane[[6]] | |
if(!dir.exists("models")) { dir.create("models") } | |
wyniki = data.frame(method = names(formulas)) | |
wyniki$SMOTE = ifelse(grepl("SMOTE", names(formulas)),"Yes","No") | |
if(mxnet == T) { | |
library(mxnet) | |
#gpu = system("nvidia-smi", intern = T) | |
if (gpu == T) { | |
mxctx = mx.gpu() | |
print("MXNET is using GPU :)") | |
} else { | |
mxctx = mx.cpu() | |
print("MXNET is using CPU :(") | |
} | |
algorytmy = c("glm", "mxnetAdam","mxnet", algorithms) | |
} else { | |
algorytmy = c("glm",algorithms) | |
} | |
for (ii in 1:length(algorytmy)) { | |
algorytm = algorytmy[ii] | |
print(algorytm) | |
if(grepl("Keras",algorytm)) { | |
cl <- makePSOCKcluster(keras_threads) | |
registerDoParallel(cl) | |
} else { | |
cl <- makePSOCKcluster(cores-1) | |
registerDoParallel(cl) | |
} | |
for (i in 1:length(formulas)) { | |
print(paste0("Testing formula: ", as.character(formulas[[i]]))) | |
wyniki$miRy[i] = as.character(formulas[[i]]) | |
temptrain = train | |
if (wyniki$SMOTE[i] == "Yes") { temptrain = train_smoted } | |
# Hold-out czy CV? | |
temptrainold = temptrain | |
if(holdout == T) { | |
#fit_on = list(rs1 = 1:nrow(temptrain), rs2 = 1:nrow(temptrain)) | |
#pred_on = list(rs1 = (nrow(temptrain)+1):((nrow(temptrain)+1)+nrow(test)), rs2 = ((nrow(temptrain)+1)+nrow(test)+1):((nrow(temptrain)+1)+nrow(test)+1+nrow(valid))) | |
#temptrain = rbind.fill(temptrain,test,valid) | |
fit_on = list(rs1 = 1:nrow(temptrain)) | |
pred_on = list(rs1 = (nrow(temptrain)+1):((nrow(temptrain))+nrow(test))) | |
temptrain = rbind.fill(temptrain,test) | |
} | |
# wyniki2 = tryCatch({ | |
if(algorytm == "mxnet") { | |
hyperparameters = expand.grid(layer1 = seq(2,14,1), layer2 = c(0,2,4), layer3 = c(0), activation = c('relu', 'sigmoid', 'tanh', 'softrelu'), | |
dropout = c(0,0.05),learning.rate= c(0.00001, 0.01), momentum = c(0, 0.8, 0.99)) | |
train_control <- trainControl(method="cv", repeats=5, number = 10, classProbs = TRUE,verboseIter = TRUE, summaryFunction = twoClassSummary, savePredictions = TRUE) | |
if(holdout == T) { train_control <- trainControl(method="cv", index= fit_on, indexOut = pred_on, indexFinal = fit_on[[1]], verboseIter = TRUE, | |
classProbs = TRUE, summaryFunction = twoClassSummary, savePredictions = TRUE) } | |
model1 = caret::train(as.formula(formulas[[i]]), ctx = mxctx, optimizer = 'sgd', | |
#optimizer_params=(('learning_rate',0.1),('lr_scheduler',lr_sch)), | |
#preProc = c("center", "scale"), | |
#epoch.end.callback = mx.callback.early.stop(5,10,NULL, maximize=TRUE, verbose=TRUE), | |
#eval.data = list(data=dplyr::select(test, starts_with("hsa")),label=dplyr::select(test, Class)), | |
epoch.end.callback=mx.callback.early.stop(30, 30), | |
#preProc = c("center", "scale"), | |
num.round = search_iters_mxnet, data=temptrain, trControl=train_control, method=algorytm, tuneGrid = hyperparameters) | |
print(model1$finalModel) | |
} else if(algorytm == "mxnetAdam") { | |
hyperparameters = expand.grid(layer1 = seq(2,14,1), layer2 = c(0,2,4), layer3 = c(0), activation = c('relu', 'sigmoid', 'tanh', 'softrelu'), | |
dropout = c(0,0.05), beta1=0.9, beta2=0.999, learningrate= c(0.00001, 0.001, 0.01)) | |
train_control <- trainControl(method="cv", repeats=5, number = 10, classProbs = TRUE,verboseIter = TRUE, summaryFunction = twoClassSummary, savePredictions = TRUE) | |
if(holdout == T) { train_control <- trainControl(method="cv", index= fit_on, indexOut = pred_on, indexFinal = fit_on[[1]], verboseIter = TRUE, | |
classProbs = TRUE, summaryFunction = twoClassSummary, savePredictions = TRUE) } | |
model1 = caret::train(as.formula(formulas[[i]]), ctx = mxctx, | |
#optimizer_params=(('learning_rate',0.1),('lr_scheduler',lr_sch)), | |
#preProc = c("center", "scale"), | |
#epoch.end.callback = mx.callback.early.stop(5,10,NULL, maximize=TRUE, verbose=TRUE), | |
#eval.data = list(data=dplyr::select(test, starts_with("hsa")),label=dplyr::select(test, Class)), | |
epoch.end.callback=mx.callback.early.stop(30, 30), | |
#preProc = c("center", "scale"), | |
num.round = search_iters_mxnet, data=temptrain, trControl=train_control, method=algorytm, tuneGrid = hyperparameters) | |
print(model1$finalModel) | |
} else if(grepl("Keras",algorytm)) { | |
train_control <- trainControl(method="cv", number = 5, search="random", classProbs = TRUE, verboseIter = TRUE, | |
summaryFunction = twoClassSummary, savePredictions = TRUE, indexFinal = fit_on[[1]]) | |
#if(holdout == T) { train_control <- trainControl(method="cv", index= fit_on, indexOut = pred_on, indexFinal = fit_on[[1]], verboseIter = TRUE, | |
# classProbs = TRUE, summaryFunction = twoClassSummary, savePredictions = TRUE) } | |
model1 = caret::train(as.formula(formulas[[i]]), data=temptrain, trControl=train_control, method=algorytm, tuneLength = search_iters, | |
epochs = keras_epochs) | |
print(model1$finalModel) | |
} else if (algorytm == "glm") { | |
train_control <- trainControl(method="repeatedcv", repeats=5, number = 10, search="random", classProbs = TRUE, verboseIter = TRUE, | |
summaryFunction = twoClassSummary, savePredictions = TRUE) | |
if(holdout == T) { train_control <- trainControl(method="cv", index= fit_on, indexOut = pred_on, indexFinal = fit_on[[1]], verboseIter = TRUE, | |
classProbs = TRUE, summaryFunction = twoClassSummary, savePredictions = TRUE) } | |
model1 = caret::train(as.formula(formulas[[i]]), data=temptrain, trControl=train_control, method=algorytm, family="binomial") | |
print(model1$finalModel) | |
} else { | |
train_control <- trainControl(method="repeatedcv", repeats=5, number = 10, search="random", classProbs = TRUE, verboseIter = TRUE, | |
summaryFunction = twoClassSummary, savePredictions = TRUE) | |
if(holdout == T) { train_control <- trainControl(method="cv", index= fit_on, indexOut = pred_on, indexFinal = fit_on[[1]], verboseIter = TRUE, | |
classProbs = TRUE, summaryFunction = twoClassSummary, savePredictions = TRUE) } | |
model1 = caret::train(as.formula(formulas[[i]]), data=temptrain, trControl=train_control, method=algorytm, tuneLength = search_iters) | |
print(model1$finalModel) | |
} | |
modelname = as.numeric(Sys.time()) | |
print(paste0("MODELID: ", modelname)) | |
saveRDS(model1, paste0("models/",modelname,".RDS")) | |
wyniki[i,paste0(algorytm,"_modelname")] = modelname | |
t1_roc = roc(temptrainold$Class ~ predict(model1, newdata = temptrainold , type = "prob")[,2]) | |
wyniki[i,paste0(algorytm,"_train_ROCAUC")] = t1_roc$auc | |
wyniki[i,paste0(algorytm,"_train_ROCAUC_lower95CI")] = as.character(ci(t1_roc))[1] | |
wyniki[i,paste0(algorytm,"_train_ROCAUC_upper95CI")] = as.character(ci(t1_roc))[3] | |
t1 = caret::confusionMatrix(predict(model1, newdata = temptrainold), as.factor(temptrainold$Class), positive = "Cancer") | |
wyniki[i,paste0(algorytm,"_train_Accuracy")] = t1$overall["Accuracy"] | |
wyniki[i,paste0(algorytm,"_train_Sensitivity")] = t1$byClass["Sensitivity"] | |
wyniki[i,paste0(algorytm,"_train_Specificity")] = t1$byClass["Specificity"] | |
v1 = caret::confusionMatrix(predict(model1, newdata = test), as.factor(test$Class), positive = "Cancer") | |
wyniki[i,paste0(algorytm,"_test_Accuracy")] = v1$overall["Accuracy"] | |
wyniki[i,paste0(algorytm,"_test_Sensitivity")] = v1$byClass["Sensitivity"] | |
wyniki[i,paste0(algorytm,"_test_Specificity")] = v1$byClass["Specificity"] | |
v1 = caret::confusionMatrix(predict(model1, newdata = valid), as.factor(valid$Class), positive = "Cancer") | |
wyniki[i,paste0(algorytm,"_valid_Accuracy")] = v1$overall["Accuracy"] | |
wyniki[i,paste0(algorytm,"_valid_Sensitivity")]= v1$byClass["Sensitivity"] | |
wyniki[i,paste0(algorytm,"_valid_Specificity")] = v1$byClass["Specificity"] | |
# return(wyniki) | |
# } | |
# , warning = function(warning_condition) { | |
# print(paste("WARNING: ",warning_condition)) | |
# return(wyniki) | |
# }, error = function(error_condition) { | |
# print(paste("WARNING: ",error_condition)) | |
# wyniki[i,paste0(algorytm,"_train_Accuracy")] = as.character(error_condition) | |
# wyniki[i,paste0(algorytm,"_train_Sensitivity")] = NA | |
# wyniki[i,paste0(algorytm,"_train_Specificity")] = NA | |
# | |
# wyniki[i,paste0(algorytm,"_test_Accuracy")] = NA | |
# wyniki[i,paste0(algorytm,"_test_Sensitivity")] = NA | |
# wyniki[i,paste0(algorytm,"_test_Specificity")] = NA | |
# | |
# wyniki[i,paste0(algorytm,"_valid_Accuracy")] = NA | |
# wyniki[i,paste0(algorytm,"_valid_Sensitivity")]= NA | |
# wyniki[i,paste0(algorytm,"_valid_Specificity")] = NA | |
# return(wyniki) | |
# }, finally={ | |
# stargazer(wyniki, type = 'html', out = paste0("temp/wyniki",stamp,".html"), summary = F) | |
# }) | |
stargazer(wyniki, type = 'html', out = paste0("temp/wyniki",stamp,".html"), summary = F) | |
write.csv(wyniki,paste0("temp/",output_file)) | |
} | |
stopCluster(cl) | |
} | |
stargazer(wyniki, type = 'html', out = paste0("temp/wyniki",stamp,".html"), summary = F) | |
write.csv(wyniki,output_file) | |
setwd(oldwd) | |
sink(type = "message") | |
sink() | |
dev.off() | |
return(wyniki) | |
} | |
ks.korektaaliasowmiRNA = function(temp, species = "hsa") { | |
library(foreach) | |
library(doParallel) | |
library(dplyr) | |
miRbase_aliasy = fread("ftp://mirbase.org/pub/mirbase/CURRENT/aliases.txt.gz") | |
colnames(miRbase_aliasy) = c("MIMAT","Aliasy") | |
miRbase_aliasy_hsa = miRbase_aliasy %>% filter(str_detect(Aliasy, paste0(species,"*"))) %>% filter(str_detect(MIMAT, "MIMAT*")) | |
#setup parallel backend to use many processors | |
cores=detectCores() | |
cl <- makePSOCKcluster(cores-1) #not to overload your computer | |
registerDoParallel(cl) | |
temp2 = colnames(temp) | |
final <- foreach(i=1:length(temp2), .combine=c) %dopar% { | |
#for(i in 1:length(temp2)) { | |
naz = temp2[i] | |
library(data.table) | |
library(stringr) | |
for (ii in 1:nrow(miRbase_aliasy_hsa)) { | |
temp3 = str_split(as.character(miRbase_aliasy_hsa[ii,2]), ";") | |
temp4 = temp3[[1]] | |
temp4 = temp4[temp4 != ""] | |
if(naz %in% temp4) { naz = temp4[length(temp4)] } | |
} | |
naz | |
} | |
colnames(temp) = final | |
stopCluster(cl) | |
return(temp) | |
} | |
ks.setup = function(install_pkgs = T, create_pyenv = T) { | |
# apt install libcairo2-dev libopencv-dev build-essential git ninja-build ccache libopenblas-dev libglu1-mesa-dev freeglut3-dev mesa-common-dev libtool autossh ocl-icd-opencl-dev libfreetype6-dev libssl-dev default-jdk default-jre libcurl4-openssl-dev libxml2-dev | |
if(install_pkgs) { | |
install.packages("BiocManager") | |
BiocManager::install(c("reticulate","devtools","plyr","dplyr","edgeR","epiDisplay","rsq","MASS","Biocomb","caret","dplyr", | |
"pROC","ggplot2","DMwR", "doParallel", "Boruta", "spFSR", "varSelRF", "stringr", "psych", "C50", "randomForest", | |
"foreach","data.table", "ROSE", "deepnet", "gridExtra", "stargazer","gplots")) | |
devtools::install_github("STATWORX/bounceR") | |
#system("../mxnet/docs/install/install_mxnet_ubuntu_python.sh") | |
} | |
library(reticulate) | |
library(devtools) | |
# paczki nie w CRAN | |
#install.packages("BiocManager") | |
py_config() | |
if(create_pyenv) { | |
gpu = system("nvidia-smi", intern = T) | |
if (length(gpu) > 1) { | |
reticulate::conda_create("tensorflow",c("tensorflow-gpu","keras","mxnet-gpu")) | |
} else { | |
reticulate::conda_create("tensorflow",c("tensorflow","keras","mxnet")) | |
} | |
} | |
use_condaenv("tensorflow") | |
} | |
ks.merge_formulas = function(wd = getwd(), max_miRNAs = 11, add = list()) { | |
oldwd = getwd() | |
setwd(wd) | |
formulas_files = list.files("temp","formulas*", all.files = T, full.names = T) | |
formulas = list() | |
formulas_names = character() | |
for (i in 1:length(formulas_files)) { | |
temp = readRDS(formulas_files[i]) | |
tempn = names(temp) | |
formulas = c(formulas, temp) | |
formulas_names = c(formulas_names, tempn) | |
} | |
temp = data.frame(name = formulas_names, formula = unlist(as.character(formulas)), stringsAsFactors = F) | |
final = temp %>% dplyr::distinct() | |
final$formula | |
formulas = list() | |
all = as.list(final$formula) | |
names(all) = make.names(final$name, unique = T) | |
saveRDS(all, "featureselection_formulas_all.RDS") | |
for(i in 1:nrow(final)){ | |
final$ile_miRNA[i] = length(all.vars(as.formula(final$formula[i])))-1 | |
} | |
for (i in 1:length(add)) { | |
tempdupa = data.frame(name = names(add)[i], formula = paste0("Class ~ ", as.character(ks.miR.formula(add[[i]]))[3]), ile_miRNA = 0, stringsAsFactors = F) | |
final = rbind(tempdupa, final) | |
} | |
finalold = final | |
final = final %>% filter(ile_miRNA <= max_miRNAs) | |
if (("fcsig" %in% final$name) == FALSE) { final = rbind(finalold[which(finalold$name=="fcsig"),], final) | |
} | |
if (("cfs_sig" %in% final$name) == FALSE) { final = rbind(finalold[which(finalold$name=="cfs_sig"),], final) } | |
formulas_final = as.list(final$formula) | |
names(formulas_final) = make.names(final$name, unique = T) | |
setwd(oldwd) | |
saveRDS(formulas_final, "featureselection_formulas_final.RDS") | |
#sink() | |
#dev.off() | |
return(formulas_final) | |
} | |
ks.miRNA_signiture_overlap = function(which_formulas = c("sig","cfs"), benchmark_csv = "benchmark1578929876.21765.csv") | |
{ | |
benchmark = read.csv(benchmark_csv, stringsAsFactors = F) | |
rownames(benchmark) = make.names(benchmark$method, unique = T) | |
wybrane = list() | |
for (i in 1:length(which_formulas)) { | |
ktora_to = match(which_formulas[i], rownames(benchmark)) | |
temp = as.formula(benchmark$miRy[ktora_to]) | |
wybrane[[rownames(benchmark)[ktora_to]]] = all.vars(temp)[-1] | |
} | |
require("gplots") | |
temp = venn(wybrane) | |
temp | |
} | |
ks.get_benchmark = function(benchmark_csv = "benchmark1578929876.21765.csv"){ | |
benchmark = read.csv(benchmark_csv, stringsAsFactors = F) | |
rownames(benchmark) = make.names(benchmark$method, unique = T) | |
benchmark$method = rownames(benchmark) | |
return(benchmark) | |
} | |
ks.get_benchmark_methods = function(benchmark_csv = "benchmark1578929876.21765.csv"){ | |
library(limma) | |
benchmark = read.csv(benchmark_csv, stringsAsFactors = F) | |
rownames(benchmark) = make.names(benchmark$method, unique = T) | |
benchmark$method = rownames(benchmark) | |
temp = dplyr::select(benchmark, ends_with("_valid_Accuracy")) | |
metody = strsplit2(colnames(temp), "_")[,1] | |
return(metody) | |
} | |
ks.best_signiture_proposals = function(benchmark_csv = "benchmark1578929876.21765.csv", without_train = F){ | |
benchmark = read.csv(benchmark_csv, stringsAsFactors = F) | |
rownames(benchmark) = make.names(benchmark$method, unique = T) | |
acc = dplyr::select(benchmark, ends_with("_train_Accuracy"), ends_with("_test_Accuracy"),ends_with("_valid_Accuracy") ) | |
if (without_train == T) { acc = dplyr::select(benchmark, ends_with("_test_Accuracy"),ends_with("_valid_Accuracy")) } | |
acc$metaindex = rowMeans(acc) | |
acc$method = rownames(benchmark) | |
acc$miRy = benchmark$miRy | |
rownames(acc) = make.names(benchmark$method, unique = T) | |
acc = acc %>% arrange(desc(metaindex)) | |
return(acc) | |
} | |
ks.best_signiture_proposals_meta11 = function(benchmark_csv = "benchmark1578929876.21765.csv"){ | |
benchmark = read.csv(benchmark_csv, stringsAsFactors = F) | |
rownames(benchmark) = make.names(benchmark$method, unique = T) | |
temp1 = dplyr::select(benchmark, ends_with("_valid_Sensitivity")) | |
temp2 = dplyr::select(benchmark, ends_with("_valid_Specificity")) | |
#acc = dplyr::select(benchmark, ends_with("_train_Accuracy"), ends_with("_test_Accuracy"),ends_with("_valid_Accuracy") ) | |
#if (without_train == T) { acc = dplyr::select(benchmark, ends_with("_test_Accuracy"),ends_with("_valid_Accuracy")) } | |
temp1$temp = rowMeans(temp1) | |
temp2$temp = rowMeans(temp2) | |
acc = benchmark | |
acc$metaindex = temp1$temp + temp2$temp - 1 | |
acc$method = rownames(benchmark) | |
acc$miRy = benchmark$miRy | |
rownames(acc) = make.names(benchmark$method, unique = T) | |
acc = acc %>% arrange(desc(metaindex)) | |
return(acc) | |
} | |
ks.get_miRNAs_from_benchmark = function(benchmark_csv = "benchmark1578990441.6531.csv", method = "fcsig") | |
{ | |
benchmark = read.csv(benchmark_csv, stringsAsFactors = F) | |
rownames(benchmark) = make.names(benchmark$method, unique = T) | |
return(all.vars(as.formula(benchmark$miRy[which(rownames(benchmark) == method)]))[-1]) | |
} | |
ks.best_signiture_de = function(selected_miRNAs, use_mix = F) | |
{ | |
dane = ks.wczytajmix(replace_smote = F); train = dane[[1]]; test = dane[[2]]; valid = dane[[3]]; train_smoted = dane[[4]]; trainx = dane[[5]]; trainx_smoted = dane[[6]] | |
wyniki = ks.miRNA.de(trainx, train$Class) | |
#write.csv(wyniki, "DE_tylkotrain.csv") | |
if (use_mix ==T) { | |
mix = rbind(train,test,valid) | |
wyniki = ks.miRNA.de(dplyr::select(mix,starts_with("hsa")), mix$Class) } | |
#write.csv(wynikiw, "DE_wszystko.csv") | |
wyniki = wyniki %>% arrange(`p-value BH`) | |
return(wyniki[match(selected_miRNAs, wyniki$miR),c("miR","log2FC","p-value BH")]) | |
} | |
ks.vulcano_plot = function(selected_miRNAs, DE = ks.miRNA.de(), only_label = NULL) | |
{ | |
temp = DE[match(selected_miRNAs, DE$miR),] | |
tlabels = gsub("\\.","-", selected_miRNAs) | |
if (is.null(only_label)) | |
{ | |
} | |
else { | |
tlabels = rep(NA, length(gsub("\\.","-", selected_miRNAs))) | |
tlabels[match(only_label, DE$miR)] = gsub("\\.","-", selected_miRNAs)[match(only_label, DE$miR)] | |
} | |
thres = -log10(0.05) | |
library(ggplot2) | |
library(ggrepel) | |
x_limits <- c(-3, +3) | |
pval = -log10(temp$`p-value BH`) | |
ggplot(data = temp, aes(x = temp$`log2FC`, y = pval, label = tlabels)) + | |
geom_text_repel(arrow = arrow(length = unit(0.01, "npc"), type = "closed", ends = "last"), | |
force = 50) + | |
geom_point(color = 'black') + | |
theme_classic(base_size = 16) + | |
labs(x = "Log2(FC)") + | |
labs(y = "-Log10(adjusted p-value)") + | |
geom_hline(yintercept=thres, linetype="dashed", color = "red") + | |
geom_vline(xintercept = 0, linetype="dotted", | |
color = "blue", size=0.5) + | |
xlim(-3,3) + | |
ylim(0,8) | |
} | |
ks.mydist=function(c) {dist(c,method="euclidian")} | |
ks.myclust=function(c) {hclust(c,method="average")} | |
ks.heatmap = function(x = trainx[,1:10], rlab = data.frame(Batch = dane$Batch, Class = dane$Class), zscore = F) { | |
assigcolor = character() | |
assigcode = character() | |
kolory = rep(palette(),20)[-1] | |
kolor_i = 1 | |
for (i in 1:ncol(rlab)){ | |
o_ile = as.numeric(length(unique(rlab[,i]))) | |
assigcode = c(assigcode, as.character(unique(rlab[,i]))) | |
assigcolor = c(assigcolor, kolory[kolor_i:(kolor_i+o_ile-1)]) | |
#levels(rlab[,i]) = topo.colors(length(unique(rlab[,i]))) | |
levels(rlab[,i]) = kolory[kolor_i:(kolor_i+o_ile-1)] | |
kolor_i = kolor_i+o_ile | |
} | |
assig = data.frame(assigcode, assigcolor) | |
#levels(rlab$Batch) = rainbow(length(unique(dane$Batch))) | |
#levels(rlab$Class) = c("red","green") # red - cancer, green - control | |
x2 = as.matrix(x) | |
colnames(x2) = gsub("\\.","-", colnames(x2)) | |
if(zscore == F) { | |
brks<-ks.diverge_color(x2, centeredOn = median(x2)) | |
# colors = seq(min(x2), max(x2), by = 0.01) | |
# my_palette <- colorRampPalette(c("blue", "white", "red"))(n = length(colors) - 1) | |
rlab = as.matrix(rlab) | |
ks.heatmap.3(x2, hclustfun=ks.myclust, distfun=ks.mydist, | |
RowSideColors=t(rlab), | |
margins = c(10,10), | |
KeyValueName="log10(TPM)", | |
symm=F,symkey=F,symbreaks=T, scale="none", | |
col=as.character(brks[[2]]), | |
breaks=as.numeric(brks[[1]]$brks), | |
#legend = T | |
#,scale="column" | |
) | |
legend("topright", | |
assigcode, fill=assigcolor, horiz=F, bg="transparent", cex=0.5) | |
} else { | |
x3 = x2 | |
for(i in 1:ncol(x2)) { | |
x3[,i] = scale(x2[,i]) | |
} | |
brks<-ks.diverge_color(x3, centeredOn = median(0)) | |
# colors = seq(min(x2), max(x2), by = 0.01) | |
# my_palette <- colorRampPalette(c("blue", "white", "red"))(n = length(colors) - 1) | |
rlab = as.matrix(rlab) | |
ks.heatmap.3(x3, hclustfun=ks.myclust, distfun=ks.mydist, | |
RowSideColors=t(rlab), | |
margins = c(10,10), | |
KeyValueName="Z-score log10(TPM)", | |
symm=F,symkey=F,symbreaks=T, scale="none", | |
col=as.character(brks[[2]]), | |
breaks=as.numeric(brks[[1]]$brks), | |
#legend = T | |
#,scale="column" | |
) | |
legend("topright", | |
assigcode, fill=assigcolor, horiz=F, bg="transparent", cex=0.5) | |
} | |
} | |
ks.diverge_color <- function(data,centeredOn=0){ | |
library(classInt) | |
nHalf=50 | |
Min <- min(data,na.rm=TRUE) | |
Max <- max(data,na.rm=TRUE) | |
Thresh <- centeredOn | |
pal<-colorRampPalette(c("blue", "white", "red"))(n = 11) | |
rc1<-colorRampPalette(colors=c(pal[1],pal[2]),space="Lab")(10) | |
for(i in 2:10){ | |
tmp<-colorRampPalette(colors=c(pal[i],pal[i+1]),space="Lab")(10) | |
rc1<-c(rc1,tmp) | |
} | |
rb1 <- seq(Min, Thresh, length.out=nHalf+1) | |
rb2 <- seq(Thresh, Max, length.out=nHalf+1)[-1] | |
rampbreaks <- c(rb1, rb2) | |
cuts <- classIntervals(data, style="fixed",fixedBreaks=rampbreaks) | |
return(list(cuts,rc1)) | |
} | |
# filter out by default if less than 10 counts in more than 1/2 samples. | |
ks.counts_to_log10tpm = function(danex, metadane = metadane, ids = metadane$ID, filtr = T, | |
filtr_minimalcounts = 10, filtr_howmany = 1/2) { | |
danex = as.matrix(danex) | |
dane_counts = t(danex) | |
colnames(dane_counts) = ids | |
mode(dane_counts) = 'numeric' | |
# Normalizacja do TPM | |
dane3 = DGEList(counts=dane_counts, genes=data.frame(miR = rownames(dane_counts)), samples = metadane) | |
tpm = cpm(dane3, normalized.lib.sizes=F, log = F, prior.count = 0.001) | |
tpm = tpm + 0.001 | |
tpm = log10(tpm) | |
ttpm = t(tpm) | |
saveRDS(dane3,"TPM_DGEList.rds") | |
cat("\nDGEList unfiltered object with TPM was saved as TPM_DGEList.rds.") | |
if (filtr == F) { | |
cat("\nReturned data are log10(TPM).") | |
return(ttpm) | |
} else { | |
zostaw = vector() | |
zostaw[1] = TRUE | |
for (i in 1:nrow(dane3)) | |
{ | |
temp = as.numeric(dane3$counts[i,]) | |
if (sum(temp < filtr_minimalcounts) > (filtr_howmany)*length(temp)) { zostaw[i] = FALSE } else { zostaw[i] = TRUE } | |
} | |
dane4 = dane3[zostaw,] | |
ttpm_pofiltrze = ttpm[,which(zostaw)] | |
cat("\nDGEList filtered object with TPM was saved as TPM_DGEList_filtered.rds.") | |
cat(paste0("\n(After filtering) miRNAs left: ", sum(zostaw), " | filtered out: ", sum(zostaw==FALSE), ".")) | |
saveRDS(dane4,"TPM_DGEList_filtered.rds") | |
cat("\nReturned data are log10(TPM).") | |
return(ttpm_pofiltrze) | |
} | |
} | |
ks.PCA = function(ttpm_pofiltrze, meta) { | |
dane.pca <- prcomp(ttpm_pofiltrze, scale. = TRUE) | |
library(ggbiplot) | |
ggbiplot(dane.pca,var.axes = FALSE,ellipse=TRUE,circle=TRUE, groups=as.factor(meta)) | |
} | |
ks.PCA_3D = function(ttpm_pofiltrze, meta) { | |
dane.pca <- prcomp(ttpm_pofiltrze, scale. = TRUE) | |
library(plotly) | |
pc = as.data.frame(dane.pca$x) | |
pc = cbind(pc, meta) | |
p <- plot_ly(data = pc, x = ~PC1, y = ~PC2, z = ~PC3, color = ~meta) %>% | |
add_markers() %>% | |
layout(scene = list(xaxis = list(title = 'PC1'), | |
yaxis = list(title = 'PC2'), | |
zaxis = list(title = 'PC3'))) | |
p | |
} | |
# tempp must contain Class | |
ks.prepare_split = function(metadane = metadane, ttpm = ttpm_pofiltrze, train_proc = 0.6) | |
{ | |
tempp = cbind(metadane, ttpm) | |
# Podzial - http://topepo.github.io/caret/data-splitting.html#simple-splitting-based-on-the-outcome | |
set.seed(1) | |
mix = rep("unasign", nrow(tempp)) | |
library(caret) | |
train.index <- createDataPartition(tempp$Class, p = train_proc, list = FALSE) | |
mix[train.index] = "train" | |
train = tempp[train.index,] | |
tempp2 = tempp[-train.index,] | |
train.index <- createDataPartition(tempp2$Class, p = .5, list = FALSE) | |
test = tempp2[train.index,] | |
valid = tempp2[-train.index,] | |
write.csv(train, "mixed_train.csv",row.names = F) | |
write.csv(test, "mixed_test.csv",row.names = F) | |
write.csv(valid, "mixed_valid.csv",row.names = F) | |
train$mix = "train" | |
test$mix = "test" | |
valid$mix = "valid" | |
mixed = rbind(train,test,valid) | |
metadane2 = dplyr::select(mixed, -starts_with("hsa")) | |
ttpm2 = dplyr::select(mixed, starts_with("hsa")) | |
mixed2 = cbind(metadane2, ttpm2) | |
write.csv(mixed2, "mixed.csv",row.names = F) | |
cat("\nSaved 3 sets as csv in working directory. Retruned mixed dataset.") | |
return(mixed) | |
} | |
ks.keras_create_model <- function(i, hyperparameters, how_many_features = ncol(x_train_scale)) { | |
# tempmodel <- keras_model_sequential() %>% | |
# { if(hyperparameters[i,10]==T) { layer_dense(. , units = hyperparameters[i,1], kernel_regularizer = regularizer_l2(l = 0.001), | |
# activation = hyperparameters[i,4], input_shape = c(ncol(x_train_scale))) } else | |
# { layer_dense(. , units = hyperparameters[i,1], activation = hyperparameters[i,4], | |
# input_shape = c(ncol(x_train_scale))) } } %>% | |
# { if(hyperparameters[i,7]>0) { layer_dropout(. , rate = hyperparameters[i,7]) } else { . } } %>% | |
# { if(hyperparameters[i,2]>0) { | |
# if(hyperparameters[i,11]==T) { layer_dense(. , units = hyperparameters[i,2], activation = hyperparameters[i,5], | |
# kernel_regularizer = regularizer_l2(l = 0.001)) } else { | |
# layer_dense(units = hyperparameters[i,2], activation = hyperparameters[i,5]) } } } %>% | |
# { if(hyperparameters[i,8]>0) { layer_dropout(rate = hyperparameters[i,8]) } else { . } } %>% | |
# { if(hyperparameters[i,3]>0) { | |
# if(hyperparameters[i,12]==T) { layer_dense(units = hyperparameters[i,3], activation = hyperparameters[i,6], | |
# kernel_regularizer = regularizer_l2(l = 0.001)) } else | |
# {layer_dense(units = hyperparameters[i,3], activation = hyperparameters[i,6])} } else { . } } %>% | |
# { if(hyperparameters[i,9]>0) { layer_dropout(rate = hyperparameters[i,9]) } else { . } } %>% | |
# layer_dense(units = 1, activation = 'sigmoid') | |
library(keras) | |
tempmodel <- keras_model_sequential() | |
if(hyperparameters[i,10]==T) { layer_dense(tempmodel , units = hyperparameters[i,1], kernel_regularizer = regularizer_l2(l = 0.001), | |
activation = hyperparameters[i,4], input_shape = c(how_many_features)) } else | |
{ layer_dense(tempmodel , units = hyperparameters[i,1], activation = hyperparameters[i,4], | |
input_shape = c(how_many_features)) } | |
if(hyperparameters[i,7]>0) { layer_dropout(tempmodel , rate = hyperparameters[i,7]) } | |
if(hyperparameters[i,2]>0) { | |
if(hyperparameters[i,11]==T) { layer_dense(tempmodel , units = hyperparameters[i,2], activation = hyperparameters[i,5], | |
kernel_regularizer = regularizer_l2(l = 0.001)) } else | |
{layer_dense(tempmodel, units = hyperparameters[i,2], activation = hyperparameters[i,5]) } } | |
if(hyperparameters[i,2]>0 & hyperparameters[i,8]>0) { layer_dropout(tempmodel, rate = hyperparameters[i,8]) } | |
if(hyperparameters[i,3]>0) { | |
if(hyperparameters[i,12]==T) { layer_dense(tempmodel, units = hyperparameters[i,3], activation = hyperparameters[i,6], | |
kernel_regularizer = regularizer_l2(l = 0.001)) } else | |
{ layer_dense(tempmodel, units = hyperparameters[i,3], activation = hyperparameters[i,6])} } | |
if(hyperparameters[i,3]>0 & hyperparameters[i,9]>0) { layer_dropout(rate = hyperparameters[i,9]) } | |
layer_dense(tempmodel, units = 2, activation = 'softmax') | |
print(tempmodel) | |
dnn_class_model = compile(tempmodel, optimizer = hyperparameters[i,13], | |
loss = 'binary_crossentropy', | |
metrics = 'accuracy') | |
} | |
# autoencoder ma softmax na deep feature | |
# jesli zmienna autoencoder >0 budujemy autoencoder 5-wartwowy, jesli <0 budujemy autoencoder 3-warstwowy | |
ks.deep_learning = function(selected_miRNAs = ".", wd = getwd(), | |
SMOTE = F, keras_batch_size = 64, clean_temp_files = T, | |
save_threshold_trainacc = 0.8, save_threshold_testacc = 0.7, keras_epochae = 5000, | |
keras_epoch = 1000, keras_patience = 100, | |
hyperparameters = expand.grid(layer1 = seq(3,11, by = 2), layer2 = c(0,seq(3,11, by = 2)), layer3 = c(0,seq(3,11, by = 2)), | |
activation_function_layer1 = c("relu","sigmoid"), activation_function_layer2 = c("relu","sigmoid"), activation_function_layer3 = c("relu","sigmoid"), | |
dropout_layer1 = c(0, 0.1), dropout_layer2 = c(0, 0.1), dropout_layer3 = c(0), | |
layer1_regularizer = c(T,F), layer2_regularizer = c(T,F), layer3_regularizer = c(T,F), | |
optimizer = c("sgd","adam","rmsprop"), autoencoder = c(0,7,-7), balanced = SMOTE, formula = as.character(ks.miR.formula(selected_miRNAs))[3], | |
stringsAsFactors = F), | |
keras_threads = ceiling(parallel::detectCores()/3), start = 1, end = nrow(hyperparameters), output_file = "deeplearning_results.csv") | |
{ | |
codename = sub(pattern = "(.*)\\..*$", replacement = "\\1", basename(output_file)) | |
#options(warn=-1) | |
oldwd = getwd() | |
setwd = setwd(wd) | |
set.seed(1) | |
temp_dir = tempdir() | |
options(bitmapType = 'cairo', device = 'png') | |
library(dplyr) | |
library(keras) | |
library(plyr) | |
library(foreach) | |
library(doParallel) | |
#library(doSNOW) | |
library(data.table) | |
fwrite(hyperparameters, paste0("hyperparameters_",output_file)) | |
dane = ks.wczytajmix(wd = wd, replace_smote = F); train = dane[[1]]; test = dane[[2]]; valid = dane[[3]]; train_smoted = dane[[4]]; trainx = dane[[5]]; trainx_smoted = dane[[6]] | |
if (SMOTE == T) { train = train_smoted } | |
#cores=detectCores() | |
cat(paste0("\nTemp dir: ", temp_dir, "\n")) | |
cat("\nStarting preparing cluster..\n") | |
#cl <- makePSOCKcluster(keras_threads) #not to overload your computer | |
#cl = makeCluster(keras_threads, outfile=paste0("temp/", ceiling(as.numeric(Sys.time())), "deeplearning_cluster.log")) | |
#registerDoParallel(cl) | |
#on.exit(stopCluster(cl)) | |
#cat("\nCluster prepared..\n") | |
#args= names(mget(ls())) | |
#export = export[!export %in% args] | |
# tu musi isc iteracja | |
cat(paste0("\nStarting parallel loop.. There are: ", end-start+1, " hyperparameter sets to be checked.\n")) | |
final <- foreach(i=as.numeric(start):as.numeric(end), .combine=rbind, .verbose=T, .inorder=F | |
,.errorhandling="remove", .export = ls(), .packages = loadedNamespaces() | |
) %do% { | |
library(keras) | |
library(ggplot2) | |
library(dplyr) | |
library(data.table) | |
cat("\nStarting hyperparameters..\n") | |
print(hyperparameters[i,]) | |
source("../ks_miRNAhelpers.R") | |
options(bitmapType = 'cairo', device = 'png') | |
model_id = paste0(format(i, scientific = FALSE), "-", ceiling(as.numeric(Sys.time()))) | |
if(SMOTE == T) { model_id = paste0(format(i, scientific = FALSE), "-SMOTE-", ceiling(as.numeric(Sys.time()))) } | |
tempwyniki = data.frame(model_id=model_id) | |
tempwyniki[1, "model_id"] = model_id | |
if(!dir.exists(paste0(temp_dir,"/models"))) { dir.create(paste0(temp_dir,"/models"))} | |
if(!dir.exists(paste0(temp_dir,"/models/keras",model_id))) { dir.create(paste0(temp_dir,"/models/keras",model_id))} | |
cat(paste0("\nTraining model: ",temp_dir,"/models/keras",model_id,"\n")) | |
# pdf(paste0(temp_dir,"/models/keras",model_id,"/plots.pdf"), paper="a4") | |
#con <- file(paste0(temp_dir,"/models/keras",model_id,"/training.log")) | |
#sink(con, append=TRUE) | |
#sink(con, append=TRUE, type="message") | |
early_stop <- callback_early_stopping(monitor = "val_loss", mode="min", patience = keras_patience) | |
cp_callback <- callback_model_checkpoint( | |
filepath = paste0(temp_dir,"/models/keras",model_id,"/finalmodel.hdf5"), | |
save_best_only = TRUE, save_freq = 5, monitor = "val_loss", | |
verbose = 0 | |
) | |
ae_cp_callback <- callback_model_checkpoint( | |
filepath = paste0(temp_dir,"/models/keras",model_id,"/autoencoderweights.hdf5"), | |
save_best_only = TRUE, save_weights_only = T, save_freq = 5, monitor = "val_loss", | |
verbose = 0 | |
) | |
x_train <- train %>% | |
{ if (selected_miRNAs != ".") { dplyr::select(., selected_miRNAs) } else { dplyr::select(., starts_with("hsa")) } } %>% | |
as.matrix() | |
y_train <- train %>% | |
dplyr::select("Class") %>% | |
as.matrix() | |
y_train[,1] = ifelse(y_train[,1] == "Cancer",1,0) | |
x_test <- test %>% | |
{ if (selected_miRNAs != ".") { dplyr::select(.,selected_miRNAs) } else { dplyr::select(.,starts_with("hsa")) } } %>% | |
as.matrix() | |
y_test <- test %>% | |
dplyr::select("Class") %>% | |
as.matrix() | |
y_test[,1] = ifelse(y_test[,1] == "Cancer",1,0) | |
x_valid <- valid %>% | |
{ if (selected_miRNAs != ".") { dplyr::select(.,selected_miRNAs) } else { dplyr::select(.,starts_with("hsa")) } } %>% | |
as.matrix() | |
y_valid <- valid %>% | |
dplyr::select("Class") %>% | |
as.matrix() | |
y_valid[,1] = ifelse(y_valid[,1] == "Cancer",1,0) | |
x_train_scale = x_train %>% scale() | |
col_mean_train <- attr(x_train_scale, "scaled:center") | |
col_sd_train <- attr(x_train_scale, "scaled:scale") | |
x_test_scale <- x_test %>% | |
scale(center = col_mean_train, | |
scale = col_sd_train) | |
x_valid_scale <- x_valid %>% | |
scale(center = col_mean_train, | |
scale = col_sd_train) | |
input_layer <- layer_input(shape = c(ncol(x_train_scale))) | |
psych::describe(x_test_scale) | |
# czy autoenkoder? | |
if(hyperparameters[i,14] != 0) { | |
n1 <- hyperparameters[i,14] | |
n3 <- ncol(x_train_scale) | |
if (hyperparameters[i,14]>0) { | |
encoder <- | |
input_layer %>% | |
layer_dense(units = ceiling(n3/2), activation = hyperparameters[i,6]) %>% | |
layer_dense(units = n1, activation = "sigmoid") # dimensions of final encoding layer | |
decoder <- encoder %>% | |
layer_dense(units = ceiling(n3/2), activation = hyperparameters[i,6]) %>% | |
layer_dense(units = n3, hyperparameters[i,6]) # dimension of original variable | |
} | |
else { | |
n1 = -n1 | |
encoder <- | |
input_layer %>% | |
layer_dense(units = n1, activation = "sigmoid") # dimensions of final encoding layer | |
decoder <- encoder %>% | |
layer_dense(units = n3, hyperparameters[i,6]) # dimension of original variable | |
} | |
ae_model <- keras_model(inputs = input_layer, outputs = decoder) | |
ae_model | |
ae_model %>% | |
compile(loss = "mean_absolute_error", | |
optimizer = hyperparameters[i,13], | |
metrics = c("mean_squared_error")) | |
summary(ae_model) | |
#ae_tempmodelfile = tempfile() | |
ae_history <- fit(ae_model, x = x_train_scale, | |
y = x_train_scale, | |
epochs = keras_epoch, batch_size = keras_batch_size, | |
shuffle = T, verbose = 0, | |
view_metrics = FALSE, | |
validation_data = list(x_test_scale, x_test_scale), | |
callbacks = list(ae_cp_callback, early_stop)) | |
#file.copy(ae_tempmodelfile, paste0(temp_dir,"/models/keras",model_id,"/finalmodel.hdf5"), overwrite = T) | |
ae_history_df <- as.data.frame(ae_history) | |
fwrite(ae_history_df, paste0(temp_dir,"/models/keras",model_id,"/autoencoder_history_df.csv.gz")) | |
compare_cx <- data.frame( | |
train_loss = ae_history$metrics$loss, | |
test_loss = ae_history$metrics$val_loss | |
) %>% | |
tibble::rownames_to_column() %>% | |
mutate(rowname = as.integer(rowname)) %>% | |
tidyr::gather(key = "type", value = "value", -rowname) | |
plot1 = ggplot(compare_cx, aes(x = rowname, y = value, color = type)) + | |
geom_line() + | |
xlab("epoch") + | |
ylab("loss") + | |
theme_bw() | |
ggplot2::ggsave(file = paste0(temp_dir,"/models/keras",model_id,"/training_autoencoder.png"), grid.arrange(plot1, nrow =1, top = "Training of autoencoder")) | |
cat("\n- Reloading autoencoder to get weights...\n") | |
encoder_model <- keras_model(inputs = input_layer, outputs = encoder) | |
encoder_model %>% load_model_weights_hdf5(paste0(temp_dir,"/models/keras",model_id,"/autoencoderweights.hdf5"), | |
skip_mismatch = T, | |
by_name = T) | |
cat("\n- Autoencoder saving...\n") | |
save_model_hdf5(encoder_model, paste0(temp_dir,"/models/keras",model_id,"/autoencoder.hdf5")) | |
cat("\n- Creating deep features...\n") | |
ae_x_train_scale <- encoder_model %>% | |
predict(x_train_scale) %>% | |
as.matrix() | |
fwrite(ae_x_train_scale, paste0(temp_dir,"/models/keras",model_id,"/deepfeatures_train.csv")) | |
ae_x_test_scale <- encoder_model %>% | |
predict(x_test_scale) %>% | |
as.matrix() | |
fwrite(ae_x_test_scale, paste0(temp_dir,"/models/keras",model_id,"/deepfeatures_test.csv")) | |
ae_x_valid_scale <- encoder_model %>% | |
predict(x_valid_scale) %>% | |
as.matrix() | |
fwrite(ae_x_valid_scale, paste0(temp_dir,"/models/keras",model_id,"/deepfeatures_valid.csv")) | |
# # podmiana żeby nie edytować kodu | |
x_train_scale = as.matrix(ae_x_train_scale) | |
x_test_scale = as.matrix(ae_x_test_scale) | |
x_valid_scale = as.matrix(ae_x_valid_scale) | |
cat("\n- Training model based on deep features...\n") | |
dnn_class_model <- ks.keras_create_model(i, hyperparameters = hyperparameters, how_many_features = ncol(x_train_scale)) | |
#tempmodelfile = tempfile() | |
history <- fit(dnn_class_model, x = x_train_scale, | |
y = to_categorical(y_train), | |
epochs = 1000, | |
validation_data = list(x_test_scale, to_categorical(y_test)), | |
callbacks = list(cp_callback, early_stop), | |
verbose = 0, | |
view_metrics = FALSE, | |
batch_size = 32, shuffle = T) | |
print(history) | |
#plot(history, col="black") | |
history_df <- as.data.frame(history) | |
fwrite(history_df, paste0(temp_dir,"/models/keras",model_id,"/history_df.csv.gz")) | |
cat("\n- Saving history and plots...\n") | |
compare_cx <- data.frame( | |
train_loss = history$metrics$loss, | |
test_loss = history$metrics$val_loss | |
) %>% | |
tibble::rownames_to_column() %>% | |
mutate(rowname = as.integer(rowname)) %>% | |
tidyr::gather(key = "type", value = "value", -rowname) | |
plot1 = ggplot(compare_cx, aes(x = rowname, y = value, color = type)) + | |
geom_line() + | |
xlab("Epoch") + | |
ylab("Loss") + | |
theme_bw() | |
compare_cx <- data.frame( | |
train_accuracy = history$metrics$accuracy, | |
test_accuracy = history$metrics$val_accuracy | |
) %>% | |
tibble::rownames_to_column() %>% | |
mutate(rowname = as.integer(rowname)) %>% | |
tidyr::gather(key = "type", value = "value", -rowname) | |
plot2 = ggplot(compare_cx, aes(x = rowname, y = value, color = type)) + | |
geom_line() + | |
xlab("Epoch") + | |
ylab("Accuracy") + | |
theme_bw() | |
ggplot2::ggsave(file = paste0(temp_dir,"/models/keras",model_id,"/training.png"), grid.arrange(plot1, plot2, nrow =2, top = "Training of final neural network")) | |
# ewaluacja | |
cat("\n- Saving final model...\n") | |
dnn_class_model = load_model_hdf5(paste0(temp_dir,"/models/keras",model_id,"/finalmodel.hdf5")) | |
y_train_pred <- predict(object = dnn_class_model, x = x_train_scale) | |
y_test_pred <- predict(object = dnn_class_model, x = x_test_scale) | |
y_valid_pred <- predict(object = dnn_class_model, x = x_valid_scale) | |
# wybranie odciecia | |
pred = data.frame(`Class` = train$Class, `Pred` = y_train_pred) | |
library(cutpointr) | |
cutoff = cutpointr(pred, Pred, Class, pos_class = "Cancer", metric = youden) | |
summary(cutoff) | |
ggplot2::ggsave(file = paste0(temp_dir,"/models/keras",model_id,"/cutoff.png"), plot(cutoff)) | |
wybrany_cutoff = cutoff$optimal_cutpoint | |
#wyniki[i, "training_AUC"] = cutoff$AUC | |
tempwyniki[1, "training_AUC"] = cutoff$AUC | |
#wyniki[i, "cutoff"] = wybrany_cutoff | |
tempwyniki[1, "cutoff"] = wybrany_cutoff | |
cat(paste0("\n\n---- TRAINING AUC: ",cutoff$AUC," ----\n\n")) | |
cat(paste0("\n\n---- OPTIMAL CUTOFF: ",wybrany_cutoff," ----\n\n")) | |
cat(paste0("\n\n---- TRAINING PERFORMANCE ----\n\n")) | |
pred$PredClass = ifelse(pred$Pred >= wybrany_cutoff, "Cancer", "Control") | |
pred$PredClass = factor(pred$PredClass, levels = c("Control","Cancer")) | |
cm_train = caret::confusionMatrix(pred$PredClass, pred$Class, positive = "Cancer") | |
print(cm_train) | |
t1_roc = pROC::roc(Class ~ as.numeric(Pred), data=pred) | |
tempwyniki[1, "training_AUC2"] = t1_roc$auc | |
tempwyniki[1, "training_AUC_lower95CI"] = as.character(ci(t1_roc))[1] | |
tempwyniki[1, "training_AUC_upper95CI"] = as.character(ci(t1_roc))[3] | |
saveRDS(t1_roc, paste0(temp_dir,"/models/keras",model_id,"/training_ROC.RDS")) | |
tempwyniki[1, "training_Accuracy"] = cm_train$overall[1] | |
tempwyniki[1, "training_Sensitivity"] = cm_train$byClass[1] | |
tempwyniki[1, "training_Specificity"] = cm_train$byClass[2] | |
tempwyniki[1, "training_PPV"] = cm_train$byClass[3] | |
tempwyniki[1, "training_NPV"] = cm_train$byClass[4] | |
tempwyniki[1, "training_F1"] = cm_train$byClass[7] | |
saveRDS(cm_train, paste0(temp_dir,"/models/keras",model_id,"/cm_train.RDS")) | |
cat(paste0("\n\n---- TESTING PERFORMANCE ----\n\n")) | |
pred = data.frame(`Class` = test$Class, `Pred` = y_test_pred) | |
pred$PredClass = ifelse(pred$Pred >= wybrany_cutoff, "Cancer", "Control") | |
pred$PredClass = factor(pred$PredClass, levels = c("Control","Cancer")) | |
cm_test = caret::confusionMatrix(pred$PredClass, pred$Class, positive = "Cancer") | |
print(cm_test) | |
tempwyniki[1, "test_Accuracy"] = cm_test$overall[1] | |
tempwyniki[1, "test_Sensitivity"] = cm_test$byClass[1] | |
tempwyniki[1, "test_Specificity"] = cm_test$byClass[2] | |
tempwyniki[1, "test_PPV"] = cm_test$byClass[3] | |
tempwyniki[1, "test_NPV"] = cm_test$byClass[4] | |
tempwyniki[1, "test_F1"] = cm_test$byClass[7] | |
saveRDS(cm_test, paste0(temp_dir,"/models/keras",model_id,"/cm_test.RDS")) | |
cat(paste0("\n\n---- VALIDATION PERFORMANCE ----\n\n")) | |
pred = data.frame(`Class` = valid$Class, `Pred` = y_valid_pred) | |
pred$PredClass = ifelse(pred$Pred >= wybrany_cutoff, "Cancer", "Control") | |
pred$PredClass = factor(pred$PredClass, levels = c("Control","Cancer")) | |
cm_valid = caret::confusionMatrix(pred$PredClass, pred$Class, positive = "Cancer") | |
print(cm_valid) | |
tempwyniki[1, "valid_Accuracy"] = cm_test$overall[1] | |
tempwyniki[1, "valid_Sensitivity"] = cm_test$byClass[1] | |
tempwyniki[1, "valid_Specificity"] = cm_test$byClass[2] | |
tempwyniki[1, "valid_PPV"] = cm_test$byClass[3] | |
tempwyniki[1, "valid_NPV"] = cm_test$byClass[4] | |
tempwyniki[1, "valid_F1"] = cm_test$byClass[7] | |
saveRDS(cm_valid, paste0(temp_dir,"/models/keras",model_id,"/cm_valid.RDS")) | |
mix = rbind(train,test,valid) | |
mixx = rbind(x_train_scale, x_test_scale, x_valid_scale) | |
y_mixx_pred <- predict(object = dnn_class_model, x = mixx) | |
mix$Podzial = c(rep("Training",nrow(train)),rep("Test",nrow(test)),rep("Validation",nrow(valid))) | |
mix$Pred = y_mixx_pred | |
mix$PredClass = ifelse(mix$Pred >= wybrany_cutoff, "Cancer", "Control") | |
fwrite(mix, paste0(temp_dir,"/models/keras",model_id,"/data_predictions.csv.gz")) | |
fwrite(cbind(hyperparameters[i,], tempwyniki), paste0(temp_dir,"/models/keras",model_id,"/wyniki.csv")) | |
wagi = get_weights(dnn_class_model) | |
saveRDS(wagi, paste0(temp_dir,"/models/keras",model_id,"/finalmodel_weights.RDS")) | |
save_model_weights_hdf5(dnn_class_model, paste0(temp_dir,"/models/keras",model_id,"/finalmodel_weights.hdf5")) | |
saveRDS(dnn_class_model, paste0(temp_dir,"/models/keras",model_id,"/finalmodel.RDS")) | |
# czy jest sens zapisywac? | |
#sink() | |
#sink(type="message") | |
cat(paste0("\n\n== ",model_id, ": ", tempwyniki[1, "training_Accuracy"], " / ", tempwyniki[1, "test_Accuracy"], " ==> ", tempwyniki[1, "training_Accuracy"]>save_threshold_trainacc & tempwyniki[1, "test_Accuracy"]>save_threshold_testacc)) | |
if(tempwyniki[1, "training_Accuracy"]>save_threshold_trainacc & tempwyniki[1, "test_Accuracy"]>save_threshold_testacc) { | |
# zapisywanie modelu do właściwego katalogu | |
zip(paste0("models/",codename,"_",model_id,".zip"),list.files(paste0(temp_dir,"/models/keras",model_id), full.names = T, recursive = T, include.dirs = T)) | |
# file.copy(list.files(paste0(temp_dir,"/models/keras",model_id), pattern = "_wyniki.csv$", full.names = T, recursive = T, include.dirs = T),paste0("temp/",codename,"_",model_id,"_deeplearningresults.csv")) | |
if (!dir.exists(paste0("temp/",codename,"/"))) { dir.create(paste0("temp/",codename,"/")) } | |
file.copy(list.files(paste0(temp_dir,"/models/keras",model_id), pattern = "_wyniki.csv$", full.names = T, recursive = T, include.dirs = T),paste0("temp/",codename,"/",model_id,"_deeplearningresults.csv")) | |
# dev.off() | |
} | |
} else { | |
x_train <- train %>% | |
{ if (selected_miRNAs != ".") { dplyr::select(.,selected_miRNAs) } else { dplyr::select(.,starts_with("hsa")) } } %>% | |
as.matrix() | |
y_train <- train %>% | |
dplyr::select("Class") %>% | |
as.matrix() | |
y_train[,1] = ifelse(y_train[,1] == "Cancer",1,0) | |
x_test <- test %>% | |
{ if (selected_miRNAs != ".") { dplyr::select(.,selected_miRNAs) } else { dplyr::select(.,starts_with("hsa")) } } %>% | |
as.matrix() | |
y_test <- test %>% | |
dplyr::select("Class") %>% | |
as.matrix() | |
y_test[,1] = ifelse(y_test[,1] == "Cancer",1,0) | |
x_valid <- valid %>% | |
{ if (selected_miRNAs != ".") { dplyr::select(.,selected_miRNAs) } else { dplyr::select(.,starts_with("hsa")) } } %>% | |
as.matrix() | |
y_valid <- valid %>% | |
dplyr::select("Class") %>% | |
as.matrix() | |
y_valid[,1] = ifelse(y_valid[,1] == "Cancer",1,0) | |
x_train_scale = x_train %>% scale() | |
col_mean_train <- attr(x_train_scale, "scaled:center") | |
col_sd_train <- attr(x_train_scale, "scaled:scale") | |
x_test_scale <- x_test %>% | |
scale(center = col_mean_train, | |
scale = col_sd_train) | |
x_valid_scale <- x_valid %>% | |
scale(center = col_mean_train, | |
scale = col_sd_train) | |
dnn_class_model <- ks.keras_create_model(i, hyperparameters = hyperparameters, how_many_features = ncol(x_train_scale)) | |
history <- fit(dnn_class_model, x = x_train_scale, | |
y = to_categorical(y_train), | |
epochs = 1000, | |
validation_data = list(x_test_scale, to_categorical(y_test)), | |
callbacks = list( | |
# callback_model_checkpoint( | |
# filepath = paste0(temp_dir,"/models/keras",model_id,"/finalmodel.hdf5"), | |
# save_best_only = TRUE, save_freq = 5, monitor = "val_loss", | |
# verbose = 0), | |
callback_reduce_lr_on_plateau(monitor = "val_loss", factor = 0.1), | |
callback_model_checkpoint(paste0(temp_dir,"/models/keras",model_id,"/finalmodel.hdf5")), | |
early_stop), | |
verbose = 0, | |
view_metrics = FALSE, | |
batch_size = 32, shuffle = T) | |
print(history) | |
#plot(history, col="black") | |
history_df <- as.data.frame(history) | |
fwrite(history_df, paste0(temp_dir,"/models/keras",model_id,"/history_df.csv.gz")) | |
# pdf(paste0(temp_dir,"/models/keras",model_id,"/plots.pdf")) | |
compare_cx <- data.frame( | |
train_loss = history$metrics$loss, | |
test_loss = history$metrics$val_loss | |
) %>% | |
tibble::rownames_to_column() %>% | |
mutate(rowname = as.integer(rowname)) %>% | |
tidyr::gather(key = "type", value = "value", -rowname) | |
plot1 = ggplot(compare_cx, aes(x = rowname, y = value, color = type)) + | |
geom_line() + | |
xlab("epoch") + | |
ylab("loss") + | |
theme_bw() | |
compare_cx <- data.frame( | |
train_accuracy = history$metrics$accuracy, | |
test_accuracy = history$metrics$val_accuracy | |
) %>% | |
tibble::rownames_to_column() %>% | |
mutate(rowname = as.integer(rowname)) %>% | |
tidyr::gather(key = "type", value = "value", -rowname) | |
plot2 = ggplot(compare_cx, aes(x = rowname, y = value, color = type)) + | |
geom_line() + | |
xlab("epoch") + | |
ylab("loss") + | |
theme_bw() | |
ggplot2::ggsave(file = paste0(temp_dir,"/models/keras",model_id,"/training.png"), grid.arrange(plot1, plot2, nrow =2, top = "Training of final neural network")) | |
# ewaluacja | |
dnn_class_model = load_model_hdf5(paste0(temp_dir,"/models/keras",model_id,"/finalmodel.hdf5")) | |
y_train_pred <- predict(object = dnn_class_model, x = x_train_scale) | |
y_test_pred <- predict(object = dnn_class_model, x = x_test_scale) | |
y_valid_pred <- predict(object = dnn_class_model, x = x_valid_scale) | |
# wybranie odciecia | |
pred = data.frame(`Class` = train$Class, `Pred` = y_train_pred) | |
library(cutpointr) | |
cutoff = cutpointr(pred, Pred, Class, pos_class = "Cancer", metric = youden) | |
print(summary(cutoff)) | |
ggplot2::ggsave(file = paste0(temp_dir,"/models/keras",model_id,"/cutoff.png"), plot(cutoff)) | |
wybrany_cutoff = cutoff$optimal_cutpoint | |
#wyniki[i, "training_AUC"] = cutoff$AUC | |
tempwyniki[1, "training_AUC"] = cutoff$AUC | |
#wyniki[i, "cutoff"] = wybrany_cutoff | |
tempwyniki[1, "cutoff"] = wybrany_cutoff | |
cat(paste0("\n\n---- TRAINING PERFORMANCE ----\n\n")) | |
pred$PredClass = ifelse(pred$Pred >= wybrany_cutoff, "Cancer", "Control") | |
pred$PredClass = factor(pred$PredClass, levels = c("Control","Cancer")) | |
cm_train = caret::confusionMatrix(pred$PredClass, pred$Class, positive = "Cancer") | |
print(cm_train) | |
t1_roc = pROC::roc(Class ~ as.numeric(Pred), data=pred) | |
tempwyniki[1, "training_AUC2"] = t1_roc$auc | |
tempwyniki[1, "training_AUC_lower95CI"] = as.character(ci(t1_roc))[1] | |
tempwyniki[1, "training_AUC_upper95CI"] = as.character(ci(t1_roc))[3] | |
saveRDS(t1_roc, paste0(temp_dir,"/models/keras",model_id,"/training_ROC.RDS")) | |
#ggplot2::ggsave(file = paste0(temp_dir,"/models/keras",model_id,"/training_ROC.png"), grid.arrange(plot(t1_roc), nrow =1, top = "Training ROC curve")) | |
tempwyniki[1, "training_Accuracy"] = cm_train$overall[1] | |
tempwyniki[1, "training_Sensitivity"] = cm_train$byClass[1] | |
tempwyniki[1, "training_Specificity"] = cm_train$byClass[2] | |
tempwyniki[1, "training_PPV"] = cm_train$byClass[3] | |
tempwyniki[1, "training_NPV"] = cm_train$byClass[4] | |
tempwyniki[1, "training_F1"] = cm_train$byClass[7] | |
saveRDS(cm_train, paste0(temp_dir,"/models/keras",model_id,"/cm_train.RDS")) | |
cat(paste0("\n\n---- TESTING PERFORMANCE ----\n\n")) | |
pred = data.frame(`Class` = test$Class, `Pred` = y_test_pred) | |
pred$PredClass = ifelse(pred$Pred >= wybrany_cutoff, "Cancer", "Control") | |
pred$PredClass = factor(pred$PredClass, levels = c("Control","Cancer")) | |
cm_test = caret::confusionMatrix(pred$PredClass, pred$Class, positive = "Cancer") | |
print(cm_test) | |
tempwyniki[1, "test_Accuracy"] = cm_test$overall[1] | |
tempwyniki[1, "test_Sensitivity"] = cm_test$byClass[1] | |
tempwyniki[1, "test_Specificity"] = cm_test$byClass[2] | |
tempwyniki[1, "test_PPV"] = cm_test$byClass[3] | |
tempwyniki[1, "test_NPV"] = cm_test$byClass[4] | |
tempwyniki[1, "test_F1"] = cm_test$byClass[7] | |
saveRDS(cm_test, paste0(temp_dir,"/models/keras",model_id,"/cm_test.RDS")) | |
cat(paste0("\n\n---- VALIDATION PERFORMANCE ----\n\n")) | |
pred = data.frame(`Class` = valid$Class, `Pred` = y_valid_pred) | |
pred$PredClass = ifelse(pred$Pred >= wybrany_cutoff, "Cancer", "Control") | |
pred$PredClass = factor(pred$PredClass, levels = c("Control","Cancer")) | |
cm_valid = caret::confusionMatrix(pred$PredClass, pred$Class, positive = "Cancer") | |
print(cm_valid) | |
tempwyniki[1, "valid_Accuracy"] = cm_test$overall[1] | |
tempwyniki[1, "valid_Sensitivity"] = cm_test$byClass[1] | |
tempwyniki[1, "valid_Specificity"] = cm_test$byClass[2] | |
tempwyniki[1, "valid_PPV"] = cm_test$byClass[3] | |
tempwyniki[1, "valid_NPV"] = cm_test$byClass[4] | |
tempwyniki[1, "valid_F1"] = cm_test$byClass[7] | |
saveRDS(cm_valid, paste0(temp_dir,"/models/keras",model_id,"/cm_valid.RDS")) | |
mix = rbind(train,test,valid) | |
mixx = rbind(x_train_scale, x_test_scale, x_valid_scale) | |
y_mixx_pred <- predict(object = dnn_class_model, x = mixx) | |
mix$Podzial = c(rep("Training",nrow(train)),rep("Test",nrow(test)),rep("Validation",nrow(valid))) | |
mix$Pred = y_mixx_pred | |
mix$PredClass = ifelse(mix$Pred >= wybrany_cutoff, "Cancer", "Control") | |
fwrite(mix, paste0(temp_dir,"/models/keras",model_id,"/data_predictions.csv.gz")) | |
fwrite(cbind(hyperparameters[i,], tempwyniki), paste0(temp_dir,"/models/keras",model_id,"/",model_id,"_wyniki.csv")) | |
wagi = get_weights(dnn_class_model) | |
saveRDS(wagi, paste0(temp_dir,"/models/keras",model_id,"/finalmodel_weights.RDS")) | |
save_model_weights_hdf5(dnn_class_model, paste0(temp_dir,"/models/keras",model_id,"/finalmodel_weights.hdf5")) | |
saveRDS(dnn_class_model, paste0(temp_dir,"/models/keras",model_id,"/finalmodel.RDS")) | |
# czy jest sens zapisywac? | |
cat(paste0("\n\n== ",model_id, ": ", tempwyniki[1, "training_Accuracy"], " / ", tempwyniki[1, "test_Accuracy"], " ==> ", tempwyniki[1, "training_Accuracy"]>save_threshold_trainacc & tempwyniki[1, "test_Accuracy"]>save_threshold_testacc)) | |
if(tempwyniki[1, "training_Accuracy"]>save_threshold_trainacc & tempwyniki[1, "test_Accuracy"]>save_threshold_testacc) { | |
# zapisywanie modelu do właściwego katalogu | |
zip(paste0("models/",codename,"_",model_id,".zip"),list.files(paste0(temp_dir,"/models/keras",model_id), full.names = T, recursive = T, include.dirs = T)) | |
if (!dir.exists(paste0("temp/",codename,"/"))) { dir.create(paste0("temp/",codename,"/")) } | |
file.copy(list.files(paste0(temp_dir,"/models/keras",model_id), pattern = "_wyniki.csv$", full.names = T, recursive = T, include.dirs = T),paste0("temp/",codename,"/",model_id,"_deeplearningresults.csv")) | |
} } | |
#sink() | |
#sink(type="message") | |
# dev.off() | |
tempwyniki2 = cbind(hyperparameters[i,],tempwyniki) | |
tempwyniki2[1,"name"] = paste0(codename,"_", model_id) | |
tempwyniki2[1,"worth_saving"] = (tempwyniki[1, "training_Accuracy"]>save_threshold_trainacc & tempwyniki[1, "test_Accuracy"]>save_threshold_testacc) | |
tempwyniki2 | |
} | |
saveRDS(final, paste0(output_file,".RDS")) | |
cat("\nAll done!! Ending..\n") | |
fwrite(final, output_file) | |
setwd(oldwd) | |
#options(warn=0) | |
# sprzątanie | |
if(clean_temp_files) { | |
unlink(paste0(normalizePath(temp_dir), "/", dir(temp_dir)), recursive = TRUE) } | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
install.packages("BiocManager",repos = "http://cran.r-project.org") | |
gpu = T | |
win = T | |
library(BiocManager) | |
BiocManager::install(c("reticulate","devtools","plyr","dplyr","edgeR","epiDisplay","rsq","MASS","Biocomb","caret","dplyr", | |
"pROC","ggplot2","DMwR", "doParallel", "Boruta", "spFSR", "varSelRF", "stringr", "psych", "C50", "randomForest", | |
"foreach","data.table", "ROSE", "deepnet", "gridExtra", "stargazer","gplots","My.stepwise","snow", | |
"calibrate", "ggrepel", "networkD3", "VennDiagram","RSNNS", "kernlab", "car", "PairedData", | |
"profileR","classInt","kernlab","xgboost", "keras", "tidyverse", "cutpointr","tibble","tidyr", "rpart", "CHAID", "party", "mgcv")) | |
devtools::install_github("STATWORX/bounceR", force = T) | |
#devtools::install_github("rstudio/reticulate") | |
library(keras) | |
if(gpu == F) { install_keras() } else { install_keras(tensorflow = "gpu") } | |
# if(win == T) { | |
# cran <- getOption("repos") | |
# cran["dmlc"] <- "https://s3-us-west-2.amazonaws.com/apache-mxnet/R/CRAN/" | |
# options(repos = cran) | |
# install.packages("mxnet",dependencies = T) | |
# library(mxnet) } | |
# #devtools::install_github("apache/incubator-mxnet/R-package") | |
# #system("../mxnet/docs/install/install_mxnet_ubuntu_python.sh") | |
# | |
# library(reticulate) | |
# library(devtools) | |
# | |
# # paczki nie w CRAN | |
# #install.packages("BiocManager") | |
# | |
# py_config() | |
# | |
# if(create_pyenv) { | |
# gpu = system("nvidia-smi", intern = T) | |
# if (length(gpu) > 1) { | |
# use_condaenv() | |
# system("/home/konrad/anaconda3/bin/conda create -y -n tensorflow -c bioconda -c conda-forge tensorflow-gpu keras mxnet-gpu pandas numpy scikit-learn -c conda-forge xgboost") | |
# #reticulate::conda_create("tensorflow", packages = c("tensorflow-gpu","keras","mxnet-gpu", "pandas", "numpy", "scikit-learn", "-c conda-forge xgboost")) | |
# } else { | |
# reticulate::conda_create("tensorflow",c("tensorflow","keras","mxnet", "pandas","numpy", "scikit-learn", "xgboost")) | |
# } | |
# } | |
# | |
# use_condaenv("tensorflow") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment