# Arnie97/doubleScottKnott.R

## doubleScottKnott.R
#!/usr/bin/env Rscript

library('tools')
library('RColorBrewer')
library('ScottKnottESD')
suppressMessages(library('gdata'))

# Run the two-level Scott-Knott ESD ranking for one spreadsheet.
#
# The sheet is read with read.xls and transposed; the resulting columns are
# cut into consecutive groups of length(clusterers) columns, one group per
# project. Every group is ranked with sk_esd, and the matrix of per-project
# ranks (one row per project, one column per clusterer) is passed to plotSK.
#
# file:       path of the spreadsheet to read
# clusterers: comma-separated clusterer names, used as column names
# returns:    whatever plotSK returns
doubleSK <- function(file, clusterers) {
        data <- read.xls(file, header=FALSE)
        data <- as.data.frame(t(data))

        # split the projects
        clusterers <- unlist(strsplit(clusterers, ','))
        n.clusterers <- length(clusterers)
        if (n.clusterers < 1) {
                stop('no clusterer names given', call.=FALSE)
        }
        n.projects <- ncol(data) / n.clusterers
        print(paste(file, n.clusterers, 'clusterers in', n.projects, 'projects'))
        if (n.projects != floor(n.projects)) {
                # the trailing partial group was always skipped; say so
                warning(ncol(data), ' columns cannot be split evenly among ',
                        n.clusterers, ' clusterers; ignoring the trailing columns',
                        call.=FALSE)
                n.projects <- floor(n.projects)
        }

        # zero-based project index; seq_len() stays empty for zero projects
        ranks <- lapply(seq_len(n.projects) - 1, function(i) {
                beg <- n.clusterers * i + 1
                end <- n.clusterers * (i + 1)
                sk <- normalizeRank(sk_esd(data[beg:end])$groups)
                # reorder by the number following the first character of
                # each name (presumably V1, V2, ... column names; verify)
                sk[order(as.numeric(substring(names(sk), 2)))]
        })

        # bind all rows at once; the empty matrix keeps the column count
        # when there is no project at all
        empty <- matrix(ncol=n.clusterers, nrow=0)
        projects <- do.call(rbind, c(list(empty), ranks))

        colnames(projects) <- clusterers
        rownames(projects) <- c()
        plotSK(file, projects)
}

# Replace every run of equal adjacent values by the midpoint of its positions.
#
# A maximal run of identical consecutive elements occupying positions
# beg..end is overwritten with (beg + end) / 2. Names are preserved.
#
# rank:    atomic vector of group labels
# returns: numeric vector of the same length; empty input is returned as is
normalizeRank <- function(rank) {
        # an index loop over 1:length(rank) would still run once here
        if (length(rank) == 0) {
                return(rank)
        }

        runs <- rle(as.vector(rank))
        end <- cumsum(runs$lengths)
        beg <- end - runs$lengths + 1

        # rank[] keeps the names while replacing all values
        rank[] <- rep((beg + end) / 2, runs$lengths)
        rank
}

# Rank the clusterers across projects, then export the table and the plots.
#
# Runs sk_esd on the rank matrix, prints and writes a group/mean/std table
# to <file>.csv, and draws the ranking plot to <file>.pdf and <file>.png,
# where <file> is the input path without its extension.
#
# file:     path of the input spreadsheet; only used to derive output names
# projects: matrix of ranks handed to sk_esd
# returns:  the value of the final plot call
plotSK <- function(file, projects) {
        print(projects)
        sk <- sk_esd(projects)

        # tagging the path lets `file + ext` dispatch to `+.FileName`
        file <- file_path_sans_ext(file)
        class(file) <- c('FileName', class(file))

        # print the mean and standard deviation
        # NOTE(review): 'std' is the third column of m.inf minus the mean;
        # confirm that this column really holds mean + sd
        df <- as.data.frame(sk$m.inf)
        values <- cbind(sk$groups, df$mean, df[3] - df$mean)
        colnames(values) <- c('group', 'mean', 'std')
        print(values)
        write.csv(values, file + 'csv')

        # define the color palette: one Dark2 color per group, reversed
        colors <- c(4, 1, 2, 3, 6, 5, 7, 8, 1, 2, 3, 6, 5, 7, 8, 1, 2, 3, 6, 5, 7, 8)
        colors <- brewer.pal(8, 'Dark2')[colors]
        colors <- rev(colors[1:max(sk$groups)])
        draw <- function() plot(sk, main='', title='', xlab='', ylab='Rankings', las=2, col=colors)

        # specify the graph size (in inches)
        width <- 7
        height <- 3.5
        dpi <- 240

        # specify the font size (in points, i.e. 1/72 inches)
        text.size <- 12

        # open a device, draw, and always close it again so the file is
        # flushed and devices do not pile up over many input files
        render <- function(open.device) {
                open.device()
                on.exit(dev.off(), add=TRUE)
                draw()
        }

        # plot in different formats
        render(function() pdf(file + 'pdf', width, height, pointsize=text.size))
        render(function() png(file + 'png', width, height, pointsize=text.size, units='in', res=dpi))
}

# `+` for FileName objects: glue an extension onto the name with a dot,
# so that file + 'csv' reads as "<file>.csv".
`+.FileName` <- function(self, ext) {
        paste0(self, '.', ext)
}

# command line: <clusterer,clusterer,...> <file> [<file> ...]
argv <- commandArgs(trailingOnly=TRUE)
if (length(argv) < 2) {
        stop('usage: doubleScottKnott.R <clusterer,clusterer,...> <file>...', call.=FALSE)
}
clusterers <- argv[1]
# argv[-1] never wraps around, unlike argv[2:length(argv)] which yields
# c(NA, argv[1]) when only one argument is present
files <- argv[-1]
for (file in files) {
        doubleSK(file, clusterers)
}
	#!/usr/bin/env Rscript

	library('tools')
	library('RColorBrewer')
	library('ScottKnottESD')
	suppressMessages(library('gdata'))

	# Run the two-level Scott-Knott ESD ranking for one spreadsheet.
	#
	# The sheet is read with read.xls and transposed; the resulting columns are
	# cut into consecutive groups of length(clusterers) columns, one group per
	# project. Every group is ranked with sk_esd, and the matrix of per-project
	# ranks (one row per project, one column per clusterer) is passed to plotSK.
	#
	# file:       path of the spreadsheet to read
	# clusterers: comma-separated clusterer names, used as column names
	# returns:    whatever plotSK returns
	doubleSK <- function(file, clusterers) {
		data <- read.xls(file, header=FALSE)
		data <- as.data.frame(t(data))

		# split the projects
		clusterers <- unlist(strsplit(clusterers, ','))
		n.clusterers <- length(clusterers)
		if (n.clusterers < 1) {
			stop('no clusterer names given', call.=FALSE)
		}
		n.projects <- ncol(data) / n.clusterers
		print(paste(file, n.clusterers, 'clusterers in', n.projects, 'projects'))
		if (n.projects != floor(n.projects)) {
			# the trailing partial group was always skipped; say so
			warning(ncol(data), ' columns cannot be split evenly among ',
				n.clusterers, ' clusterers; ignoring the trailing columns',
				call.=FALSE)
			n.projects <- floor(n.projects)
		}

		# zero-based project index; seq_len() stays empty for zero projects
		ranks <- lapply(seq_len(n.projects) - 1, function(i) {
			beg <- n.clusterers * i + 1
			end <- n.clusterers * (i + 1)
			sk <- normalizeRank(sk_esd(data[beg:end])$groups)
			# reorder by the number following the first character of
			# each name (presumably V1, V2, ... column names; verify)
			sk[order(as.numeric(substring(names(sk), 2)))]
		})

		# bind all rows at once; the empty matrix keeps the column count
		# when there is no project at all
		empty <- matrix(ncol=n.clusterers, nrow=0)
		projects <- do.call(rbind, c(list(empty), ranks))

		colnames(projects) <- clusterers
		rownames(projects) <- c()
		plotSK(file, projects)
	}

	# Replace every run of equal adjacent values by the midpoint of its positions.
	#
	# A maximal run of identical consecutive elements occupying positions
	# beg..end is overwritten with (beg + end) / 2. Names are preserved.
	#
	# rank:    atomic vector of group labels
	# returns: numeric vector of the same length; empty input is returned as is
	normalizeRank <- function(rank) {
		# an index loop over 1:length(rank) would still run once here
		if (length(rank) == 0) {
			return(rank)
		}

		runs <- rle(as.vector(rank))
		end <- cumsum(runs$lengths)
		beg <- end - runs$lengths + 1

		# rank[] keeps the names while replacing all values
		rank[] <- rep((beg + end) / 2, runs$lengths)
		rank
	}

	# Rank the clusterers across projects, then export the table and the plots.
	#
	# Runs sk_esd on the rank matrix, prints and writes a group/mean/std table
	# to <file>.csv, and draws the ranking plot to <file>.pdf and <file>.png,
	# where <file> is the input path without its extension.
	#
	# file:     path of the input spreadsheet; only used to derive output names
	# projects: matrix of ranks handed to sk_esd
	# returns:  the value of the final plot call
	plotSK <- function(file, projects) {
		print(projects)
		sk <- sk_esd(projects)

		# tagging the path lets `file + ext` dispatch to `+.FileName`
		file <- file_path_sans_ext(file)
		class(file) <- c('FileName', class(file))

		# print the mean and standard deviation
		# NOTE(review): 'std' is the third column of m.inf minus the mean;
		# confirm that this column really holds mean + sd
		df <- as.data.frame(sk$m.inf)
		values <- cbind(sk$groups, df$mean, df[3] - df$mean)
		colnames(values) <- c('group', 'mean', 'std')
		print(values)
		write.csv(values, file + 'csv')

		# define the color palette: one Dark2 color per group, reversed
		colors <- c(4, 1, 2, 3, 6, 5, 7, 8, 1, 2, 3, 6, 5, 7, 8, 1, 2, 3, 6, 5, 7, 8)
		colors <- brewer.pal(8, 'Dark2')[colors]
		colors <- rev(colors[1:max(sk$groups)])
		draw <- function() plot(sk, main='', title='', xlab='', ylab='Rankings', las=2, col=colors)

		# specify the graph size (in inches)
		width <- 7
		height <- 3.5
		dpi <- 240

		# specify the font size (in points, i.e. 1/72 inches)
		text.size <- 12

		# open a device, draw, and always close it again so the file is
		# flushed and devices do not pile up over many input files
		render <- function(open.device) {
			open.device()
			on.exit(dev.off(), add=TRUE)
			draw()
		}

		# plot in different formats
		render(function() pdf(file + 'pdf', width, height, pointsize=text.size))
		render(function() png(file + 'png', width, height, pointsize=text.size, units='in', res=dpi))
	}

	# `+` for FileName objects: glue an extension onto the name with a dot,
	# so that file + 'csv' reads as "<file>.csv".
	`+.FileName` <- function(self, ext) {
		paste0(self, '.', ext)
	}

	# command line: <clusterer,clusterer,...> <file> [<file> ...]
	argv <- commandArgs(trailingOnly=TRUE)
	if (length(argv) < 2) {
		stop('usage: doubleScottKnott.R <clusterer,clusterer,...> <file>...', call.=FALSE)
	}
	clusterers <- argv[1]
	# argv[-1] never wraps around, unlike argv[2:length(argv)] which yields
	# c(NA, argv[1]) when only one argument is present
	files <- argv[-1]
	for (file in files) {
		doubleSK(file, clusterers)
	}