# an R script for building a model for predicting artist genre
# load the libraries
library(caret) # for the model training
library(doMC) # for parallel processing
library(getopt) # for script options
# GLOBAL variables (defaults)
CORES = 10
NAME = 'model'
# Read in the arguments
spec = matrix(c(
'help', 'h', 0, 'logical',
'train_data_file', 't', 1, 'character',
'out_dir', 'd', 1, 'character',
'cores', 'c', 2, 'integer',
'name', 'n', 2, 'character'
),byrow=T, ncol=4);
opt=getopt(spec)
# print a help message if needed
if ( !is.null(opt$help) ) {
cat(getopt(spec, usage=T));
q(status=1);
}
# set some defaults if needed
if ( is.null(opt$name) ) { opt$name = NAME }
if ( is.null(opt$cores) ) { opt$cores = CORES }
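# Example invocation (a hedged sketch: the script file name build_model.R and the
# input file name are hypothetical; the flags follow the getopt spec above):
#   Rscript build_model.R -t train_final.csv -d models/ -n nnet_model3 -c 10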
# read in the data
train_data = read.csv(opt$train_data_file, header=T)
# set up the parallel processing
registerDoMC(opt$cores) #use X cores
# set up the training control object
tc = trainControl(method = "repeatedcv", number=10, repeats=10, classProbs=T)
### make the nnet model
# set up the tuning parameter grid
tune.grid = expand.grid(.decay=c(1e-4, 1e-3, 1e-2, 1e-1, .5), .size=(1:5)*5)
# train the model
my.nnet = train(genre ~ ., data=train_data, method="nnet", trControl=tc, MaxNWts=27999, maxit=2000, tuneGrid=tune.grid)
# assign the name
name_var = opt$name
assign(name_var, my.nnet)
# save the model
out_file = paste(opt$name, ".RData", sep="")
save(list=name_var, file=out_file) # save the object whose name is stored in name_var
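# A minimal sketch of reusing the saved model later (new_artist_data is
# hypothetical; the file and object names come from the --name option above):
#   load(out_file)                 # restores the model under the chosen name
#   fitted_model = get(name_var)
#   predict(fitted_model, newdata = new_artist_data)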
### make the multinom model --- NOT USING ANYMORE
# set up the tuning parameter grid
#tune.grid = expand.grid(.decay=c(1e-4, 1e-3, 1e-2, 1e-1, .5))
# train the model
#multinom = train(genre ~ ., data=train_data, method="multinom", trControl=tc, MaxNWts=27000, maxit=2000, tuneGrid=tune.grid)
# save the model
#save(multinom, file="multinom_model3.RData")
### make the avNNet model --- NOT WORKING
# set up the tuning parameter grid
#tune.grid = expand.grid(.decay=c(1e-4, 1e-3, 1e-2, 1e-1, .5), .size=(1:5)*5)
# train the model
#my.nnet = train(genre ~ ., data=train_data, method="avNNet", trControl=tc, MaxNWts=27999, maxit=2000, tuneGrid=tune.grid)
# save the model
#save(my.nnet, file="avNNet_model3.RData")
# an R script for postprocessing the model
# load some libraries
library(ggplot2)
library(ggdendro) # creates a dendrogram
library(caret) # used for findCorrelation function
library(lattice) # for levelplot
library(ade4) # for dist.binary
library(gplots) # for heatmap.2
# set the working directory
setwd("/Users/Scott/for_fun/lyrics/")
#########
# Operations using the model built on Kure
#########
# the final model with expanded training data
load("models/expanded/nnet_model3.RData")
### DESCRIBE THE DATABASE
term_count = read.csv("predict_genre/term_count.tbl",header=T)
ggplot(term_count) +
geom_bar(aes(x=1:length(term),y=rev(sort(log10(count)))),
stat="identity")
### DESCRIBE THE MODEL
# make some variables
predictions = predict(multifit, newdata=multifit$trainingData)
train.outcome = multifit$trainingData$.outcome
model_bool = predictions == train.outcome
model_diag = data.frame(predictions, train.outcome, model_bool)
# distribution of genres in training data
ggplot(model_diag, aes(x=train.outcome)) +
geom_bar() + # count of training artists per genre
theme(axis.text.x = element_text(angle = 55, hjust = 1, size=18),
axis.title.x = element_text(size=20),
axis.title.y = element_text(size=20),
axis.text.y = element_text(size=14)) +
xlab("Genre") +
ylab("Count")
# show how well this model works on the training data (tile fill = count per predicted/observed pair)
ggplot(as.data.frame(table(model_diag$predictions, model_diag$train.outcome)),
aes(Var1, Var2)) +
geom_tile(aes(fill = Freq))
# convert to a matrix for plotting w/o ggplot
term_mat = matrix(0,
nrow = length(levels(model_diag$predictions)),
ncol = length(levels(model_diag$predictions)))
colnames(term_mat) = levels(model_diag$predictions)
rownames(term_mat) = levels(model_diag$predictions)
for (i in 1:nrow(model_diag)) {
# rows are predictions (i.e. Observed)
# cols are from the training set (i.e. Expected)
term_mat[model_diag[i,1], model_diag[i,2]] = term_mat[model_diag[i,1], model_diag[i,2]] + 1
}
heatmap.2(scale(term_mat),
#symm=T,
scale='none',
Rowv=F,
Colv=F,
dendrogram='none',
trace='none',
col=redblue,
xlab = "Observed",
ylab = "Expected",
margins=c(8,8),
density.info="none",
keysize=1)
####
# show how the nnet model works
###
predictions = predict(my.nnet, newdata=my.nnet$trainingData)
train.outcome = my.nnet$trainingData$.outcome
model_bool = predictions == train.outcome
model_diag = data.frame(predictions, train.outcome, model_bool)
# distribution of genres in training data
ggplot(model_diag, aes(x=train.outcome)) +
geom_bar() + # count of training artists per genre
theme(axis.text.x = element_text(angle = 55, hjust = 1, size=18),
axis.title.x = element_text(size=20),
axis.title.y = element_text(size=20),
axis.text.y = element_text(size=14)) +
xlab("Genre") +
ylab("Count")
# show how well this model works using a confusion matrix
# convert to a matrix for plotting w/o ggplot
term_mat = matrix(0,
nrow = length(levels(model_diag$predictions)),
ncol = length(levels(model_diag$predictions)))
colnames(term_mat) = levels(model_diag$predictions)
rownames(term_mat) = levels(model_diag$predictions)
for (i in 1:nrow(model_diag)) {
# rows are predictions (i.e. Observed)
# cols are from the training set (i.e. Expected)
term_mat[model_diag[i,1], model_diag[i,2]] = term_mat[model_diag[i,1], model_diag[i,2]] + 1
}
tmp = log(term_mat)
tmp[is.infinite(tmp)] = 0
cmat = confusionMatrix(my.nnet) # cross-validated confusion matrix; cell values are average percentages across resamples
heatmap.2(cmat$table,
#symm=T,
scale='none',
Rowv=F,
Colv=F,
dendrogram='none',
trace='none',
tracecol='black',
col=redblue,
xlab = "Observed",
ylab = "Expected",
margins=c(8,8),
density.info="none",
keysize=1,
sepcolor="black",
sepwidth=c(0.05, 0.05),
colsep=1:ncol(cmat$table),
rowsep=1:nrow(cmat$table))
# remember to manually save this plot as figures/confusion_matrix.png
##
# genre dendrogram
#
hc = hclust(dist(cmat$table))
ggdendrogram(hc) +
labs(title="Dendrogram Built from Confusion Matrix") +
theme(axis.text.x = element_text(size=16),
axis.text.y = element_text(size=16),
plot.title = element_text(size=20),
axis.ticks.y = element_blank())
ggsave("figures/genre_denro.png")
##
# Model Accuracy figure
##
d = my.nnet$results
d$xnames = paste(d$decay, d$size, sep=", ") # x axis names
limits = aes(ymax=Accuracy + AccuracySD, ymin=Accuracy - AccuracySD) # for error bars
ggplot(d, aes(x=xnames, y=Accuracy)) +
geom_point() +
geom_errorbar(limits) +
theme(axis.text.x = element_text(angle=90, hjust=1)) +
scale_x_discrete(limits=d$xnames) +
xlab("Parameters -- decay, size")
ggsave("figures/model_accuracy.png")
###
# get similarity of terms in heatmap format
# read in the table - this table no longer exists!
terms = read.csv("data/head_100_artist_term.tbl",header=T, row.names=1)
library(vegan) # for vegdist (this was the library missing from the list above)
# graph the dissimilarity using Pearson's phi dissimilarity measure
levelplot(as.matrix(dist.binary(t(terms),9)))
# cluster the rows and columns
row.clus = hclust(vegdist(terms))
col.clus = hclust(vegdist(t(terms)))
# a helper function for calculating AIC (2k + n*log(SSE/n)) for my models
calc_aic = function(model) {
sse = sum(model$residuals^2) # residual sum of squares
n = dim(model$fitted.values)[1] # number of observations
k = length(model$wts) # number of free parameters (assumes an nnet-style fit with a wts vector)
aic = 2*k + n*log(sse/n)
return(aic)
}
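# Example usage (a hedged sketch; finalModel is where caret stores the underlying nnet object):
#   calc_aic(my.nnet$finalModel)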
# an R script for preprocessing the training table for musician genre classification
# load the libraries
library('getopt')
library(gplots) # for heatmap.2
library(ade4) # for dist.binary
library(lattice) # not sure what this is for
library(caret) # used for findCorrelation function
# GLOBAL variables (defaults)
MIN.ARTISTS = 80 # maximum number of artists to keep for each genre
MIN.PCT = .01 # minimum fraction of observations a predictor must appear in to escape the low-variance filter
# read in the arguments
spec = matrix(c(
'help', 'h', 0, "logical",
'train_file', 't', 1, "character",
'top_file', 'f', 1, "character",
'out_file', 'o', 1, "character",
'out_dir', 'd', 1, "character",
'min.artists', 'a', 2, "integer",
'min.pct', 'p', 2, "double"
), byrow=T, ncol=4);
opt = getopt(spec);
# print a help message if needed
if ( !is.null(opt$help) ) {
cat(getopt(spec, usage=T));
q(status=1);
}
# some parameter checks and set defaults if necessary
if ( is.null(opt$min.artists) ) { opt$min.artists = MIN.ARTISTS }
if ( is.null(opt$min.pct) ) { opt$min.pct = MIN.PCT }
# read in the training table
train = read.csv(opt$train_file, header=T)
# read in the top artists table
top_artists = read.csv(opt$top_file, header=T)
#### Normalize the table to min.artists
to.keep.i = c() # an empty vector that will contain ALL the indices to keep
for (g in levels(train$genre.1)) {
sub.i = which(train$genre.1 == g) # all the indices with genre g
if ( length(sub.i) <= opt$min.artists ) {
# keep all the artists in this genre
to.keep.i = append(to.keep.i, sub.i)
}
else {
# make a subset of count min.artists
weights = rep(.5, length(sub.i))
top_artists_i = train$artist[sub.i] %in% top_artists$artist # which artists in this genre are top artists
weights[top_artists_i] = 100 # increase weights of top artists so they are favored in the sample
samp = sample(sub.i, opt$min.artists, replace = F, prob = weights) # sample
to.keep.i = append(to.keep.i, samp) # record the sample
}
}
# get only the sampled indices
train.samp = train[to.keep.i,]
# output the subset for future reference (if needed)
samp.file = paste(opt$out_dir, "/train.samp.txt", sep="")
write.table(train.samp, samp.file, sep="\t", quote=F, row.names=F)
#### Done Normalizing
# save the artists name in a new vector and remove them from the table
artist_names = train.samp$artist
train.samp = train.samp[,-1]
# remove zero and near-zero variance predictors
min = floor(nrow(train.samp) * opt$min.pct) # number needed to be in > min.pct of observations
to_keep = which(apply(train.samp[,-ncol(train.samp)], 2, sum) > min) # predictors to keep
train.samp.nzv = train.samp[,to_keep]
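# A hedged alternative to the count filter above: caret's nearZeroVar() flags
# zero and near-zero variance columns directly, e.g.
#   predictors = train.samp[,-ncol(train.samp)]
#   nzv_cols = nearZeroVar(predictors)
#   train.samp.nzv = if (length(nzv_cols) > 0) predictors[,-nzv_cols] else predictors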
# function for graphing correlation
printSimMat = function(x, col.clus){
heatmap.2(x,
Rowv=as.dendrogram(col.clus),
Colv=as.dendrogram(col.clus),
dendrogram='none',
trace='none',
col=redblue,
xlab="Predictors",
ylab="Predictors",
main="Pearson's Phi Similarity of Predictor Variables",
labRow="",
labCol="",
margins=c(2,2),
density.info="none")
}
# remove predictors that are correlated
# make a similarity plot using the phi dissimilarity
# I changed from Jaccard (1) to phi (9) because the Jaccard version is not symmetric -
# at least not the way it was being computed (which I think was wrong)
range01 <- function(x){(x-min(x))/(max(x)-min(x))}
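# e.g. range01(c(2, 4, 6)) returns 0.0 0.5 1.0 (rescales a vector to the [0, 1] interval)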
pred.dist = dist.binary(t(train.samp.nzv),method = 9)
col.clus = hclust(pred.dist)
sim.mat = 1 - range01(as.matrix(pred.dist))
printSimMat(sim.mat, col.clus)
cor_to_remove = findCorrelation(sim.mat, cutoff=.9)
print(length(cor_to_remove))
if ( length(cor_to_remove) > 0 ) {
train.samp.nzv.cor = train.samp.nzv[,-cor_to_remove]
} else {
train.samp.nzv.cor = train.samp.nzv
}
# this builds the same correlation figure as above but for the
# matrix where correlated coefficients have been removed
pred.dist2 = dist.binary(t(train.samp.nzv.cor),method = 9)
col.clus2 = hclust(pred.dist2)
sim.mat2 = 1 - range01(as.matrix(pred.dist2))
printSimMat(sim.mat2, col.clus2)
###
# Now I have a well-curated dataset from which to build a model
###
# NOTE: the numbers in the comments below refer to the top artists training set
# and NOT the expanded training set. Also, I didn't do the second and third
# models for the expanded training set because in the top artist analysis having
# more predictors made the model more accurate.
# add on the outcomes to the last column of the training data
train.samp.nzv.cor.final = cbind(train.samp.nzv.cor,
"genre" = train.samp[rownames(train.samp.nzv.cor), ncol(train.samp)])
out.file = paste(opt$out_dir, opt$out_file, sep="/")
write.csv(train.samp.nzv.cor.final, file=out.file, quote=F, row.names=F)
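# Example invocation (a hedged sketch: the script file name preprocess_train.R
# and the input file names are hypothetical; flags follow the getopt spec above):
#   Rscript preprocess_train.R -t artist_term_train.tbl -f top_artists.tbl -o train_final.csv -d data -a 80 -p 0.01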