# an R script for building a model for predicting artist genre
# load the libraries
library(caret) # for the model training
library(doMC) # for parallel processing
library(getopt) # for script options
# GLOBAL variables (defaults)
CORES = 10
NAME = 'model'
# Read in the arguments
spec = matrix(c(
'help', 'h', 0, 'logical',
'train_data_file', 't', 1, 'character',
'out_dir', 'd', 1, 'character',
'cores', 'c', 2, 'integer',
'name', 'n', 2, 'character'
),byrow=T, ncol=4);
opt=getopt(spec)
# print a help message if needed
if ( !is.null(opt$help) ) {
cat(getopt(spec, usage=T));
q(status=1);
}
# set some defaults if needed
if ( is.null(opt$name) ) { opt$name = NAME }
if ( is.null(opt$cores) ) { opt$cores = CORES }
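# Example invocation (a hedged sketch: the script file name build_model.R and the
# input file name are hypothetical; the flags follow the getopt spec above):
#   Rscript build_model.R -t train_final.csv -d models/ -n nnet_model3 -c 10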
# read in the data
train_data = read.csv(opt$train_data_file, header=T)
# set up the parallel processing
registerDoMC(opt$cores) #use X cores
# set up the training control object
tc = trainControl(method = "repeatedcv", number=10, repeats=10, classProbs=T)
### make the nnet model
# set up the tuning parameter grid
tune.grid = expand.grid(.decay=c(1e-4, 1e-3, 1e-2, 1e-1, .5), .size=(1:5)*5)
# train the model
my.nnet = train(genre ~ ., data=train_data, method="nnet", trControl=tc, MaxNWts=27999, maxit=2000, tuneGrid=tune.grid)
# assign the name
name_var = opt$name
assign(name_var, my.nnet)
# save the model
out_file = paste(opt$name, ".RData", sep="")
save(list=name_var, file=out_file) # save the object whose name is stored in name_var
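# A minimal sketch of reusing the saved model later (new_artist_data is
# hypothetical; the file and object names come from the --name option above):
#   load(out_file)                 # restores the model under the chosen name
#   fitted_model = get(name_var)
#   predict(fitted_model, newdata = new_artist_data)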
### make the multinom model --- NOT USING ANYMORE
# set up the tuning parameter grid
#tune.grid = expand.grid(.decay=c(1e-4, 1e-3, 1e-2, 1e-1, .5))
# train the model
#multinom = train(genre ~ ., data=train_data, method="multinom", trControl=tc, MaxNWts=27000, maxit=2000, tuneGrid=tune.grid)
# save the model
#save(multinom, file="multinom_model3.RData")
### make the avNNet model --- NOT WORKING
# set up the tuning parameter grid
#tune.grid = expand.grid(.decay=c(1e-4, 1e-3, 1e-2, 1e-1, .5), .size=(1:5)*5)
# train the model
#my.nnet = train(genre ~ ., data=train_data, method="avNNet", trControl=tc, MaxNWts=27999, maxit=2000, tuneGrid=tune.grid)
# save the model
#save(my.nnet, file="avNNet_model3.RData")
# an R script for postprocessing the model
# load some libraries
library(ggplot2)
library(ggdendro) # creates a dendrogram
library(caret) # used for findCorrelation function
library(lattice) # for levelplot
library(ade4) # for dist.binary
library(gplots) # for heatmap.2
# set the working directory
setwd("/Users/Scott/for_fun/lyrics/")
#########
# Operations using the model built on Kure
#########
# the final model with expanded training data
load("models/expanded/nnet_model3.RData")
### DESCRIBE THE DATABASE
term_count = read.csv("predict_genre/term_count.tbl",header=T)
ggplot(term_count) +
geom_bar(aes(x=1:length(term),y=rev(sort(log10(count)))),
stat="identity")
### DESCRIBE THE MODEL
# make some variables
predictions = predict(multifit, newdata=multifit$trainingData)
train.outcome = multifit$trainingData$.outcome
model_bool = predictions == train.outcome
model_diag = data.frame(predictions, train.outcome, model_bool)
# distribution of genres in training data
ggplot(model_diag, aes(x=train.outcome)) +
geom_bar() + # count of training artists per genre
theme(axis.text.x = element_text(angle = 55, hjust = 1, size=18),
axis.title.x = element_text(size=20),
axis.title.y = element_text(size=20),
axis.text.y = element_text(size=14)) +
xlab("Genre") +
ylab("Count")
# show how well this model works on the training data (tile fill = count per predicted/observed pair)
ggplot(as.data.frame(table(model_diag$predictions, model_diag$train.outcome)),
aes(Var1, Var2)) +
geom_tile(aes(fill = Freq))
# convert to a matrix for plotting w/o ggplot
term_mat = matrix(0,
nrow = length(levels(model_diag$predictions)),
ncol = length(levels(model_diag$predictions)))
colnames(term_mat) = levels(model_diag$predictions)
rownames(term_mat) = levels(model_diag$predictions)
for (i in 1:nrow(model_diag)) {
# rows are predictions (i.e. Observed)
# cols are from the training set (i.e. Expected)
term_mat[model_diag[i,1], model_diag[i,2]] = term_mat[model_diag[i,1], model_diag[i,2]] + 1
}
heatmap.2(scale(term_mat),
#symm=T,
scale='none',
Rowv=F,
Colv=F,
dendrogram='none',
trace='none',
col=redblue,
xlab = "Observed",
ylab = "Expected",
margins=c(8,8),
density.info="none",
keysize=1)
####
# show how the nnet model works
###
predictions = predict(my.nnet, newdata=my.nnet$trainingData)
train.outcome = my.nnet$trainingData$.outcome
model_bool = predictions == train.outcome
model_diag = data.frame(predictions, train.outcome, model_bool)
# distribution of genres in training data
ggplot(model_diag, aes(x=train.outcome)) +
geom_bar() + # count of training artists per genre
theme(axis.text.x = element_text(angle = 55, hjust = 1, size=18),
axis.title.x = element_text(size=20),
axis.title.y = element_text(size=20),
axis.text.y = element_text(size=14)) +
xlab("Genre") +
ylab("Count")
# show how well this model works using a confusion matrix
# convert to a matrix for plotting w/o ggplot
term_mat = matrix(0,
nrow = length(levels(model_diag$predictions)),
ncol = length(levels(model_diag$predictions)))
colnames(term_mat) = levels(model_diag$predictions)
rownames(term_mat) = levels(model_diag$predictions)
for (i in 1:nrow(model_diag)) {
# rows are predictions (i.e. Observed)
# cols are from the training set (i.e. Expected)
term_mat[model_diag[i,1], model_diag[i,2]] = term_mat[model_diag[i,1], model_diag[i,2]] + 1
}
tmp = log(term_mat)
tmp[is.infinite(tmp)] = 0
cmat = confusionMatrix(my.nnet) # cross-validated confusion matrix; cell values are average percentages across resamples
heatmap.2(cmat$table,
#symm=T,
scale='none',
Rowv=F,
Colv=F,
dendrogram='none',
trace='none',
tracecol='black',
col=redblue,
xlab = "Observed",
ylab = "Expected",
margins=c(8,8),
density.info="none",
keysize=1,
sepcolor="black",
sepwidth=c(0.05, 0.05),
colsep=1:ncol(cmat$table),
rowsep=1:nrow(cmat$table))
# remember to manually save this plot as figures/confusion_matrix.png
##
# genre dendrogram
#
hc = hclust(dist(cmat$table))
ggdendrogram(hc) +
labs(title="Dendrogram Built from Confusion Matrix") +
theme(axis.text.x = element_text(size=16),
axis.text.y = element_text(size=16),
plot.title = element_text(size=20),
axis.ticks.y = element_blank())
ggsave("figures/genre_denro.png")
##
# Model Accuracy figure
##
d = my.nnet$results
d$xnames = paste(d$decay, d$size, sep=", ") # x axis names
limits = aes(ymax=Accuracy + AccuracySD, ymin=Accuracy - AccuracySD) # for error bars
ggplot(d, aes(x=xnames, y=Accuracy)) +
geom_point() +
geom_errorbar(limits) +
theme(axis.text.x = element_text(angle=90, hjust=1)) +
scale_x_discrete(limits=d$xnames) +
xlab("Parameters -- decay, size")
ggsave("figures/model_accuracy.png")
###
# get similarity of terms in heatmap format
# read in the table - this table no longer exists!
terms = read.csv("data/head_100_artist_term.tbl",header=T, row.names=1)
library(vegan) # for vegdist (this was the library missing from the list above)
# graph the dissimilarity using Pearson's phi dissimilarity measure
levelplot(as.matrix(dist.binary(t(terms),9)))
# cluster the rows and columns
row.clus = hclust(vegdist(terms))
col.clus = hclust(vegdist(t(terms)))
# a helper function for calculating AIC (2k + n*log(SSE/n)) for my models
calc_aic = function(model) {
sse = sum(model$residuals^2) # residual sum of squares
n = dim(model$fitted.values)[1] # number of observations
k = length(model$wts) # number of free parameters (assumes an nnet-style fit with a wts vector)
aic = 2*k + n*log(sse/n)
return(aic)
}
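# Example usage (a hedged sketch; finalModel is where caret stores the underlying nnet object):
#   calc_aic(my.nnet$finalModel)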
# an R script for preprocessing the training table for musician genre classification
# load the libraries
library('getopt')
library(gplots) # for heatmap.2
library(ade4) # for dist.binary
library(lattice) # not sure what this is for
library(caret) # used for findCorrelation function
# GLOBAL variables (defaults)
MIN.ARTISTS = 80 # maximum number of artists to keep for each genre
MIN.PCT = .01 # minimum fraction of observations a predictor must appear in to escape the low-variance filter
# read in the arguments
spec = matrix(c(
'help', 'h', 0, "logical",
'train_file', 't', 1, "character",
'top_file', 'f', 1, "character",
'out_file', 'o', 1, "character",
'out_dir', 'd', 1, "character",
'min.artists', 'a', 2, "integer",
'min.pct', 'p', 2, "double"
), byrow=T, ncol=4);
opt = getopt(spec);
# print a help message if needed
if ( !is.null(opt$help) ) {
cat(getopt(spec, usage=T));
q(status=1);
}
# some parameter checks and set defaults if necessary
if ( is.null(opt$min.artists) ) { opt$min.artists = MIN.ARTISTS }
if ( is.null(opt$min.pct) ) { opt$min.pct = MIN.PCT }
# read in the training table
train = read.csv(opt$train_file, header=T)
# read in the top artists table
top_artists = read.csv(opt$top_file, header=T)
#### Normalize the table to min.artists
to.keep.i = c() # an empty vector that will contain ALL the indices to keep
for (g in levels(train$genre.1)) {
sub.i = which(train$genre.1 == g) # all the indices with genre g
if ( length(sub.i) <= opt$min.artists ) {
# keep all the artists in this genre
to.keep.i = append(to.keep.i, sub.i)
}
else {
# make a subset of count min.artists
weights = rep(.5, length(sub.i))
top_artists_i = train$artist[sub.i] %in% top_artists$artist # which artists in this genre are top artists
weights[top_artists_i] = 100 # increase weights of top artists so they are favored in the sample
samp = sample(sub.i, opt$min.artists, replace = F, prob = weights) # sample
to.keep.i = append(to.keep.i, samp) # record the sample
}
}
# get only the sampled indices
train.samp = train[to.keep.i,]
# output the subset for future reference (if needed)
samp.file = paste(opt$out_dir, "/train.samp.txt", sep="")
write.table(train.samp, samp.file, sep="\t", quote=F, row.names=F)
#### Done Normalizing
# save the artists name in a new vector and remove them from the table
artist_names = train.samp$artist
train.samp = train.samp[,-1]
# remove zero and near-zero variance predictors
min = floor(nrow(train.samp) * opt$min.pct) # number needed to be in > min.pct of observations
to_keep = which(apply(train.samp[,-ncol(train.samp)], 2, sum) > min) # predictors to keep
train.samp.nzv = train.samp[,to_keep]
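# A hedged alternative to the count filter above: caret's nearZeroVar() flags
# zero and near-zero variance columns directly, e.g.
#   predictors = train.samp[,-ncol(train.samp)]
#   nzv_cols = nearZeroVar(predictors)
#   train.samp.nzv = if (length(nzv_cols) > 0) predictors[,-nzv_cols] else predictors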
# function for graphing correlation
printSimMat = function(x, col.clus){
heatmap.2(x,
Rowv=as.dendrogram(col.clus),
Colv=as.dendrogram(col.clus),
dendrogram='none',
trace='none',
col=redblue,
xlab="Predictors",
ylab="Predictors",
main="Pearson's Phi Similarity of Predictor Variables",
labRow="",
labCol="",
margins=c(2,2),
density.info="none")
}
# remove predictors that are correlated
# make a similarity plot using the phi dissimilarity
# I changed from Jaccard (1) to phi (9) because the Jaccard version is not symmetric -
# at least not the way it was being computed (which I think was wrong)
range01 <- function(x){(x-min(x))/(max(x)-min(x))}
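# e.g. range01(c(2, 4, 6)) returns 0.0 0.5 1.0 (rescales a vector to the [0, 1] interval)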
pred.dist = dist.binary(t(train.samp.nzv),method = 9)
col.clus = hclust(pred.dist)
sim.mat = 1 - range01(as.matrix(pred.dist))
printSimMat(sim.mat, col.clus)
cor_to_remove = findCorrelation(sim.mat, cutoff=.9)
print(length(cor_to_remove))
if ( length(cor_to_remove) > 0 ) {
train.samp.nzv.cor = train.samp.nzv[,-cor_to_remove]
} else {
train.samp.nzv.cor = train.samp.nzv
}
# this builds the same correlation figure as above but for the
# matrix where correlated coefficients have been removed
pred.dist2 = dist.binary(t(train.samp.nzv.cor),method = 9)
col.clus2 = hclust(pred.dist2)
sim.mat2 = 1 - range01(as.matrix(pred.dist2))
printSimMat(sim.mat2, col.clus2)
###
# Now I have a well-curated dataset from which to build a model
###
# NOTE: the numbers in the comments below refer to the top artists training set
# and NOT the expanded training set. Also, I didn't do the second and third
# models for the expanded training set because in the top artist analysis having
# more predictors made the model more accurate.
# add on the outcomes to the last column of the training data
train.samp.nzv.cor.final = cbind(train.samp.nzv.cor,
"genre" = train.samp[rownames(train.samp.nzv.cor), ncol(train.samp)])
out.file = paste(opt$out_dir, opt$out_file, sep="/")
write.csv(train.samp.nzv.cor.final, file=out.file, quote=F, row.names=F)
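# Example invocation (a hedged sketch: the script file name preprocess_train.R
# and the input file names are hypothetical; flags follow the getopt spec above):
#   Rscript preprocess_train.R -t artist_term_train.tbl -f top_artists.tbl -o train_final.csv -d data -a 80 -p 0.01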