Created
May 13, 2015 21:21
-
-
Save anonymous/62c9295b2a07fb28dafe to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# an R script for building a model for predicting artist genre | |
# load the libraries | |
library(caret) # for the model training | |
library(doMC) # for parallel processing | |
library(getopt) # for script options | |
# GLOBAL variables (defaults) | |
CORES = 10 | |
NAME = 'model' | |
# Read in the arguments | |
spec = matrix(c( | |
'help', 'h', 0, 'logical', | |
'train_data_file', 't', 1, 'character', | |
'out_dir', 'd', 1, 'character', | |
'cores', 'c', 2, 'integer', | |
'name', 'n', 2, 'character' | |
),byrow=T, ncol=4); | |
opt=getopt(spec) | |
# print a help message if needed | |
if ( !is.null(opt$help) ) { | |
cat(getopt(spec, usage=T)); | |
q(status=1); | |
} | |
# sets some defualts if needed | |
if ( is.null opt$name ) { opt$name = NAME } | |
if ( is.null opt$cores ) { opt$cores = CORES } | |
# read in the data | |
train_data = read.csv(opt$train_data_file, header=T) | |
# set up the parallel processing | |
registerDoMC(opt$cores) #use X cores | |
# set up the training control object | |
tc = trainControl(method = "repeatedcv", number=10, repeats=10, classProbs=T) | |
### make the nnet model | |
# set up the tunning parameter grid | |
tune.grid = expand.grid(.decay=c(1e-4, 1e-3, 1e-2, 1e-1, .5), .size=(1:5)*5) | |
# train the model | |
my.nnet = train(genre ~ ., data=train_data, method="nnet", trControl=tc, MaxNWts=27999, maxit=2000, tuneGrid=tune.grid) | |
# assign the name | |
name_var = opt$name | |
assign(name_var, my.nnet) | |
# save the model | |
out_file = paste(opt$name, ".RData", sep="") | |
save(name_var, file=out_file) | |
### make the multinom model --- NOT USING ANYMORE | |
# set up the tunning parameter grid | |
#tune.grid = expand.grid(.decay=c(1e-4, 1e-3, 1e-2, 1e-1, .5)) | |
# train the model | |
#multinom = train(genre ~ ., data=train_data, method="multinom", trControl=tc, MaxNWts=27000, maxit=2000, tuneGrid=tune.grid) | |
# save the model | |
#save(multinom, file="multinom_model3.RData") | |
### make the avNNet model --- NOT WORKING | |
# set up the tunning parameter grid | |
#tune.grid = expand.grid(.decay=c(1e-4, 1e-3, 1e-2, 1e-1, .5), .size=(1:5)*5) | |
# train the model | |
#my.nnet = train(genre ~ ., data=train_data, method="avNNet", trControl=tc, MaxNWts=27999, maxit=2000, tuneGrid=tune.grid) | |
# save the model | |
#save(my.nnet, file="avNNet_model3.RData") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# an R script for postprocessing the model | |
# load some libraries | |
library(ggplot2) | |
library(ggdendro) # creates a dendrogram | |
library(caret) # used for findCorrelation function | |
library(lattice) # not sure what this is for | |
library(ade4) # for dist.binary | |
library(gplots) # for heatmap.2 | |
# set the working directory | |
setwd("/Users/Scott/for_fun/lyrics/") | |
######### | |
# Operations using the model built on Kure | |
######### | |
# the final model with expaneded training data | |
load("models/expanded/nnet_model3.RData") | |
### DESCRIBE THE DATABASE | |
term_count = read.csv("predict_genre/term_count.tbl",header=T) | |
ggplot(term_count) + | |
geom_bar(aes(x=1:length(term),y=rev(sort(log10(count)))), | |
stat="identity") | |
### DESCRIBE THE MODEL | |
# make some variables | |
predictions = predict(multifit, newdata=multifit$trainingData) | |
train.outcome = multifit$trainingData$.outcome | |
model_bool = predictions == train.outcome | |
model_diag = data.frame(predictions, train.outcome, model_bool) | |
# distribution of genres in training data | |
ggplot(model_diag, aes(x=train.outcome)) + | |
geom_histogram() + | |
theme(axis.text.x = element_text(angle = 55, hjust = 1, size=18), | |
axis.title.x = element_text(size=20), | |
axis.title.y = element_text(size=20), | |
axis.text.y = element_text(size=14)) + | |
xlab("Genre") + | |
ylab("Count") | |
# show how well this model works on the training data | |
ggplot(model_diag, aes(predictions, train.outcome)) + | |
geom_tile(aes(fill = length(which(model_bool*1 == 1)))) | |
# convert to a matrix for ploting w/o ggplot | |
term_mat = matrix(0, | |
nrow = length(levels(model_diag$predictions)), | |
ncol = length(levels(model_diag$predictions))) | |
colnames(term_mat) = levels(model_diag$predictions) | |
rownames(term_mat) = levels(model_diag$predictions) | |
for (i in 1:nrow(model_diag)) { | |
# rows are predictions (i.e. Observed) | |
# cols are from the training set (i.e. Expected) | |
term_mat[model_diag[i,1], model_diag[i,2]] = term_mat[model_diag[i,1], model_diag[i,2]] + 1 | |
} | |
heatmap.2(scale(term_mat), | |
#symm=T, | |
scale='none', | |
Rowv=F, | |
Colv=F, | |
dendrogram='none', | |
trace='none', | |
col=redblue, | |
xlab = "Observed", | |
ylab = "Expected", | |
margins=c(8,8), | |
density.info="none", | |
keysize=1) | |
#### | |
# show how the nnet model works | |
### | |
predictions = predict(my.nnet, newdata=my.nnet$trainingData) | |
train.outcome = my.nnet$trainingData$.outcome | |
model_bool = predictions == train.outcome | |
model_diag = data.frame(predictions, train.outcome, model_bool) | |
# distribution of genres in training data | |
ggplot(model_diag, aes(x=train.outcome)) + | |
geom_histogram() + | |
theme(axis.text.x = element_text(angle = 55, hjust = 1, size=18), | |
axis.title.x = element_text(size=20), | |
axis.title.y = element_text(size=20), | |
axis.text.y = element_text(size=14)) + | |
xlab("Genre") + | |
ylab("Count") | |
# show how well this model works using a confusion matrix | |
# convert to a matrix for ploting w/o ggplot | |
term_mat = matrix(0, | |
nrow = length(levels(model_diag$predictions)), | |
ncol = length(levels(model_diag$predictions))) | |
colnames(term_mat) = levels(model_diag$predictions) | |
rownames(term_mat) = levels(model_diag$predictions) | |
for (i in 1:nrow(model_diag)) { | |
# rows are predictions (i.e. Observed) | |
# cols are from the training set (i.e. Expected) | |
term_mat[model_diag[i,1], model_diag[i,2]] = term_mat[model_diag[i,1], model_diag[i,2]] + 1 | |
} | |
tmp = log(term_mat) | |
tmp[is.infinite(tmp)] = 0 | |
cmat = confusionMatrix(my.nnet) | |
heatmap.2(cmat$table, | |
#symm=T, | |
scale='none', | |
Rowv=F, | |
Colv=F, | |
dendrogram='none', | |
trace='none', | |
tracecol='black', | |
col=redblue, | |
xlab = "Observed", | |
ylab = "Expected", | |
margins=c(8,8), | |
density.info="none", | |
keysize=1, | |
sepcolor="black", | |
sepwidth=c(0.05, 0.05), | |
colsep=1:ncol(cmat$table), | |
rowsep=1:nrow(cmat$table)) | |
# remember to manually save this plot as figures/confusion_matrix.png | |
## | |
# genre dendrogram | |
# | |
hc = hclust(dist(cmat$table)) | |
ggdendrogram(hc) + | |
labs(title="Dendrogram Built from Confusion Matrix") + | |
theme(axis.text.x = element_text(size=16), | |
axis.text.y = element_text(size=16), | |
plot.title = element_text(size=20), | |
axis.ticks.y = element_blank()) | |
ggsave("figures/genre_denro.png") | |
## | |
# Model Accuracy figure | |
## | |
d = my.nnet$results | |
d$xnames = paste(d$decay, d$size, sep=", ") # x axis names | |
limits = aes(ymax=Accuracy + AccuracySD, ymin=Accuracy - AccuracySD) # for error bars | |
ggplot(d, aes(x=xnames, y=Accuracy)) + | |
geom_point() + | |
geom_errorbar(limits) + | |
theme(axis.text.x = element_text(angle=90, hjust=1)) + | |
scale_x_discrete(limits=d$xnames) + | |
xlab("Parameters -- decay, size") | |
ggsave("figures/model_accuracy.png") | |
### | |
# get similarity of terms in heatmap format | |
# read in the table - this table no longer exists! | |
terms = read.csv("data/head_100_artist_term.tbl",header=T, row.names=1) | |
# I migth be missing a library here too - ??? | |
# graph the dissimilarity using the pearson dissimilarity measure | |
levelplot(as.matrix(dist.binary(t(terms),9))) | |
# cluster the rows and columns | |
row.clus = hclust(vegdist(data)) | |
col.clus = hclust(vegdist(t(data))) | |
# a helper script for calculating AIC for my models | |
calc_aic = function(model) { | |
sse = sum(model$residuals^2) | |
n = dim(model$fitted.values)[1] | |
aic = 2*model$nuits + n*log(sse/n) | |
return(aic) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# an R script for preprocessing the training table for musician genre classification | |
# load the libraries | |
library('getopt') | |
library(gplots) # for heatmap.2 | |
library(ade4) # for dist.binary | |
library(lattice) # not sure what this is for | |
library(caret) # used for findCorrelation function | |
# GLOBAL variables (defaults) | |
MIN.ARTISTS = 80 # number of artists to keep for each genre | |
MIN.PCT = .01 # min percent to classify as low varience | |
# read in the arguments | |
spec = matrix(c( | |
'help', 'h', 0, "logical", | |
'train_file', 't', 1, "character", | |
'top_file', 'f', 1, "character", | |
'out_file', 'o', 1, "character", | |
'out_dir', 'd', 1, "character", | |
'min.artists', 'a', 2, "integer", | |
'min.pct', 'p', 2, "integer" | |
), byrow=T, ncol=4); | |
opt = getopt(spec); | |
# print a help message if needed | |
if ( !is.null(opt$help) ) { | |
cat(getopt(spec, usage=T)); | |
q(status=1); | |
} | |
# some parameter checks and set defaults if necessary | |
if ( is.null(opt$min.artists) ) { opt$min.artists = MIN.ARTISTS } | |
if ( is.null(opt$min.pct) ) { opt$min.pct = MIN.PCT } | |
# read in the training table | |
train = read.csv(opt$train_file, header=T) | |
# read in the top artists table | |
top_artists = read.csv(opt$top_file, header=T) | |
#### Noramlize the table to min.artists | |
to.keep.i = c() # an empty vector that will contain ALL the indicies to keep | |
for (g in levels(train$genre.1)) { | |
sub.i = which(train$genre.1 == g) # all the indicies with genre g | |
if ( length(sub.i) <= opt$min.artists ) { | |
# keep all the artists in g.sub | |
to.keep.i = append(to.keep.i, sub.i) | |
} | |
else { | |
# make a subset of count min.artists | |
weights = rep(.5, length(sub.i)) | |
top_artists_i = train$artist %in% top_artists$artist #indicies of top artist | |
weights[top_artists_i] == 100 # increase weights of top artists | |
samp = sample(sub.i, opt$min.artists, replace = F, prob = weights) # sample | |
to.keep.i = append(to.keep.i, samp) # record the sample | |
} | |
} | |
# get only the sampled indicies | |
train.samp = train[to.keep.i,] | |
# output the subset for future reference (if needed) | |
samp.file = paste(opt$out_dir, "/train.samp.txt", sep="") | |
write.table(train.samp, samp.file, sep="\t", quote=F, row.names=F) | |
#### Done Normalizing | |
# save the artists name in a new vector and remove them from the table | |
artist_names = train.samp$artist | |
train.samp = train.samp[,-1] | |
# remove zero and near-zero variance predictors | |
min = floor(nrow(train.samp) * opt$min.pct) # number needed to be in > min.pct of observations | |
to_keep = which(apply(train.samp[,-ncol(train.samp)], 2, sum) > min) # predictors to keep | |
train.samp.nzv = train.samp[,to_keep] | |
# function for graphing correlation | |
printSimMat = function(x, col.clus){ | |
heatmap.2(x, | |
Rowv=as.dendrogram(col.clus), | |
Colv=as.dendrogram(col.clus), | |
dendrogram='none', | |
trace='none', | |
col=redblue, | |
xlab="Predictors", | |
ylab="Predictors", | |
main="Pearson's Phi Similarity of Predictor Variables", | |
labRow="", | |
labCol="", | |
margins=c(2,2), | |
density.info="none") | |
} | |
# remove predictors that are correlated | |
# make a similarity plot using the phi dissimilarity | |
# I changed from jaccard (1) to phi (9) because jaccard is not symmetric - | |
# at least how they were doing it (which I think was wrong) | |
range01 <- function(x){(x-min(x))/(max(x)-min(x))} | |
pred.dist = dist.binary(t(train.samp.nzv),method = 9) | |
col.clus = hclust(pred.dist) | |
sim.mat = 1 - range01(as.matrix(pred.dist)) | |
printSimMat(sim.mat, col.clus) | |
cor_to_remove = findCorrelation(sim.mat, cutoff=.9) | |
print(length(cor_to_remove)) | |
if ( length(cor_to_remove) > 0 ) { | |
train.samp.nzv.cor = train.samp.nzv[,-cor_to_remove] | |
} else { | |
train.samp.nzv.cor = train.samp.nzv | |
} | |
# this builds the same correlation figure as above but for the | |
# matrix where correlated coefficients have been removed | |
pred.dist2 = dist.binary(t(train.samp.nzv.cor),method = 9) | |
col.clus2 = hclust(pred.dist2) | |
sim.mat2 = 1 - range01(as.matrix(pred.dist2)) | |
printSimMat(sim.mat2, col.clus2) | |
### | |
# Now I have a well curated dataset from with to build a model | |
### | |
# NOTE: the numbers is the comments below refer to the top artists training set | |
# and NOT the expanded training set. Also, I didn't do the second and third | |
# models for the expanded training set because in the top artist analysis having | |
# more predictors made the model more accurate. | |
# add on the outcomes to the last colunm of the training data | |
train.samp.nzv.cor.final = cbind(train.samp.nzv.cor, | |
"genre" = train.samp[rownames(train.samp.nzv.cor), ncol(train.samp)]) | |
out.file = paste(opt$out_dir, opt$out_file, sep="/") | |
write.csv(train.samp.nzv.cor.final, file=out.file, quote=F, row.names=F) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment