sample script for classification
# classification.r: generic R library for classification
# Copyright (C) 2014 Matthias C. M. Troffaes <matthias.troffaes@gmail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

###########################
### low level functions ###
###########################
# for type checking
.stopifnotdataframe = function(x) {
  if (!is.data.frame(x))
    stop("expected a data frame")
}

.stopifnottable = function(x) {
  if (!is.table(x))
    stop("expected a table")
}

.stopifnotlistoftables = function(xs) {
  if (!is.list(xs)) stop("expected a list of tables")
  lapply(xs, .stopifnottable)
  NULL
}

.stopifnotnumericvalue = function(x) {
  if (!is.numeric(x) || length(x) != 1)
    stop("expected a single numeric value")
}

.stopifnotnumericvector = function(x) {
  if (!is.numeric(x)) stop("expected a numeric vector")
}
# calculate the empirical probability mass function
# rows  = the data frame; each column is a variable, each row is a joint
#         observation
# colid = column of the variable whose probability mass function to calculate
.empirical.prob.table = function(rows, colid) {
  .stopifnotdataframe(rows)
  counts = table(rows[colid])
  prob = counts / sum(counts)
  # sanity check: probability table names == values of the variable
  stopifnot(names(prob) == levels(factor(rows[, colid])))
  prob
}
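
# A minimal usage sketch (an addition for illustration, not part of the
# original gist), wrapped in a function so nothing runs when this file
# is sourced:
.example.empirical.prob.table = function() {
  d = data.frame(x = c("a", "a", "b", "c"))
  # expected: a table with p(a) = 0.50, p(b) = 0.25, p(c) = 0.25
  .empirical.prob.table(d, 1)
}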
# calculate the empirical conditional probability mass function
# rows   = the data frame; each column is a variable, each row is a joint
#          observation
# colid1 = column with values of the conditioning variable
# colid2 = column with values of the non-conditioning random variable
# returns a table where each row is a probability mass function
.empirical.conditional.prob.table = function(rows, colid1, colid2) {
  .stopifnotdataframe(rows)
  counts = table(rows[c(colid1, colid2)])
  # normalize each row of counts so that it sums to one
  prop.table(counts, margin = 1)
}
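
# Another illustrative sketch (an addition, not from the original gist):
# each row of the returned table is the mass function of colid2 given
# one level of colid1.
.example.conditional.prob.table = function() {
  d = data.frame(c = c("x", "x", "y", "y"), a = c(1, 2, 1, 1))
  # expected: row "x" is (0.5, 0.5), row "y" is (1.0, 0.0)
  .empirical.conditional.prob.table(d, 1, 2)
}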
# calculate the joint probability p(c,a) = p(c) * p(a1|c) * ... * p(ak|c)
# under the naive (conditional independence) assumption
.classifier.naive.joint.prob = function(c.prob.table, a.prob.tables,
                                        a.colids, c.level, testrow) {
  # check input types and values
  .stopifnottable(c.prob.table)
  .stopifnotlistoftables(a.prob.tables)
  .stopifnotnumericvector(a.colids)
  .stopifnotdataframe(testrow)
  # sanity check: we should have one p(a|c) table per attribute
  stopifnot(length(a.colids) == length(a.prob.tables))
  # calculate p(a1|c), p(a2|c), ..., p(ak|c)
  p.acs = sapply(
    seq_along(a.colids),
    function(i) {
      colid = a.colids[i]
      # index by name rather than position, so numeric attribute values
      # still select the right column even if a level is absent
      a.level = as.character(testrow[1, colid])
      prob.table.a.given.c = a.prob.tables[[i]]
      prob.table.a.given.c[c.level, a.level]
    }
  )
  # calculate p(c)
  p.c = c.prob.table[c.level]
  # joint probability under the naive assumption:
  # p(c,a) = p(c) * p(a1|c) * ... * p(ak|c)
  p.c * prod(p.acs)
}
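
# A worked sketch of the formula above (an addition for illustration):
.example.naive.joint.prob = function() {
  train = data.frame(c  = c("x", "x", "y", "y"),
                     a1 = c(1, 2, 1, 1),
                     a2 = c(1, 1, 2, 2))
  c.prob = .empirical.prob.table(train, 1)
  a.probs = lapply(
    2:3, function(j) .empirical.conditional.prob.table(train, 1, j))
  # p(c="x", a1=1, a2=1) = p(x) * p(a1=1|x) * p(a2=1|x)
  #                      = 0.5  * 0.5       * 1.0       = 0.25
  .classifier.naive.joint.prob(c.prob, a.probs, 2:3, "x", train[1, ])
}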
# predict the class from a test row
.classifier.naive.predict = function(c.prob.table, a.prob.tables,
                                     a.colids, testrow) {
  .stopifnottable(c.prob.table)
  .stopifnotlistoftables(a.prob.tables)
  .stopifnotnumericvector(a.colids)
  .stopifnotdataframe(testrow)
  # calculate p(c,a) for each class level c
  # (the a_i are stored in testrow, the class levels are c.levels)
  c.levels = names(c.prob.table)
  # sapply (rather than lapply) yields the numeric vector which.max expects
  p.cas = sapply(
    c.levels,
    function(c.level) {
      .classifier.naive.joint.prob(c.prob.table, a.prob.tables,
                                   a.colids, c.level, testrow)
    }
  )
  # pick the class level with the highest joint probability
  c.levels[which.max(p.cas)]
}

# get the actual class from a test row
.classifier.actual = function(c.colid, testrow) {
  .stopifnotdataframe(testrow)
  .stopifnotnumericvalue(c.colid)
  testrow[1, c.colid]
}
############################
### high level interface ###
############################

# apply the function *testfunc* to all rows of the data frame *testrows*;
# this is like apply(testrows, 1, testfunc), except that *testfunc* is
# called with a proper one-row data frame for each row
classifier.test = function(testrows, testfunc) {
  .stopifnotdataframe(testrows)
  lapply(
    seq_len(nrow(testrows)),  # all row indices
    function(rowid) { testfunc(testrows[rowid, ]) }
  )
}
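
# An illustrative sketch of classifier.test (an addition, not from the
# original gist): testfunc receives a one-row data frame, so column
# access by name works as usual.
.example.classifier.test = function() {
  d = data.frame(a = 1:3, b = 4:6)
  # expected: list(5, 7, 9)
  classifier.test(d, function(row) { row[1, "a"] + row[1, "b"] })
}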
# get a test function which returns the actual class
classifier.testfunc.actual = function(c.colid) {
  function(testrow) { .classifier.actual(c.colid, testrow) }
}

# get a test function which predicts the class from training data
classifier.testfunc.naive.predict = function(trainrows, c.colid, a.colids) {
  .stopifnotdataframe(trainrows)
  .stopifnotnumericvalue(c.colid)
  .stopifnotnumericvector(a.colids)
  c.prob.table = .empirical.prob.table(trainrows, c.colid)
  a.prob.tables = lapply(
    a.colids,
    function(colid) {
      .empirical.conditional.prob.table(trainrows, c.colid, colid)
    }
  )
  function(testrow) {
    .classifier.naive.predict(c.prob.table, a.prob.tables, a.colids, testrow)
  }
}
##################
### an example ###
##################

# a helper function to randomly split rows into training and testing halves
split.train.test = function(rows) {
  .stopifnotdataframe(rows)
  numrows = nrow(rows)
  trainids = sample(numrows, numrows %/% 2)
  list(
    "train" = rows[trainids, ],
    "test" = rows[-trainids, ]
  )
}
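
# A quick sketch of the splitter (an addition for illustration); set.seed
# is only here to make the random split reproducible.
.example.split.train.test = function() {
  set.seed(42)
  parts = split.train.test(iris)
  # expected: 75 training rows and 75 test rows, disjoint by construction
  c(nrow(parts$train), nrow(parts$test))
}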
# classify the iris data
classifier.example = function() {
  # discretize the iris data and store it in a new data frame,
  # then split it into training and test data
  plength = cut(iris$Petal.Length, 3, labels = FALSE)
  pwidth = cut(iris$Petal.Width, 2, labels = FALSE)
  slength = cut(iris$Sepal.Length, 4, labels = FALSE)
  swidth = cut(iris$Sepal.Width, 3, labels = FALSE)
  species = iris$Species
  irisdata = split.train.test(
    data.frame(species, plength, pwidth, slength, swidth))
  # create test functions for the predicted class and for the actual class
  predict = classifier.testfunc.naive.predict(
    irisdata$train, c.colid = 1, a.colids = 2:5)
  actual = classifier.testfunc.actual(c.colid = 1)
  # list correct classifications on the test set
  correct = classifier.test(
    irisdata$test,
    function(testrow) { predict(testrow) == actual(testrow) }
  )
  # calculate the accuracy; correct is a list, so flatten it first
  # (sum(correct) would fail on a list)
  print("accuracy")
  print(mean(unlist(correct)))
}
main = function() {
  classifier.example()
}
main() |