Skip to content

Instantly share code, notes, and snippets.

@mcmtroffaes
Last active August 8, 2017 11:59
Show Gist options
  • Save mcmtroffaes/8750237 to your computer and use it in GitHub Desktop.
Save mcmtroffaes/8750237 to your computer and use it in GitHub Desktop.
sample script for classification
# classification.r: generic R library for classification
# Copyright (C) 2014 Matthias C. M. Troffaes <matthias.troffaes@gmail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
###########################
### low level functions ###
###########################
# for type checking
# Raise an error unless `x` is a data frame; returns NULL on success.
.stopifnotdataframe <- function(x) {
  if (!is.data.frame(x)) {
    stop("expected a data frame")
  }
}
# Raise an error unless `x` is a table; returns NULL on success.
.stopifnottable <- function(x) {
  if (!is.table(x)) {
    stop("expected a table")
  }
}
# Raise an error unless `xs` is a list whose elements are all tables;
# returns NULL on success.  Uses vapply instead of an lapply whose result
# was discarded, so the element check is explicit and type-safe.
.stopifnotlistoftables <- function(xs) {
  if (!is.list(xs)) {
    stop("expected a list of tables")
  }
  if (!all(vapply(xs, is.table, logical(1)))) {
    stop("expected a table")
  }
  NULL
}
# Raise an error unless `x` is a single numeric value (length-one numeric);
# returns NULL on success.  The two previous checks shared one message, so
# they are merged into a single short-circuiting condition.
.stopifnotnumericvalue <- function(x) {
  if (!is.numeric(x) || length(x) != 1) {
    stop("expected a single numeric value")
  }
}
# Raise an error unless `x` is a numeric vector; returns NULL on success.
.stopifnotnumericvector <- function(x) {
  if (!is.numeric(x)) {
    stop("expected a numeric vector")
  }
}
# calculate empirical probability mass function
# rows = the data frame; each column is a variable, each row is a joint
# observation
# colid = column of the variable of which to calculate probability mass
# function; defaults to 1 so that a pre-subset one-column data frame can
# be passed on its own (elsewhere this function is called with a single
# argument, which previously relied on missing-argument pass-through in
# `[` and broke the rows[, colid] consistency check below)
# returns a table mapping each observed level to its relative frequency
.empirical.prob.table <- function(rows, colid = 1) {
  .stopifnotdataframe(rows)
  counts <- table(rows[colid])
  prob <- counts / sum(counts)
  # debug: probability table names == values of the variable
  stopifnot(names(prob) == levels(factor(rows[, colid])))
  prob
}
# calculate empirical conditional probability mass function
# rows = the data frame; each column is a variable, each row is a joint
# observation
# colid1 = column with values of the conditioning variable
# colid2 = column with values of the non-conditioning random variable
# returns a table where each row is a probability mass function
.empirical.conditional.prob.table <- function(rows, colid1, colid2) {
  .stopifnotdataframe(rows)
  counts <- table(rows[c(colid1, colid2)])
  # divide each row by its total so every row sums to one;
  # rowSums is the dedicated (and faster) form of apply(counts, 1, sum)
  counts / rowSums(counts)
}
# calculate joint probability p(c,a) = p(c) * p(a1|c) * ... * p(ak|c)
# c.prob.table = table p(c), indexed by class level name
# a.prob.tables = list of tables p(ai|c), one per attribute; rows are
# class levels, columns are attribute levels
# a.colids = columns of testrow holding the attribute values
# c.level = name of the class level at which to evaluate the joint
# testrow = one-row data frame with the observed attribute values
.classifier.naive.joint.prob = function(c.prob.table, a.prob.tables, a.colids, c.level, testrow) {
  # check input types and values
  .stopifnottable(c.prob.table)
  .stopifnotlistoftables(a.prob.tables)
  .stopifnotnumericvector(a.colids)
  .stopifnotdataframe(testrow)
  # debug: we should have one p(a|c) per attribute
  stopifnot(length(a.colids) == length(a.prob.tables))
  # calculate p(a1|c), p(a2|c), ..., p(ak|c)
  # the attribute value is coerced to character so the table is indexed
  # by dimension *name*; a raw numeric (or factor) value would index by
  # position, which is only correct when the values happen to be the
  # consecutive integers 1..k
  p.acs = vapply(
    seq_along(a.colids),
    function(i) {
      a.level = testrow[1, a.colids[i]]
      prob.table.a.given.c = a.prob.tables[[i]]
      prob.table.a.given.c[c.level, as.character(a.level)]
    },
    numeric(1)
  )
  # calculate p(c)
  p.c = c.prob.table[c.level]
  # joint probability under naive assumption
  # p(c,a)=p(c) * p(a1|c) * ... * p(ak|c)
  p.c * prod(p.acs)
}
# predict the class from a test row: pick the class level c maximizing the
# naive joint probability p(c,a) = p(c) * p(a1|c) * ... * p(ak|c)
# returns the winning class level name (first one in case of ties)
.classifier.naive.predict = function(c.prob.table, a.prob.tables, a.colids, testrow) {
  .stopifnottable(c.prob.table)
  .stopifnotlistoftables(a.prob.tables)
  .stopifnotnumericvector(a.colids)
  .stopifnotdataframe(testrow)
  # calculate p(c,a) for each class level c
  # (the a_i are stored in testrow, the class levels are c.levels)
  c.levels = names(c.prob.table)
  # vapply (not lapply) so p.cas is an atomic numeric vector:
  # which.max requires an atomic vector and fails on a list
  p.cas = vapply(
    c.levels,
    function(c.level) {
      .classifier.naive.joint.prob(c.prob.table, a.prob.tables, a.colids, c.level, testrow)
    },
    numeric(1)
  )
  c.levels[which.max(p.cas)]
}
# get the actual (true) class stored in column c.colid of a test row
.classifier.actual <- function(c.colid, testrow) {
  .stopifnotdataframe(testrow)
  .stopifnotnumericvalue(c.colid)
  testrow[1, c.colid]
}
############################
### high level interface ###
############################
# apply the function *testfunc* to all rows of the data frame *testrows*
# this is like apply(testrows, 1, testfunc)
# but *testfunc* is called with a proper data frame for each row
# returns a list with one result per row
classifier.test <- function(testrows, testfunc) {
  .stopifnotdataframe(testrows)
  # seq_len handles an empty data frame correctly
  # (1:nrow would yield c(1, 0) when there are zero rows)
  lapply(
    seq_len(nrow(testrows)),
    function(rowid) {
      testfunc(testrows[rowid, ])
    }
  )
}
# build a test function (closure) that extracts the actual class,
# stored in column c.colid, from a test row
classifier.testfunc.actual <- function(c.colid) {
  function(testrow) {
    .classifier.actual(c.colid, testrow)
  }
}
# get a test function which predicts the class from training data
# trainrows = training data frame
# c.colid = column holding the class variable
# a.colids = columns holding the attribute variables
# returns a closure mapping a one-row test data frame to a predicted class
classifier.testfunc.naive.predict <- function(trainrows, c.colid, a.colids) {
  .stopifnotdataframe(trainrows)
  .stopifnotnumericvalue(c.colid)
  .stopifnotnumericvector(a.colids)
  # p(c): pass the full frame and the column id explicitly, instead of a
  # pre-subset one-column frame with the colid argument left missing
  c.prob.table <- .empirical.prob.table(trainrows, c.colid)
  # p(ai|c): one conditional probability table per attribute
  a.prob.tables <- lapply(
    a.colids,
    function(colid) {
      .empirical.conditional.prob.table(trainrows, c.colid, colid)
    }
  )
  function(testrow) {
    .classifier.naive.predict(c.prob.table, a.prob.tables, a.colids, testrow)
  }
}
##################
### an example ###
##################
# a helper function to randomly split the rows of a data frame into a
# training half and a testing half (training gets floor(n / 2) rows)
# returns list(train = ..., test = ...)
split.train.test <- function(rows) {
  .stopifnotdataframe(rows)
  numrows <- nrow(rows)  # clearer than dim(rows)[1]
  # trainids renamed from trainrows: it holds row *indices*, not rows
  trainids <- sample(numrows, numrows %/% 2)
  list(
    "train" = rows[trainids, ],
    "test" = rows[-trainids, ]
  )
}
# classify iris data: discretize the measurements, train a naive Bayes
# classifier on half the rows, and print the accuracy on the other half
classifier.example <- function() {
  # discretize iris data and store in new data frame
  # also split into train and test data
  plength <- cut(iris$Petal.Length, 3, labels = FALSE)
  pwidth <- cut(iris$Petal.Width, 2, labels = FALSE)
  slength <- cut(iris$Sepal.Length, 4, labels = FALSE)
  swidth <- cut(iris$Sepal.Width, 3, labels = FALSE)
  species <- iris$Species
  irisdata <- split.train.test(
    data.frame(species, plength, pwidth, slength, swidth))
  # create test functions for predicted class and for actual class
  predict <- classifier.testfunc.naive.predict(irisdata$train, c.colid = 1, a.colids = 2:5)
  actual <- classifier.testfunc.actual(c.colid = 1)
  # list of logicals, one per test row: was the classification correct?
  correct <- classifier.test(
    irisdata$test,
    function(testrow) { predict(testrow) == actual(testrow) }
  )
  # accuracy = fraction of correct classifications; unlist first because
  # classifier.test returns a list, then mean does the counting
  print("accuracy")
  print(mean(unlist(correct)))
}
# script entry point: run the iris classification example
main <- function() {
  classifier.example()
}
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment