sample script for classification
# classification.r: generic R library for classification
# Copyright (C) 2014 Matthias C. M. Troffaes <matthias.troffaes@gmail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

###########################
### low level functions ###
###########################
# for type checking
.stopifnotdataframe = function(x) {
  if (!is.data.frame(x))
    stop("expected a data frame")
}

.stopifnottable = function(x) {
  if (!is.table(x))
    stop("expected a table")
}

.stopifnotlistoftables = function(xs) {
  if (!is.list(xs)) stop("expected a list of tables")
  lapply(xs, .stopifnottable)
  NULL
}

.stopifnotnumericvalue = function(x) {
  if (!is.numeric(x) || length(x) != 1)
    stop("expected a single numeric value")
}

.stopifnotnumericvector = function(x) {
  if (!is.numeric(x)) stop("expected a numeric vector")
}
# calculate the empirical probability mass function
# rows  = the data frame; each column is a variable, each row is a joint
#         observation
# colid = column of the variable whose probability mass function to calculate
.empirical.prob.table = function(rows, colid) {
  .stopifnotdataframe(rows)
  counts = table(rows[colid])
  prob = counts / sum(counts)
  # sanity check: probability table names == values of the variable
  stopifnot(names(prob) == levels(factor(rows[, colid])))
  prob
}
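
# A minimal usage sketch (an addition for illustration, not part of the
# original gist), wrapped in a function so nothing runs when this file
# is sourced:
.example.empirical.prob.table = function() {
  d = data.frame(x = c("a", "a", "b", "c"))
  # expected: a table with p(a) = 0.50, p(b) = 0.25, p(c) = 0.25
  .empirical.prob.table(d, 1)
}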
# calculate the empirical conditional probability mass function
# rows   = the data frame; each column is a variable, each row is a joint
#          observation
# colid1 = column with values of the conditioning variable
# colid2 = column with values of the non-conditioning random variable
# returns a table where each row is a probability mass function
.empirical.conditional.prob.table = function(rows, colid1, colid2) {
  .stopifnotdataframe(rows)
  counts = table(rows[c(colid1, colid2)])
  # normalize each row of counts so that it sums to one
  prop.table(counts, margin = 1)
}
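
# Another illustrative sketch (an addition, not from the original gist):
# each row of the returned table is the mass function of colid2 given
# one level of colid1.
.example.conditional.prob.table = function() {
  d = data.frame(c = c("x", "x", "y", "y"), a = c(1, 2, 1, 1))
  # expected: row "x" is (0.5, 0.5), row "y" is (1.0, 0.0)
  .empirical.conditional.prob.table(d, 1, 2)
}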
# calculate the joint probability p(c,a) = p(c) * p(a1|c) * ... * p(ak|c)
# under the naive (conditional independence) assumption
.classifier.naive.joint.prob = function(c.prob.table, a.prob.tables,
                                        a.colids, c.level, testrow) {
  # check input types and values
  .stopifnottable(c.prob.table)
  .stopifnotlistoftables(a.prob.tables)
  .stopifnotnumericvector(a.colids)
  .stopifnotdataframe(testrow)
  # sanity check: we should have one p(a|c) table per attribute
  stopifnot(length(a.colids) == length(a.prob.tables))
  # calculate p(a1|c), p(a2|c), ..., p(ak|c)
  p.acs = sapply(
    seq_along(a.colids),
    function(i) {
      colid = a.colids[i]
      # index by name rather than position, so numeric attribute values
      # still select the right column even if a level is absent
      a.level = as.character(testrow[1, colid])
      prob.table.a.given.c = a.prob.tables[[i]]
      prob.table.a.given.c[c.level, a.level]
    }
  )
  # calculate p(c)
  p.c = c.prob.table[c.level]
  # joint probability under the naive assumption:
  # p(c,a) = p(c) * p(a1|c) * ... * p(ak|c)
  p.c * prod(p.acs)
}
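
# A worked sketch of the formula above (an addition for illustration):
.example.naive.joint.prob = function() {
  train = data.frame(c  = c("x", "x", "y", "y"),
                     a1 = c(1, 2, 1, 1),
                     a2 = c(1, 1, 2, 2))
  c.prob = .empirical.prob.table(train, 1)
  a.probs = lapply(
    2:3, function(j) .empirical.conditional.prob.table(train, 1, j))
  # p(c="x", a1=1, a2=1) = p(x) * p(a1=1|x) * p(a2=1|x)
  #                      = 0.5  * 0.5       * 1.0       = 0.25
  .classifier.naive.joint.prob(c.prob, a.probs, 2:3, "x", train[1, ])
}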
# predict the class from a test row
.classifier.naive.predict = function(c.prob.table, a.prob.tables,
                                     a.colids, testrow) {
  .stopifnottable(c.prob.table)
  .stopifnotlistoftables(a.prob.tables)
  .stopifnotnumericvector(a.colids)
  .stopifnotdataframe(testrow)
  # calculate p(c,a) for each class level c
  # (the a_i are stored in testrow, the class levels are c.levels)
  c.levels = names(c.prob.table)
  # sapply (rather than lapply) yields the numeric vector which.max expects
  p.cas = sapply(
    c.levels,
    function(c.level) {
      .classifier.naive.joint.prob(c.prob.table, a.prob.tables,
                                   a.colids, c.level, testrow)
    }
  )
  # pick the class level with the highest joint probability
  c.levels[which.max(p.cas)]
}

# get the actual class from a test row
.classifier.actual = function(c.colid, testrow) {
  .stopifnotdataframe(testrow)
  .stopifnotnumericvalue(c.colid)
  testrow[1, c.colid]
}
############################
### high level interface ###
############################

# apply the function *testfunc* to all rows of the data frame *testrows*;
# this is like apply(testrows, 1, testfunc), except that *testfunc* is
# called with a proper one-row data frame for each row
classifier.test = function(testrows, testfunc) {
  .stopifnotdataframe(testrows)
  lapply(
    seq_len(nrow(testrows)),  # all row indices
    function(rowid) { testfunc(testrows[rowid, ]) }
  )
}
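
# An illustrative sketch of classifier.test (an addition, not from the
# original gist): testfunc receives a one-row data frame, so column
# access by name works as usual.
.example.classifier.test = function() {
  d = data.frame(a = 1:3, b = 4:6)
  # expected: list(5, 7, 9)
  classifier.test(d, function(row) { row[1, "a"] + row[1, "b"] })
}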
# get a test function which returns the actual class
classifier.testfunc.actual = function(c.colid) {
  function(testrow) { .classifier.actual(c.colid, testrow) }
}

# get a test function which predicts the class from training data
classifier.testfunc.naive.predict = function(trainrows, c.colid, a.colids) {
  .stopifnotdataframe(trainrows)
  .stopifnotnumericvalue(c.colid)
  .stopifnotnumericvector(a.colids)
  c.prob.table = .empirical.prob.table(trainrows, c.colid)
  a.prob.tables = lapply(
    a.colids,
    function(colid) {
      .empirical.conditional.prob.table(trainrows, c.colid, colid)
    }
  )
  function(testrow) {
    .classifier.naive.predict(c.prob.table, a.prob.tables, a.colids, testrow)
  }
}
##################
### an example ###
##################

# a helper function to randomly split rows into training and testing halves
split.train.test = function(rows) {
  .stopifnotdataframe(rows)
  numrows = nrow(rows)
  trainids = sample(numrows, numrows %/% 2)
  list(
    "train" = rows[trainids, ],
    "test" = rows[-trainids, ]
  )
}
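
# A quick sketch of the splitter (an addition for illustration); set.seed
# is only here to make the random split reproducible.
.example.split.train.test = function() {
  set.seed(42)
  parts = split.train.test(iris)
  # expected: 75 training rows and 75 test rows, disjoint by construction
  c(nrow(parts$train), nrow(parts$test))
}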
# classify the iris data
classifier.example = function() {
  # discretize the iris data and store it in a new data frame,
  # then split it into training and test data
  plength = cut(iris$Petal.Length, 3, labels = FALSE)
  pwidth = cut(iris$Petal.Width, 2, labels = FALSE)
  slength = cut(iris$Sepal.Length, 4, labels = FALSE)
  swidth = cut(iris$Sepal.Width, 3, labels = FALSE)
  species = iris$Species
  irisdata = split.train.test(
    data.frame(species, plength, pwidth, slength, swidth))
  # create test functions for the predicted class and for the actual class
  predict = classifier.testfunc.naive.predict(
    irisdata$train, c.colid = 1, a.colids = 2:5)
  actual = classifier.testfunc.actual(c.colid = 1)
  # list correct classifications on the test set
  correct = classifier.test(
    irisdata$test,
    function(testrow) { predict(testrow) == actual(testrow) }
  )
  # calculate the accuracy; correct is a list, so flatten it first
  # (sum(correct) would fail on a list)
  print("accuracy")
  print(mean(unlist(correct)))
}
main = function() {
  classifier.example()
}
main() |