mbannert/calc_shares.R

## calc_shares.R
# This file documents calcshares using the oxygen documentation standard
# google for roxygen
# you don't need to source this file, it's just for documentation
# puporses.

#' Calculate the weighted share of an answer item
#'
#' This function calculates how often an answer item was selected relatively
#' to the other answer item of a question. The result is given as a share.
#' Besides the function calculates additional information that is described
#' in greated detail below.
#'
#' @param dt a data.table containing the entire relevant data
#' @param var a character representation denoting a particular column
#' of the data.table given in dt. Typically this column represents a
#' qualitative question
#' @param sizeColumn a character representation of the data.table column
#' that holds the quantitative weight. If you want to aggregate without
#' weighting hand a vector of 1s which is equal to nrow(dt).
#'
#' @return returns a data table that consists of the following columns:
#' - all columns that were keys of the input data.table (this varies depending on the input)
#' typically this is: time, group, question
#' - sumTest: sum of all employees working in key group that chose a particular answer item
#' - sumTestTotal: sum of all employees working in a key sector
#' - AN: number of participants in the key group that chose a particular answer item
#' - ANTot: number of participants in the key group
#' - share: weighted share, sumTest/sumTestTotal
#' @author Matthias Bannert
calcShares <- function (dt, var, sizeColumn)
{
  # turn character representations into an R name Object
  v <- as.name(var)
  sizeColumn <- as.name(sizeColumn)

  # make sophisticated use of data.table to highly dynamic subsets
  # i checks selects all the observations in the data.table for which
  # question v is NOT NA (NA means not available, very much similar to SQL's NULL)
  dt[i = !(is.na(eval(v))),
     j = {
       # .SD stands for subset of data.table we use SD to
       # create different subsets within the same function call
       # see also What does .SD mean:
       # http://stackoverflow.com/questions/8508482/what-does-sd-stand-for-in-data-table-in-r
       # n is the subset of all employees that represent a firm that
       # has a valid value for question (that means not NA)
       # sum just creates the sum of all employees for whom this condition holds
    n <- sum(.SD[, eval(sizeColumn)])
       # n1 is similar to n, it uses the same subset condition, but only counts the number of elements in
       # in the vector which is equal to the number of answers under that condition
    n1 <- length(.SD[, eval(sizeColumn)])
    # subsequently another, more constrained subset is built
    # the by statement simply creates a GROUP BY (just like SQL's GROUP BY)
    # and groups all the elements with respect to their value of answer
    # typically 2,3 or 4
    # list prepares the output
    # name of the output column = value that is calculated
    .SD[, list(sumTest = sum(eval(sizeColumn)),
               # just use n defined above
               sumTestTotal = n,
               # this is a nested subset, we count
               # the number of answers under the restriction of the GROUP BY answer item
               AN = length(.SD[, eval(sizeColumn)]),
               # just use n defined above
               ANTot = n1,
               # simply calculate the share
               share = sum(eval(sizeColumn))/n
               # end list
               ),
        # the inner group by statement respecting the answer items of question v
        by = eval(as.character(v))]
    # outer group by statement over all keys of the input dt
  }, by = key(dt)]
}
	# This file documents calcshares using the oxygen documentation standard
	# google for roxygen
	# you don't need to source this file, it's just for documentation
	# puporses.

	#' Calculate the weighted share of an answer item
	#'
	#' This function calculates how often an answer item was selected relatively
	#' to the other answer item of a question. The result is given as a share.
	#' Besides the function calculates additional information that is described
	#' in greated detail below.
	#'
	#' @param dt a data.table containing the entire relevant data
	#' @param var a character representation denoting a particular column
	#' of the data.table given in dt. Typically this column represents a
	#' qualitative question
	#' @param sizeColumn a character representation of the data.table column
	#' that holds the quantitative weight. If you want to aggregate without
	#' weighting hand a vector of 1s which is equal to nrow(dt).
	#'
	#' @return returns a data table that consists of the following columns:
	#' - all columns that were keys of the input data.table (this varies depending on the input)
	#' typically this is: time, group, question
	#' - sumTest: sum of all employees working in key group that chose a particular answer item
	#' - sumTestTotal: sum of all employees working in a key sector
	#' - AN: number of participants in the key group that chose a particular answer item
	#' - ANTot: number of participants in the key group
	#' - share: weighted share, sumTest/sumTestTotal
	#' @author Matthias Bannert
	calcShares <- function (dt, var, sizeColumn)
	{
	# turn character representations into an R name Object
	v <- as.name(var)
	sizeColumn <- as.name(sizeColumn)

	# make sophisticated use of data.table to highly dynamic subsets
	# i checks selects all the observations in the data.table for which
	# question v is NOT NA (NA means not available, very much similar to SQL's NULL)
	dt[i = !(is.na(eval(v))),
	j = {
	# .SD stands for subset of data.table we use SD to
	# create different subsets within the same function call
	# see also What does .SD mean:
	# http://stackoverflow.com/questions/8508482/what-does-sd-stand-for-in-data-table-in-r
	# n is the subset of all employees that represent a firm that
	# has a valid value for question (that means not NA)
	# sum just creates the sum of all employees for whom this condition holds
	n <- sum(.SD[, eval(sizeColumn)])
	# n1 is similar to n, it uses the same subset condition, but only counts the number of elements in
	# in the vector which is equal to the number of answers under that condition
	n1 <- length(.SD[, eval(sizeColumn)])
	# subsequently another, more constrained subset is built
	# the by statement simply creates a GROUP BY (just like SQL's GROUP BY)
	# and groups all the elements with respect to their value of answer
	# typically 2,3 or 4
	# list prepares the output
	# name of the output column = value that is calculated
	.SD[, list(sumTest = sum(eval(sizeColumn)),
	# just use n defined above
	sumTestTotal = n,
	# this is a nested subset, we count
	# the number of answers under the restriction of the GROUP BY answer item
	AN = length(.SD[, eval(sizeColumn)]),
	# just use n defined above
	ANTot = n1,
	# simply calculate the share
	share = sum(eval(sizeColumn))/n
	# end list
	),
	# the inner group by statement respecting the answer items of question v
	by = eval(as.character(v))]
	# outer group by statement over all keys of the input dt
	}, by = key(dt)]
	}