Gedevan-Aleksizde/mlogit.R

## mlogit.R
require(mlogit) # 0.2-4

as.mldata <- function(data){
  # convert HC dataset
  # The alternatives are
  # 1. Gas central heat with cooling (gcc)
  # 2. Electric central resistence heat with cooling (ecc)
  # 3. Electric room resistence heat with cooling (erc)
  # 4. Electric heat pump, which provides cooling also (hpc)
  # 5. Gas central heat without cooling (gc)
  # 6. Electric central resistence heat without cooling (ec)
  # 7. Electric room resistence heat without cooling (er)
  # depvar gives the name of the chosen alternative,
  # ich.alt are the installation cost for the heating portion of the system,
  # icca is the installation cost for cooling
  # och.alt are the operating cost for the heating portion of the system
  # occa is the operating cost for cooling
  # income is the annual income of the household
  data <- mlogit.data(data, varying = c(2:8, 10:16), choice = "depvar", shape = "wide")
  cooling.modes <- index(data)$alt %in% c('gcc', 'ecc', 'erc', 'hpc')
  room.modes <- index(data)$alt %in% c('erc', 'er')
  # installation / operating costs for cooling are constants,
  # only relevant for mixed systems
  data$icca[!cooling.modes] <- 0
  data$occa[!cooling.modes] <- 0
  # create income variables for two sets cooling and rooms
  data$inc.cooling <- data$inc.room <- 0
  data$inc.cooling[cooling.modes] <- data$income[cooling.modes]
  data$inc.room[room.modes] <- data$income[room.modes]
  # create an intercet for cooling modes
  data$int.cooling <- as.numeric(cooling.modes)
  return(data)
}

logloss <- function(y, pred){
  # y: vector
  # pred: wide matrix
  return(-mean(apply(data.frame(y=y, pred), 1, function(x){sum(log(as.numeric(x[x[1]])))})))
}

data("HC", package = "mlogit")
set.seed(42)
train <- 1: nrow(HC) %in% sample(x = 1:nrow(HC), size = 100, replace = F)
df.train <- as.mldata(HC[train, ])
df.test <- as.mldata(HC[!train, ])
y <- as.character(HC$depvar)

# nested
nl <- mlogit(depvar ~ ich + och + icca + occa + inc.room + inc.cooling + int.cooling | 0, df.train,
             nests =  list(cooling = c('gcc','ecc','erc','hpc'),
                            other = c('gc', 'ec', 'er')),
             un.nest.el = TRUE)
# multinomial
ml <- update(nl, nests = NULL)

summary(nl)
summary(ml)

logloss(y[!train], predict(object = nl, newdata=df.test))
logloss(y[!train], predict(object = ml, newdata=df.test))
	require(mlogit) # 0.2-4

	as.mldata <- function(data){
	# convert HC dataset
	# The alternatives are
	# 1. Gas central heat with cooling (gcc)
	# 2. Electric central resistence heat with cooling (ecc)
	# 3. Electric room resistence heat with cooling (erc)
	# 4. Electric heat pump, which provides cooling also (hpc)
	# 5. Gas central heat without cooling (gc)
	# 6. Electric central resistence heat without cooling (ec)
	# 7. Electric room resistence heat without cooling (er)
	# depvar gives the name of the chosen alternative,
	# ich.alt are the installation cost for the heating portion of the system,
	# icca is the installation cost for cooling
	# och.alt are the operating cost for the heating portion of the system
	# occa is the operating cost for cooling
	# income is the annual income of the household
	data <- mlogit.data(data, varying = c(2:8, 10:16), choice = "depvar", shape = "wide")
	cooling.modes <- index(data)$alt %in% c('gcc', 'ecc', 'erc', 'hpc')
	room.modes <- index(data)$alt %in% c('erc', 'er')
	# installation / operating costs for cooling are constants,
	# only relevant for mixed systems
	data$icca[!cooling.modes] <- 0
	data$occa[!cooling.modes] <- 0
	# create income variables for two sets cooling and rooms
	data$inc.cooling <- data$inc.room <- 0
	data$inc.cooling[cooling.modes] <- data$income[cooling.modes]
	data$inc.room[room.modes] <- data$income[room.modes]
	# create an intercet for cooling modes
	data$int.cooling <- as.numeric(cooling.modes)
	return(data)
	}

	logloss <- function(y, pred){
	# y: vector
	# pred: wide matrix
	return(-mean(apply(data.frame(y=y, pred), 1, function(x){sum(log(as.numeric(x[x[1]])))})))
	}

	data("HC", package = "mlogit")
	set.seed(42)
	train <- 1: nrow(HC) %in% sample(x = 1:nrow(HC), size = 100, replace = F)
	df.train <- as.mldata(HC[train, ])
	df.test <- as.mldata(HC[!train, ])
	y <- as.character(HC$depvar)

	# nested
	nl <- mlogit(depvar ~ ich + och + icca + occa + inc.room + inc.cooling + int.cooling \| 0, df.train,
	nests = list(cooling = c('gcc','ecc','erc','hpc'),
	other = c('gc', 'ec', 'er')),
	un.nest.el = TRUE)
	# multinomial
	ml <- update(nl, nests = NULL)

	summary(nl)
	summary(ml)

	logloss(y[!train], predict(object = nl, newdata=df.test))
	logloss(y[!train], predict(object = ml, newdata=df.test))