frbl/bounded_logistic_regression.R

## bounded_logistic_regression.R
#!/usr/bin/env Rscript

## Constrained logistic regression
## In this gist we create a regression for which the predicted probabilities are contstrained. That is, they can not be
## less than a minimum of delta, or a maxiumum of 1 - delta.

## Based on:
## - https://stats.idre.ucla.edu/r/dae/logit-regression/
## - https://github.com/achambaz/tsml.cara.rct/blob/2b2aa282d4a11c601b37cacb368b67d03f6e8fc9/R/misc.R#L440
## - https://github.com/achambaz/tsml.cara.rct/blob/fd426c2a6f91b692d379b4765d9900151d09daa6/R/targetGstar.R#L1
## - https://stackoverflow.com/questions/15931403/modify-glm-function-to-adopt-user-specified-link-function-in-r
## - help file of `family` and `glm`

## First fit a regular logistic regression on some toy example
mydata <- read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
mydata$rank <- factor(mydata$rank)
mylogit <- glm(admit ~ gre + gpa + rank, data = mydata, family = "binomial")
summary(mylogit)
range(predict(mylogit, type='response'))


## Then create the bounded link functions. Change delta in the function to the actual bounds
mydelta = 0.1
bounded_logit <- function(delta) {
  structure(
    list(## mu mapsto logit( [mu - delta]/[1 - 2 delta]  ).
      linkfun = function(mu) {
        qlogis((mu-delta)/(1-2*delta))
      },

      ## eta mapsto delta + (1 - 2 delta) expit (eta).
      linkinv = function(eta) {
        delta + (1-2*delta)*plogis(eta)
      },


      ## derivative of inverse link wrt eta
      mu.eta = function(eta) {
        expit.eta <- plogis(eta)
        (1-2*delta)*expit.eta*(1-expit.eta)
      },


      ## test of validity for eta
      valideta = function(...) TRUE,
      name = 'bounded-logit'
    ),
    class = "link-glm"
  )
}

## Run som basic tests to check wether the derivatives are correct
bd_logit <- bounded_logit(mydelta)

## check invertibility
bd_logit$linkfun(bd_logit$linkinv(27))
library("numDeriv")

## check derivative
all.equal(grad(bd_logit$linkinv,2),bd_logit$mu.eta(2))

## Fit the constrained regression
mylogit_bounded <- NULL
mylogit_bounded <- glm(admit ~ gre + gpa + rank, data = mydata, family = binomial(link = bd_logit), start = rep(1/6, 6))
summary(mylogit_bounded)

## Do some checks / print a histogram
min(predict(mylogit_bounded, type='response')) > delta
max(predict(mylogit_bounded, type='response')) < 1 - delta

h1 <- predict(mylogit, type='response')
h2 <- predict(mylogit_bounded, type='response')

hist(h1, breaks=100, col=rgb(1,0,0,0.5), xlim=c(0,1), ylim=c(0,15), main="Overlapping Histogram", xlab="Variable")
hist(h2, breaks=100, col=rgb(0,0,1,0.5), add=T)
box()
	#!/usr/bin/env Rscript

	## Constrained logistic regression
	## In this gist we create a regression for which the predicted probabilities are contstrained. That is, they can not be
	## less than a minimum of delta, or a maxiumum of 1 - delta.

	## Based on:
	## - https://stats.idre.ucla.edu/r/dae/logit-regression/
	## - https://github.com/achambaz/tsml.cara.rct/blob/2b2aa282d4a11c601b37cacb368b67d03f6e8fc9/R/misc.R#L440
	## - https://github.com/achambaz/tsml.cara.rct/blob/fd426c2a6f91b692d379b4765d9900151d09daa6/R/targetGstar.R#L1
	## - https://stackoverflow.com/questions/15931403/modify-glm-function-to-adopt-user-specified-link-function-in-r
	## - help file of `family` and `glm`

	## First fit a regular logistic regression on some toy example
	mydata <- read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
	mydata$rank <- factor(mydata$rank)
	mylogit <- glm(admit ~ gre + gpa + rank, data = mydata, family = "binomial")
	summary(mylogit)
	range(predict(mylogit, type='response'))


	## Then create the bounded link functions. Change delta in the function to the actual bounds
	mydelta = 0.1
	bounded_logit <- function(delta) {
	structure(
	list(## mu mapsto logit( [mu - delta]/[1 - 2 delta] ).
	linkfun = function(mu) {
	qlogis((mu-delta)/(1-2*delta))
	},

	## eta mapsto delta + (1 - 2 delta) expit (eta).
	linkinv = function(eta) {
	delta + (1-2delta)plogis(eta)
	},


	## derivative of inverse link wrt eta
	mu.eta = function(eta) {
	expit.eta <- plogis(eta)
	(1-2delta)expit.eta*(1-expit.eta)
	},


	## test of validity for eta
	valideta = function(...) TRUE,
	name = 'bounded-logit'
	),
	class = "link-glm"
	)
	}

	## Run som basic tests to check wether the derivatives are correct
	bd_logit <- bounded_logit(mydelta)

	## check invertibility
	bd_logit$linkfun(bd_logit$linkinv(27))
	library("numDeriv")

	## check derivative
	all.equal(grad(bd_logit$linkinv,2),bd_logit$mu.eta(2))

	## Fit the constrained regression
	mylogit_bounded <- NULL
	mylogit_bounded <- glm(admit ~ gre + gpa + rank, data = mydata, family = binomial(link = bd_logit), start = rep(1/6, 6))
	summary(mylogit_bounded)

	## Do some checks / print a histogram
	min(predict(mylogit_bounded, type='response')) > delta
	max(predict(mylogit_bounded, type='response')) < 1 - delta

	h1 <- predict(mylogit, type='response')
	h2 <- predict(mylogit_bounded, type='response')

	hist(h1, breaks=100, col=rgb(1,0,0,0.5), xlim=c(0,1), ylim=c(0,15), main="Overlapping Histogram", xlab="Variable")
	hist(h2, breaks=100, col=rgb(0,0,1,0.5), add=T)
	box()