frbl/glm_logistic_treatment.R

## glm_logistic_treatment.R
Osample_p_full <- read.csv('http://frbl.eu/files/gist_ac602828fedba735b4d1e4cb16472cbe.csv')
formula <- Delta ~ W + A
glm_pred <- glm(formula = formula(formula), data = Osample_p_full, family= binomial())
print(glm_pred)
# Call:  glm(formula = formula(formula), family = binomial(), data = Osample_p_full)
#
# Coefficients:
# (Intercept)            A            W
#      18.378      -19.241        0.379

# Degrees of Freedom: 1999 Total (i.e. Null);  1997 Residual
# Null Deviance:      2773
# Residual Deviance: 1940         AIC: 1946

# Predict P(Delta | A = 1)
mean(predict(glm_pred, newdata = Osample_p_full[Osample_p_full$A == 1,], type='response'))
# [1] 0.339934

# Predict P(Delta | A = 0)
mean(predict(glm_pred, newdata = Osample_p_full[Osample_p_full$A == 0,], type='response'))
# [1] 1 (!)

# This makes sense, as A = never 0 when Delta = 0, and sometimes 0 when Delta = 1.
# Then, what we need is P(Delta | C_x), which boils down to the following:
P <- mean(predict(glm_pred, type = 'response'))
P / (1 - P)
# [1] 1 (P == 0.5)

# However, in the influence curve we only use a single observation for calculating the H-ratio
# h*(C_x(t)) / h(C_x(t)). As per the eq on p25 in the chapter, this boils down to P(Delta | C_x) / (1 - P(Delta | C_x)).
# Unfortunately, in this case, P(Delta = 1 | A = 0, W) ~= 1, and as such, lim_{p -> 1} 1 / (1 - p) goes to infty.
set.seed(12345)
newdata = data.frame(A = 0, W = rnorm(1,0,1))
P <- predict(glm_pred, newdata = newdata,  type = 'response')
P / (1 - P)
# [1] 119575498 (Large!!, because P ~= 1)
	Osample_p_full <- read.csv('http://frbl.eu/files/gist_ac602828fedba735b4d1e4cb16472cbe.csv')
	formula <- Delta ~ W + A
	glm_pred <- glm(formula = formula(formula), data = Osample_p_full, family= binomial())
	print(glm_pred)
	# Call: glm(formula = formula(formula), family = binomial(), data = Osample_p_full)
	#
	# Coefficients:
	# (Intercept) A W
	# 18.378 -19.241 0.379

	# Degrees of Freedom: 1999 Total (i.e. Null); 1997 Residual
	# Null Deviance: 2773
	# Residual Deviance: 1940 AIC: 1946

	# Predict P(Delta \| A = 1)
	mean(predict(glm_pred, newdata = Osample_p_full[Osample_p_full$A == 1,], type='response'))
	# [1] 0.339934

	# Predict P(Delta \| A = 0)
	mean(predict(glm_pred, newdata = Osample_p_full[Osample_p_full$A == 0,], type='response'))
	# [1] 1 (!)

	# This makes sense, as A = never 0 when Delta = 0, and sometimes 0 when Delta = 1.
	# Then, what we need is P(Delta \| C_x), which boils down to the following:
	P <- mean(predict(glm_pred, type = 'response'))
	P / (1 - P)
	# [1] 1 (P == 0.5)

	# However, in the influence curve we only use a single observation for calculating the H-ratio
	# h*(C_x(t)) / h(C_x(t)). As per the eq on p25 in the chapter, this boils down to P(Delta \| C_x) / (1 - P(Delta \| C_x)).
	# Unfortunately, in this case, P(Delta = 1 \| A = 0, W) ~= 1, and as such, lim_{p -> 1} 1 / (1 - p) goes to infty.
	set.seed(12345)
	newdata = data.frame(A = 0, W = rnorm(1,0,1))
	P <- predict(glm_pred, newdata = newdata, type = 'response')
	P / (1 - P)
	# [1] 119575498 (Large!!, because P ~= 1)