'''
Two ways to estimate an average treatment effect from simulated observational
data: double selection with the Lasso, and double/debiased machine learning
(DML) with random forests.

References:
https://medium.com/teconomics-blog/using-ml-to-resolve-experiments-faster-bd8053ff602e
https://insightr.wordpress.com/2017/06/28/cross-fitting-double-machine-learning-estimator/
https://arxiv.org/pdf/1608.00060.pdf
'''
import numpy as np
from sklearn.linear_model import LassoCV, LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import KFold
# Simulate data from a variant of the Friedman function with a known
# heterogeneous treatment effect.
# ref: https://arxiv.org/pdf/1712.04912.pdf
def friedman_function(n=500, d=6, sigma=0.1):
    assert d >= 5
    X = np.random.random(size=(n, d))
    noise = np.random.randn(n)
    # baseline outcome, propensity score, and individual treatment effect
    b = 10*np.sin(np.pi * X[:, 0] * X[:, 1]) + 20*(X[:, 2] - 0.5)**2 + 10*X[:, 3] + 5*X[:, 4]
    e = np.sin(np.pi * X[:, 0] * X[:, 1])
    tau = (X[:, 0] + X[:, 1]) / 2.0
    T = np.random.binomial(n=1, p=e)
    y = b + (T - 0.5)*tau + sigma*noise
    return X, y, T, tau
X, y, T, tau = friedman_function(n=500, d=6, sigma=0.1)
'''
Approach 1: Double Selection with LassoCV
'''
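# Double selection (Belloni, Chernozhukov & Hansen): run one Lasso of y on X
# and another of T on X, then regress y on T together with the union of the
# covariates selected by either Lasso; the coefficient on T estimates the
# treatment effect.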
# note: LassoCV's `normalize` flag has been removed from recent scikit-learn;
# the features here already lie in [0, 1], so we fit without rescaling
lasso_y = LassoCV(cv=10).fit(X, y)
H = lasso_y.coef_.nonzero()[0]
lasso_T = LassoCV(cv=10).fit(X, T)
K = lasso_T.coef_.nonzero()[0]
# get union of H and K
H_union_K = np.sort(list(set(H).union(set(K))))
X_sub = np.hstack((X[:, H_union_K], T[:, np.newaxis]))
theta_est_ds = LinearRegression().fit(X_sub, y).coef_[-1]
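# A benefit of double selection is that plain OLS inference on the final
# regression is asymptotically valid. A minimal sketch of a standard error
# for theta_est_ds, assuming statsmodels is installed (it is not otherwise
# used in this gist):
import statsmodels.api as sm
ols_fit = sm.OLS(y, sm.add_constant(X_sub)).fit()
print('Double selection estimate: %.3f (se %.3f)' % (ols_fit.params[-1], ols_fit.bse[-1]))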
'''
Approach 2: Double Machine Learning with Random Forest
'''
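# Cross-fitting: fit the nuisance models on a training fold, then form the
# residual-on-residual estimate on the held-out fold:
#   theta_hat = mean(V_hat * (y - y_hat)) / mean(V_hat * T),
# where V_hat = T - m_hat(X) and m_hat(X) estimates p(T=1|X).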
def get_theta_est(X_train, y_train, T_train, X_test, y_test, T_test, regressor, classifier):
    # ref: https://arxiv.org/pdf/1608.00060.pdf
    y_test_pred = regressor.fit(X_train, y_train).predict(X_test)
    T_test_pred = classifier.fit(X_train, T_train).predict_proba(X_test)[:, 1]  # estimated propensity p(T=1|x)
    # eq (1.5): residualize the treatment, then rescale
    V_hat = T_test - T_test_pred
    theta_est = np.mean(V_hat * (y_test - y_test_pred)) / np.mean(V_hat * T_test)
    return theta_est
regressor = RandomForestRegressor(n_estimators=100)
classifier = RandomForestClassifier(n_estimators=100)
splitter = KFold(n_splits=10, shuffle=True)
theta_ests = []
for tr, ts in splitter.split(X):
    theta_est = get_theta_est(X[tr, :], y[tr], T[tr], X[ts, :], y[ts], T[ts], regressor, classifier)
    theta_ests.append(theta_est)
theta_est_dml = np.mean(theta_ests)
print('True average treatment effect (mean of tau):', np.mean(tau))
print('Estimate of treatment effect using double selection with LassoCV:', theta_est_ds)
print('Estimate of treatment effect using double machine learning with Random Forest:', theta_est_dml)
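# The reference paper notes that cross-fitted estimates can vary with the
# particular random split, and suggests repeating the sample splitting and
# aggregating the results (e.g., by median). A minimal sketch; the choice of
# 10 repeats is arbitrary, not from the paper:
repeated_ests = []
for seed in range(10):
    splitter_r = KFold(n_splits=10, shuffle=True, random_state=seed)
    fold_ests = [get_theta_est(X[tr, :], y[tr], T[tr], X[ts, :], y[ts], T[ts],
                               regressor, classifier)
                 for tr, ts in splitter_r.split(X)]
    repeated_ests.append(np.mean(fold_ests))
print('Median DML estimate over 10 repeated splits:', np.median(repeated_ests))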