@jtilly

jtilly/glm_qc.py

Last active April 17, 2020 15:06
Compare GLM implementations
# Setup
#
# You need to clone:
# - https://github.axa.com/martin-rohbeck-external/k-claims-modelling
# - https://github.axa.com/jan-tilly/glmnet_python
# - https://github.axa.com/jan-tilly/glm_benchmarks
# and then run (adjust paths to repos accordingly)
# conda run -n ${PROJECT_FLAVOR} mamba install -y fortran-compiler
# conda run -n ${PROJECT_FLAVOR} pip install --no-use-pep517 --disable-pip-version-check -e ~/code/k-claims-modelling
# conda run -n ${PROJECT_FLAVOR} --cwd ~/code/glmnet_python python setup.py install
# conda run -n ${PROJECT_FLAVOR} pip install --no-use-pep517 --disable-pip-version-check -e ~/code/glm_benchmarks
import os
from time import perf_counter
import glm_benchmarks.sklearn_fork as sklearn_qc
import numpy as np
import pandas as pd
import scipy.sparse
from glmnet_python import glmnet, glmnetPredict
from npp.config import get_logger
import k_claims_modelling.external.sklearn.linear_model.glm as sklearn
from k_claims_modelling.evaluation.gini import compute_gini
from k_claims_modelling.modelling.regressors import exposure_correction
logger = get_logger()
data_dir = "/home/shared_folders/pricing/claims_modelling/test_data_for_profiling_glms"
# %%
# load data
df = pd.read_parquet(os.path.join(data_dir, "outcomes.parquet"))
X = pd.read_parquet(os.path.join(data_dir, "X.parquet"))
# subsample
idx = df.sample(n=1_000_000).index
df = df.loc[idx].reset_index(drop=True)
X = X.loc[idx].reset_index(drop=True)
print(f"Dim of data: {X.shape}")
print(f"Sparseness: {(np.abs(X.values) < 1e-10).mean():.3f}")
print(f"Size of data frame: {X.memory_usage().sum() / float(1<<30):.2f} GB")
# %%
y = df["sanzkh02"].to_numpy()
exposure = df["je"].to_numpy()
train_set = (df["sample"] == "train").to_numpy()
test_set = (df["sample"] == "test").to_numpy()
future_set = (df["sample"] == "future").to_numpy()
offset = df["offset_kh_sach_frequenz"].to_numpy()
y_adj, sample_weight = exposure_correction(
    power=1, y=y, exposure=exposure, offset=offset
)
# %%
model_args = dict(
    family="poisson",
    max_iter=100,
    random_state=12345,
    copy_X=True,
    selection="random",
    tol=1e-5,
)
# %%
for alpha, l1_ratio, sparse in (
    (0.01, 0.00, False),
    (1.00, 0.50, False),
    (0.01, 0.00, True),
    (1.00, 0.50, True),
):
    print("")
    print("*********************************")
    print(f"{alpha=}, {l1_ratio=}, {sparse=}")
    print("")
    if sparse:
        print("Creating sparse matrix...")
        Xtrain = scipy.sparse.csc_matrix(X[train_set])
    else:
        Xtrain = X[train_set].to_numpy()
    # new implementation
    print("")
    print("Using new implementation...")
    tic = perf_counter()
    poisson_regressor_qc = sklearn_qc.GeneralizedLinearRegressor(
        standardize=True, alpha=alpha, l1_ratio=l1_ratio, **model_args
    ).fit(X=Xtrain, y=y_adj[train_set], sample_weight=sample_weight[train_set])
    toc = perf_counter()
    print(f"{toc-tic:.4f} seconds")
    y_pred_qc = poisson_regressor_qc.predict(X[test_set]) * sample_weight[test_set]
    gini = compute_gini(
        y_true=y[test_set], y_score=y_pred_qc, exposure=exposure[test_set]
    )
    print(f"{gini=}")
    print(f"mean outcome: {y[test_set].sum() / exposure[test_set].sum()}")
    print(f"mean prediction: {y_pred_qc.sum() / exposure[test_set].sum()}")
    # glmnet
    print("")
    print("Using glmnet...")
    tic = perf_counter()
    poisson_regressor_glmnet = glmnet(
        x=Xtrain,
        y=y_adj[train_set].copy(),
        weights=sample_weight[train_set],
        standardize=True,
        lambdau=np.array([alpha]),
        alpha=l1_ratio,
        family=model_args["family"],
        thresh=model_args["tol"],
    )
    toc = perf_counter()
    print(f"{toc-tic:.4f} seconds")
    y_pred_glmnet = (
        np.exp(glmnetPredict(poisson_regressor_glmnet, X[test_set]).flatten())
        * sample_weight[test_set]
    )
    gini = compute_gini(
        y_true=y[test_set], y_score=y_pred_glmnet, exposure=exposure[test_set]
    )
    print(f"{gini=}")
    print(f"mean outcome: {y[test_set].sum() / exposure[test_set].sum()}")
    print(f"mean prediction: {y_pred_glmnet.sum() / exposure[test_set].sum()}")
    # compare predictions
    print(f"all close? {np.allclose(y_pred_qc, y_pred_glmnet, atol=1e-04)}")
    # old implementation
    print("")
    print("Using old implementation... (no standardization)")
    tic = perf_counter()
    poisson_regressor = sklearn.GeneralizedLinearRegressor(
        alpha=alpha, l1_ratio=l1_ratio, **model_args
    ).fit(X=Xtrain, y=y_adj[train_set], sample_weight=sample_weight[train_set])
    toc = perf_counter()
    print(f"{toc-tic:.4f} seconds")
    y_pred = poisson_regressor.predict(X[test_set]) * sample_weight[test_set]
    gini = compute_gini(y_true=y[test_set], y_score=y_pred, exposure=exposure[test_set])
    print(f"{gini=}")
    print(f"mean outcome: {y[test_set].sum() / exposure[test_set].sum()}")
    print(f"mean prediction: {y_pred.sum() / exposure[test_set].sum()}")
    # compare predictions
    print(f"all close? {np.allclose(y_pred_qc, y_pred, atol=1e-04)}")
# %%
Dim of data: (10000000, 107)
Sparseness: 0.683
Size of data frame: 7.97 GB
*********************************
alpha=0.01, l1_ratio=0.0, sparse=False
Using new implementation...
41.0648 seconds
gini=0.26368907871297714
mean outcome: 0.04747663840483652
mean prediction: 0.047279319092816983
Using glmnet...
91.3531 seconds
gini=0.27551655391490293
mean outcome: 0.04747663840483652
mean prediction: 0.04728485394366472
all close? False
Using old implementation... (no standardization)
36.6776 seconds
gini=0.25560893366357507
mean outcome: 0.04747663840483652
mean prediction: 0.047348574848387524
all close? False
*********************************
alpha=1.0, l1_ratio=0.5, sparse=False
Using new implementation...
0.09614459466725817 0 0
0.034378559002931665 1 1
0.01170238383401215 2 2
0.0034775112012216183 3 3
0.000708324797992112 4 4
5.2231794878223296e-05 5 5
56.4671 seconds
gini=0.11972711376286291
mean outcome: 0.04747663840483652
mean prediction: 0.04729607839402956
Using glmnet...
7.4578 seconds
gini=0.12023107566353006
mean outcome: 0.04747663840483652
mean prediction: 0.04729181545853376
all close? True
Using old implementation... (no standardization)
0.9614459466725815 0 0
0.34378559002931663 1 1
0.11702383834012149 2 2
0.03477511201221618 3 3
0.00708324797992112 4 4
0.0005223179487822329 5 5
52.6287 seconds
gini=0.11972711376286291
mean outcome: 0.04747663840483652
mean prediction: 0.04729607839402956
all close? True
*********************************
alpha=0.01, l1_ratio=0.0, sparse=True
Creating sparse matrix...
Using new implementation...
338.1123 seconds
gini=0.263689078359197
mean outcome: 0.04747663840483652
mean prediction: 0.047279319092816886
Using glmnet...
57.4690 seconds
gini=0.2755165538902576
mean outcome: 0.04747663840483652
mean prediction: 0.0472848539436662
all close? False
Using old implementation... (no standardization)
291.9076 seconds
gini=0.2636890785530665
mean outcome: 0.04747663840483652
mean prediction: 0.04727931909281683
all close? True
*********************************
alpha=1.0, l1_ratio=0.5, sparse=True
Creating sparse matrix...
Using new implementation...
0.09614459466725817 0 0
0.034378559002931665 1 1
0.01170238383401215 2 2
0.0034775112012216183 3 3
0.000708324797992112 4 4
5.2231794878223296e-05 5 5
456.6564 seconds
gini=0.11972711376286291
mean outcome: 0.04747663840483652
mean prediction: 0.04729607839402956
Using glmnet...
3.9307 seconds
gini=0.12011246482376858
mean outcome: 0.04747663840483652
mean prediction: 0.04729181545858199
all close? True
Using old implementation... (no standardization)
1.344542742914321 0 0
0.6710968536517165 1 1
0.11795095198240992 2 2
0.03510388281768133 3 3
0.007180409077755724 4 4
0.0005352931468411203 5 5
389.8593 seconds
gini=0.12033554412788987
mean outcome: 0.04747663840483652
mean prediction: 0.04729629084703527
all close? True
Dim of data: (1000000, 107)
Sparseness: 0.683
Size of data frame: 0.80 GB
*********************************
alpha=0.01, l1_ratio=0.0, sparse=False
Using new implementation...
4.1740 seconds
gini=0.263818790086771
mean outcome: 0.04676540361757755
mean prediction: 0.04670313684179153
Using glmnet...
7.9436 seconds
gini=0.2724562578231543
mean outcome: 0.04676540361757755
mean prediction: 0.04674076928845567
all close? False
Using old implementation... (no standardization)
3.6724 seconds
gini=0.2532641359545414
mean outcome: 0.04676540361757755
mean prediction: 0.04619259199982138
all close? False
*********************************
alpha=1.0, l1_ratio=0.5, sparse=False
Using new implementation...
0.09619816120501168 0 0
0.03441165041414234 1 1
0.011726676309588263 2 2
0.003495483932480208 3 3
0.000718099939306369 4 4
5.41389992328904e-05 5 5
5.6504 seconds
gini=0.12005884445047671
mean outcome: 0.04676540361757755
mean prediction: 0.04666568650088688
Using glmnet...
0.6818 seconds
gini=0.12027297891459444
mean outcome: 0.04676540361757755
mean prediction: 0.046661043630974286
all close? True
Using old implementation... (no standardization)
0.9619816120501168 0 0
0.3441165041414234 1 1
0.11726676309588263 2 2
0.034954839324802076 3 3
0.0071809993930636905 4 4
0.000541389992328904 5 5
5.2115 seconds
gini=0.12005884445047671
mean outcome: 0.04676540361757755
mean prediction: 0.04666568650088688
all close? True
*********************************
alpha=0.01, l1_ratio=0.0, sparse=True
Creating sparse matrix...
Using new implementation...
32.5009 seconds
gini=0.2638187936317561
mean outcome: 0.04676540361757755
mean prediction: 0.046703136841791504
Using glmnet...
4.2908 seconds
gini=0.2724562578231521
mean outcome: 0.04676540361757755
mean prediction: 0.046740769288456385
all close? False
Using old implementation... (no standardization)
28.1692 seconds
gini=0.2638187961688558
mean outcome: 0.04676540361757755
mean prediction: 0.04670313684179154
all close? True
*********************************
alpha=1.0, l1_ratio=0.5, sparse=True
Creating sparse matrix...
Using new implementation...
0.09619816120501168 0 0
0.03441165041414234 1 1
0.011726676309588263 2 2
0.003495483932480208 3 3
0.000718099939306369 4 4
5.41389992328904e-05 5 5
42.4812 seconds
gini=0.12005884445047671
mean outcome: 0.04676540361757755
mean prediction: 0.04666568650088688
Using glmnet...
0.3295 seconds
gini=0.11954649640739994
mean outcome: 0.04676540361757755
mean prediction: 0.04666104363097935
all close? True
Using old implementation... (no standardization)
1.3457867135381405 0 0
0.6713103191660521 1 1
0.11819264109487121 2 2
0.03528348997508118 3 3
0.0072786327343033605 4 4
0.0005546862828521083 5 5
36.9609 seconds
gini=0.11950432517998748
mean outcome: 0.04676540361757755
mean prediction: 0.04666591512425464
all close? True
Dim of data: (1000000, 107)
Sparseness: 0.683
Size of data frame: 0.80 GB
*********************************
alpha=1.0, l1_ratio=0.5, sparse=False
Using new implementation...
0.09615708450804317 0 0
0.034386273780650345 1 1
0.011708045105922351 2 2
0.003481694975107442 3 3
0.000710592744205067 4 4
5.2670328567151794e-05 5 5
8.5725 seconds
gini=0.11747903153166883
mean outcome: 0.04873558735427359
mean prediction: 0.04714181671456767
Wrote profile results to glm_qc.py.lprof
Timer unit: 1e-06 s
Total time: 6.57312 s
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py
Function: _safe_sandwich_dot at line 105
Line # Hits Time Per Hit % Time Line Contents
==============================================================
105 @profile
106 def _safe_sandwich_dot(X, d, intercept=False):
107 """Compute sandwich product X.T @ diag(d) @ X.
108
109 With ``intercept=True``, X is treated as if a column of 1 were appended as
110 first column of X.
111 X can be sparse, d must be an ndarray. Always returns a ndarray."""
112 8 74.0 9.2 0.0 if sparse.issparse(X):
113 # TODO: There's a bit of room to accelerate this by avoiding the
114 # allocation of a new sparse matrix every time we pass through this
115 # step. The X.multiply creates a new matrix. Avoiding the allocation
116 # and updating the rows in place accelerates this line by ~20%.
117 temp = X.transpose() @ X.multiply(d[:, np.newaxis])
118 # for older versions of numpy and scipy, temp may be a np.matrix
119 temp = _safe_toarray(temp)
120 8 22.0 2.8 0.0 elif type(X) is ColScaledSpMat:
121 term1 = X.mat.transpose() @ X.mat.multiply(d[:, np.newaxis])
122 term2 = X.mat.transpose().dot(d)[:, np.newaxis] * X.shift
123 term3 = term2.T
124 term4 = (X.shift.T * X.shift) * d.sum()
125 temp = term1 + term2 + term3 + term4
126 temp = _safe_toarray(temp)
127 else:
128 8 6477524.0 809690.5 98.5 temp = (X.T * d) @ X
129 8 40.0 5.0 0.0 if intercept:
130 8 38.0 4.8 0.0 dim = X.shape[1] + 1
131 8 31.0 3.9 0.0 if type(X) is ColScaledSpMat:
132 order = "F"
133 8 79.0 9.9 0.0 elif sparse.issparse(X):
134 order = "F" if sparse.isspmatrix_csc(X) else "C"
135 else:
136 8 36.0 4.5 0.0 order = "F" if X.flags["F_CONTIGUOUS"] else "C"
137 8 159.0 19.9 0.0 res = np.empty((dim, dim), dtype=max(X.dtype, d.dtype), order=order)
138 8 4656.0 582.0 0.1 res[0, 0] = d.sum()
139 8 90118.0 11264.8 1.4 res[1:, 0] = d @ X
140 8 56.0 7.0 0.0 res[0, 1:] = res[1:, 0]
141 8 272.0 34.0 0.0 res[1:, 1:] = temp
142 else:
143 res = temp
144 8 12.0 1.5 0.0 return res
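For orientation: the dense branch that dominates this profile, temp = (X.T * d) @ X, is just a broadcasted form of the sandwich product named in the docstring. A minimal standalone sketch (small random arrays, not the gist's data) showing the equivalence:

    import numpy as np

    rng = np.random.default_rng(0)
    X_small = rng.normal(size=(5, 3))
    d_small = rng.uniform(size=5)

    # explicit sandwich product with a diagonal matrix
    dense = X_small.T @ np.diag(d_small) @ X_small
    # broadcasted form profiled above: scale the columns of X.T by d, then multiply
    fast = (X_small.T * d_small) @ X_small
    assert np.allclose(dense, fast)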
Total time: 6.3131 s
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py
Function: _eta_mu_score_fisher at line 730
Line # Hits Time Per Hit % Time Line Contents
==============================================================
730 @profile
731 def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, diag_fisher=False):
732 """Compute linear predictor, mean, score function and fisher matrix.
733
734 It calculates the linear predictor, the mean, score function
735 (derivative of log-likelihood) and Fisher information matrix
736 all in one go as function of `coef` (:math:`w`) and the data.
737
738 Parameters
739 ----------
740 diag_fisher : boolean, optional (default=False)
741 If ``True``, returns only an array d such that
742 fisher = X.T @ np.diag(d) @ X.
743
744 Returns
745 -------
746 (eta, mu, score, fisher) : tuple with 4 elements
747 The 4 elements are:
748
749 * eta: ndarray, shape (X.shape[0],)
750 * mu: ndarray, shape (X.shape[0],)
751 * score: ndarray, shape (X.shape[0],)
752 * fisher:
753
754 * If diag_fisher is ``False``, the full fisher matrix,
755 an array of shape (X.shape[1], X.shape[1])
756 * If diag_fisher is ``True``, an array of shape (X.shape[0])
757 """
758 7 22.0 3.1 0.0 intercept = coef.size == X.shape[1] + 1
759 # eta = linear predictor
760 7 87047.0 12435.3 1.4 eta = _safe_lin_pred(X, coef)
761 7 109902.0 15700.3 1.7 mu = link.inverse(eta)
762
763 # FOR TWEEDIE: sigma_inv = weights / (mu ** p) during optimization bc phi = 1
764 7 77727.0 11103.9 1.2 sigma_inv = 1.0 / self.variance(mu, phi=phi, weights=weights)
765 7 107635.0 15376.4 1.7 d1 = link.inverse_derivative(eta) # = h'(eta)
766 # Alternatively:
767 # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g
768 # d1 = 1./link.derivative(mu)
769 7 12412.0 1773.1 0.2 d1_sigma_inv = d1 * sigma_inv
770 7 19982.0 2854.6 0.3 temp = d1_sigma_inv * (y - mu)
771 7 19.0 2.7 0.0 if intercept:
772 7 78613.0 11230.4 1.2 score = np.concatenate(([temp.sum()], temp @ X))
773 else:
774 score = temp @ X
775
776 7 12680.0 1811.4 0.2 d2_sigma_inv = d1 * d1_sigma_inv
777 7 17.0 2.4 0.0 if diag_fisher:
778 fisher_matrix = d2_sigma_inv
779 else:
780 7 5807028.0 829575.4 92.0 fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, intercept=intercept)
781 7 12.0 1.7 0.0 return eta, mu, score, fisher_matrix
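As a reading aid, for the Poisson family with log link used in this benchmark these quantities have a simple closed form. A hedged sketch (plain ndarrays, no intercept column; this is not the library's actual code path):

    import numpy as np

    def poisson_log_link_quantities(X, y, weights, coef):
        eta = X @ coef                    # linear predictor
        mu = np.exp(eta)                  # inverse of the log link
        score = (weights * (y - mu)) @ X  # score, since d1 * sigma_inv == weights here
        d2 = weights * mu                 # Fisher weights (d2_sigma_inv above)
        fisher = (X.T * d2) @ X           # X.T @ diag(d2) @ X, as in _safe_sandwich_dot
        return eta, mu, score, fisher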
Total time: 0.02796 s
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py
Function: _cd_cycle at line 1131
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1131 @profile
1132 def _cd_cycle(
1133 d,
1134 X,
1135 coef,
1136 score,
1137 fisher,
1138 P1,
1139 P2,
1140 n_cycles,
1141 inner_tol,
1142 max_inner_iter=1000,
1143 selection="cyclic",
1144 random_state=None,
1145 diag_fisher=False,
1146 ):
1147 """Compute inner loop of coordinate descent, i.e. cycles through features.
1148
1149 Minimization of 1-d subproblems::
1150
1151 min_z q(d+z*e_j) - q(d)
1152 = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1
1153
1154 A = f'(w) + d*H(w) + (w+d)*P2
1155 B = H+P2
1156 Note: f'=-score and H=fisher are updated at the end of outer iteration.
1157 """
1158 # TODO: use sparsity (coefficient already 0 due to L1 penalty)
1159 # => active set of features for featurelist, see paper
1160 # of Improved GLMNET or Gap Safe Screening Rules
1161 # https://arxiv.org/abs/1611.05780
1162 7 24.0 3.4 0.1 n_samples, n_features = X.shape
1163 7 22.0 3.1 0.1 intercept = coef.size == X.shape[1] + 1
1164 7 10.0 1.4 0.0 idx = 1 if intercept else 0 # offset if coef[0] is intercept
1165 # ES: This is weird. Should this be B = fisher.copy()? Why would you do this?
1166 7 10.0 1.4 0.0 B = fisher
1167 7 14.0 2.0 0.1 if P2.ndim == 1:
1168 7 93.0 13.3 0.3 coef_P2 = coef[idx:] * P2
1169 7 10.0 1.4 0.0 if not diag_fisher:
1170 7 92.0 13.1 0.3 idiag = np.arange(start=idx, stop=B.shape[0])
1171 # B[np.diag_indices_from(B)] += P2
1172 7 198.0 28.3 0.7 B[(idiag, idiag)] += P2
1173 else:
1174 coef_P2 = coef[idx:] @ P2
1175 if not diag_fisher:
1176 if sparse.issparse(P2):
1177 B[idx:, idx:] += P2.toarray()
1178 else:
1179 B[idx:, idx:] += P2
1180 7 32.0 4.6 0.1 A = -score
1181 7 44.0 6.3 0.2 A[idx:] += coef_P2
1182 # A += d @ (H+P2) but so far d=0
1183 # inner loop
1184 7 33.0 4.7 0.1 for inner_iter in range(1, max_inner_iter + 1):
1185 7 13.0 1.9 0.0 inner_iter += 1
1186 7 11.0 1.6 0.0 n_cycles += 1
1187 # cycle through features, update intercept separately at the end
1188 7 15.0 2.1 0.1 if selection == "random":
1189 7 232.0 33.1 0.8 featurelist = random_state.permutation(n_features)
1190 else:
1191 featurelist = np.arange(n_features)
1192 756 1432.0 1.9 5.1 for j in featurelist:
1193 # minimize_z: a z + 1/2 b z^2 + c |d+z|
1194 # a = A_j
1195 # b = B_jj > 0
1196 # c = |P1_j| = P1_j > 0, see 1.3
1197 # d = w_j + d_j
1198 # cf. https://arxiv.org/abs/0708.1485 Eqs. (3) - (4)
1199 # with beta = z+d, beta_hat = d-a/b and gamma = c/b
1200 # z = 1/b * S(bd-a,c) - d
1201 # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding
1202 749 1523.0 2.0 5.4 jdx = j + idx # index for arrays containing entries for intercept
1203 749 1355.0 1.8 4.8 a = A[jdx]
1204 749 1101.0 1.5 3.9 if diag_fisher:
1205 # Note: fisher is ndarray of shape (n_samples,) => no idx
1206 # Calculate Bj = B[j, :] = B[:, j] as it is needed later anyway
1207 Bj = np.zeros_like(A)
1208 if intercept:
1209 Bj[0] = fisher.sum()
1210 if sparse.issparse(X):
1211 Bj[idx:] = _safe_toarray(
1212 X[:, j].transpose() @ X.multiply(fisher[:, np.newaxis])
1213 ).ravel()
1214 else:
1215 Bj[idx:] = (fisher * X[:, j]) @ X
1216
1217 if P2.ndim == 1:
1218 Bj[idx:] += P2[j]
1219 else:
1220 if sparse.issparse(P2):
1221 # slice columns as P2 is csc
1222 Bj[idx:] += P2[:, j].toarray().ravel()
1223 else:
1224 Bj[idx:] += P2[:, j]
1225 b = Bj[jdx]
1226 else:
1227 749 1421.0 1.9 5.1 b = B[jdx, jdx]
1228
1229 # those ten lines are what it is all about
1230 749 1513.0 2.0 5.4 if b <= 0:
1231 z = 0
1232 749 1674.0 2.2 6.0 elif P1[j] == 0:
1233 z = -a / b
1234 749 2334.0 3.1 8.3 elif a + P1[j] < b * (coef[jdx] + d[jdx]):
1235 z = -(a + P1[j]) / b
1236 749 2238.0 3.0 8.0 elif a - P1[j] > b * (coef[jdx] + d[jdx]):
1237 z = -(a - P1[j]) / b
1238 else:
1239 749 1744.0 2.3 6.2 z = -(coef[jdx] + d[jdx])
1240
1241 # update direction d
1242 749 1634.0 2.2 5.8 d[jdx] += z
1243 # update A because d_j is now d_j+z
1244 # A = f'(w) + d*H(w) + (w+d)*P2
1245 # => A += (H+P2)*e_j z = B_j * z
1246 # Note: B is symmetric B = B.transpose
1247 749 1117.0 1.5 4.0 if diag_fisher:
1248 # Bj = B[:, j] calculated above, still valid
1249 A += Bj * z
1250 else:
1251 # B is symmetric, C- or F-contiguous, but never sparse
1252 749 1478.0 2.0 5.3 if B.flags["F_CONTIGUOUS"]:
1253 # slice columns like for sparse csc
1254 749 5177.0 6.9 18.5 A += B[:, jdx] * z
1255 else: # B.flags['C_CONTIGUOUS'] might be true
1256 # slice rows
1257 A += B[jdx, :] * z
1258 # end of cycle over features
1259 # update intercept
1260 7 12.0 1.7 0.0 if intercept:
1261 7 9.0 1.3 0.0 if diag_fisher:
1262 Bj = np.zeros_like(A)
1263 Bj[0] = fisher.sum()
1264 Bj[1:] = fisher @ X
1265 b = Bj[0]
1266 else:
1267 7 15.0 2.1 0.1 b = B[0, 0]
1268 7 24.0 3.4 0.1 z = 0 if b <= 0 else -A[0] / b
1269 7 16.0 2.3 0.1 d[0] += z
1270 7 11.0 1.6 0.0 if diag_fisher:
1271 A += Bj * z
1272 else:
1273 7 12.0 1.7 0.0 if B.flags["F_CONTIGUOUS"]:
1274 7 47.0 6.7 0.2 A += B[:, 0] * z
1275 else:
1276 A += B[0, :] * z
1277 # end of complete cycle
1278 # stopping criterion for inner loop
1279 # sum_i(|minimum of norm of subgrad of q(d)_i|)
1280 # subgrad q(d) = A + subgrad ||P1*(w+d)||_1
1281 7 571.0 81.6 2.0 mn_subgrad = _min_norm_sugrad(coef=coef + d, grad=A, P2=None, P1=P1)
1282 7 605.0 86.4 2.2 mn_subgrad = linalg.norm(mn_subgrad, ord=1)
1283 7 14.0 2.0 0.1 if mn_subgrad <= inner_tol:
1284 7 11.0 1.6 0.0 if inner_iter == 1:
1285 inner_tol = inner_tol / 4.0
1286 7 9.0 1.3 0.0 break
1287 # end of inner loop
1288 7 10.0 1.4 0.0 return d, coef_P2, n_cycles, inner_tol
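The "ten lines that it is all about" implement soft thresholding for the one-dimensional subproblem min_z a*z + 0.5*b*z**2 + c*|w + z|. A hedged standalone sketch, with scalars mirroring A[jdx], B[jdx, jdx], P1[j] and coef[jdx] + d[jdx]:

    def coordinate_step(a, b, c, w):
        # equivalent to z = S(b*w - a, c) / b - w, where S is the soft-thresholding operator
        if b <= 0:
            return 0.0
        if c == 0:
            return -a / b
        if a + c < b * w:
            return -(a + c) / b
        if a - c > b * w:
            return -(a - c) / b
        return -w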
Total time: 6.94774 s
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py
Function: _cd_solver at line 1291
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1291 @profile
1292 def _cd_solver(
1293 coef,
1294 X,
1295 y,
1296 weights,
1297 P1,
1298 P2,
1299 fit_intercept,
1300 family,
1301 link,
1302 max_iter=100,
1303 max_inner_iter=1000,
1304 tol=1e-4,
1305 selection="cyclic",
1306 random_state=None,
1307 diag_fisher=False,
1308 copy_X=True,
1309 ):
1310 """Solve GLM with L1 and L2 penalty by coordinate descent algorithm.
1311
1312 The objective being minimized in the coefficients w=coef is::
1313
1314 F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1
1315
1316 An Improved GLMNET for L1-regularized Logistic Regression:
1317
1318 1. Find optimal descent direction d by minimizing
1319 min_d F(w+d) = min_d F(w+d) - F(w)
1320 2. Quadratic approximation of F(w+d)-F(w) = q(d):
1321 using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives:
1322 q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d
1323 + ||P1*(w+d)||_1 - ||P1*w||_1
1324 Then minimize q(d): min_d q(d)
1325 3. Coordinate descent by updating coordinate j (d -> d+z*e_j):
1326 min_z q(d+z*e_j)
1327 = min_z q(d+z*e_j) - q(d)
1328 = min_z A_j z + 1/2 B_jj z^2
1329 + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1
1330 A = f'(w) + d*H(w) + (w+d)*P2
1331 B = H + P2
1332
1333 Repeat steps 1-3 until convergence.
1334 Note: Use Fisher matrix instead of Hessian for H.
1335 Note: f' = -score, H = Fisher matrix
1336
1337 Parameters
1338 ----------
1339 coef : ndarray, shape (c,)
1340 If fit_intercept=False, shape c=X.shape[1].
1341 If fit_intercept=True, then c=X.shape[1] + 1.
1342
1343 X : {ndarray, csc sparse matrix}, shape (n_samples, n_features)
1344 Training data (with intercept included if present). If not sparse,
1345 pass directly as Fortran-contiguous data to avoid
1346 unnecessary memory duplication.
1347
1348 y : ndarray, shape (n_samples,)
1349 Target values.
1350
1351 weights: ndarray, shape (n_samples,)
1352 Sample weights with which the deviance is weighted. The weights must
1353 be normalized and sum to 1.
1354
1355 P1 : {ndarray}, shape (n_features,)
1356 The L1-penalty vector (=diagonal matrix)
1357
1358 P2 : {ndarray, csc sparse matrix}, shape (n_features, n_features)
1359 The L2-penalty matrix or vector (=diagonal matrix). If a matrix is
1360 passed, it must be symmetric. If X is sparse, P2 must also be sparse.
1361
1362 fit_intercept : boolean, optional (default=True)
1363 Specifies if a constant (a.k.a. bias or intercept) should be
1364 added to the linear predictor (X*coef+intercept).
1365
1366 family : ExponentialDispersionModel
1367
1368 link : Link
1369
1370 max_iter : int, optional (default=100)
1371 Maximum number of outer (Newton) iterations.
1372
1373 max_inner_iter : int, optional (default=1000)
1374 Maximum number of iterations in each inner loop, i.e. max number of
1375 cycles over all features per inner loop.
1376
1377 tol : float, optional (default=1e-4)
1378 Convergence criterion is
1379 sum_i(|minimum of norm of subgrad of objective_i|)<=tol.
1380
1381 selection : str, optional (default='cyclic')
1382 If 'random', randomly choose features in inner loop.
1383
1384 random_state : {int, RandomState instance, None}, optional (default=None)
1385
1386 diag_fisher : boolean, optional (default=False)
1387 ``False`` calculates full fisher matrix, ``True`` only diagonal matrix
1388 s.t. fisher = X.T @ diag @ X. This saves storage but needs more
1389 matrix-vector multiplications.
1390
1391 copy_X : boolean, optional (default=True)
1392 If ``True``, X will be copied; else, it may be overwritten.
1393
1394 Returns
1395 -------
1396 coef : ndarray, shape (c,)
1397 If fit_intercept=False, shape c=X.shape[1].
1398 If fit_intercept=True, then c=X.shape[1] + 1.
1399
1400 n_iter : number of outer iterations = newton iterations
1401
1402 n_cycles : number of cycles over features
1403
1404 References
1405 ----------
1406 Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin
1407 An Improved GLMNET for L1-regularized Logistic Regression,
1408 Journal of Machine Learning Research 13 (2012) 1999-2030
1409 https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf
1410 """
1411 1 4.0 4.0 0.0 if type(X) is not ColScaledSpMat:
1412 # TODO: because copy_X is being passed here and is also being passed in
1413 # check_X_y in fit(...), X is being copied twice. Check if this is true.
1414 2 279886.0 139943.0 4.0 X = check_array(
1415 1 4.0 4.0 0.0 X, "csc", dtype=[np.float64, np.float32], order="F", copy=copy_X
1416 )
1417 1 7.0 7.0 0.0 if P2.ndim == 2:
1418 P2 = check_array(
1419 P2, "csc", dtype=[np.float64, np.float32], order="F", copy=copy_X
1420 )
1421 1 9.0 9.0 0.0 if sparse.issparse(X):
1422 if not sparse.isspmatrix_csc(P2):
1423 raise ValueError(
1424 "If X is sparse, P2 must also be sparse csc"
1425 "format. Got P2 not sparse."
1426 )
1427 1 30.0 30.0 0.0 random_state = check_random_state(random_state)
1428 # Note: we already set P2 = l2*P2, P1 = l1*P1
1429 # Note: we already symmetrized P2 = 1/2 (P2 + P2')
1430 1 2.0 2.0 0.0 n_iter = 0 # number of outer iterations
1431 1 2.0 2.0 0.0 n_cycles = 0 # number of (complete) cycles over features
1432 1 2.0 2.0 0.0 converged = False
1433 1 2.0 2.0 0.0 n_samples, n_features = X.shape
1434 1 2.0 2.0 0.0 idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept
1435 # line search parameters
1436 1 2.0 2.0 0.0 (beta, sigma) = (0.5, 0.01)
1437 # some precalculations
1438 # Note: For diag_fisher=False, fisher = X.T @ fisher @ X and fisher is a
1439 # 1d array representing a diagonal matrix.
1440 2 859869.0 429934.5 12.4 eta, mu, score, fisher = family._eta_mu_score_fisher(
1441 1 3.0 3.0 0.0 coef=coef, phi=1, X=X, y=y, weights=weights, link=link, diag_fisher=diag_fisher
1442 )
1443 # set up space for search direction d for inner loop
1444 1 50.0 50.0 0.0 d = np.zeros_like(coef)
1445
1446 # minimum subgradient norm
1447 1 4.0 4.0 0.0 def calc_mn_subgrad_norm():
1448 return linalg.norm(
1449 _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1), ord=1
1450 )
1451
1452 # the ratio of inner _cd_cycle tolerance to the minimum subgradient norm
1453 # This wasn't explored in the newGLMNET paper linked above.
1454 # That paper essentially uses inner_tol_ratio = 1.0, but using a slightly
1455 # lower value is much faster.
1456 # By comparison, the original GLMNET paper uses inner_tol = tol.
1457 # So, inner_tol_ratio < 1 is sort of a compromise between the two papers.
1458 # The value should probably be between 0.01 and 0.5. 0.1 works well for many problems
1459 1 3.0 3.0 0.0 inner_tol_ratio = 0.1
1460
1461 1 3.0 3.0 0.0 def calc_inner_tol(mn_subgrad_norm):
1462 # Another potential rule limits the inner tol to be no smaller than tol
1463 # return max(mn_subgrad_norm * inner_tol_ratio, tol)
1464 return mn_subgrad_norm * inner_tol_ratio
1465
1466 # initial stopping tolerance of inner loop
1467 # use L1-norm of minimum of norm of subgradient of F
1468 1 197.0 197.0 0.0 inner_tol = calc_inner_tol(calc_mn_subgrad_norm())
1469
1470 1 3.0 3.0 0.0 Fw = None
1471
1472 # outer loop
1473 6 19.0 3.2 0.0 while n_iter < max_iter:
1474 6 23751.0 3958.5 0.3 print(inner_tol, n_iter, n_cycles)
1475 6 45.0 7.5 0.0 n_iter += 1
1476 # initialize search direction d (to be optimized) with zero
1477 6 1800.0 300.0 0.0 d.fill(0)
1478 # inner loop = _cd_cycle
1479 12 41985.0 3498.8 0.6 d, coef_P2, n_cycles, inner_tol = _cd_cycle(
1480 6 18.0 3.0 0.0 d,
1481 6 18.0 3.0 0.0 X,
1482 6 19.0 3.2 0.0 coef,
1483 6 19.0 3.2 0.0 score,
1484 6 17.0 2.8 0.0 fisher,
1485 6 20.0 3.3 0.0 P1,
1486 6 892.0 148.7 0.0 P2,
1487 6 19.0 3.2 0.0 n_cycles,
1488 6 17.0 2.8 0.0 inner_tol,
1489 6 19.0 3.2 0.0 max_inner_iter=max_inner_iter,
1490 6 17.0 2.8 0.0 selection=selection,
1491 6 18.0 3.0 0.0 random_state=random_state,
1492 6 19.0 3.2 0.0 diag_fisher=diag_fisher,
1493 )
1494
1495 # line search by sequence beta^k, k=0, 1, ..
1496 # F(w + lambda d) - F(w) <= lambda * bound
1497 # bound = sigma * (f'(w)*d + w*P2*d
1498 # +||P1 (w+d)||_1 - ||P1 w||_1)
1499 6 303.0 50.5 0.0 P1w_1 = linalg.norm(P1 * coef[idx:], ord=1)
1500 6 282.0 47.0 0.0 P1wd_1 = linalg.norm(P1 * (coef + d)[idx:], ord=1)
1501 # Note: coef_P2 already calculated and still valid
1502 6 109.0 18.2 0.0 bound = sigma * (-(score @ d) + coef_P2 @ d[idx:] + P1wd_1 - P1w_1)
1503
1504 # In the first iteration, we must compute Fw explicitly.
1505 # In later iterations, we just use Fwd from the previous iteration
1506 # as set after the line search loop below.
1507 6 18.0 3.0 0.0 if Fw is None:
1508 1 3.0 3.0 0.0 Fw = (
1509 3 10457.0 3485.7 0.2 0.5 * family.deviance(y, mu, weights)
1510 1 21.0 21.0 0.0 + 0.5 * (coef_P2 @ coef[idx:])
1511 1 3.0 3.0 0.0 + P1w_1
1512 )
1513
1514 6 24.0 4.0 0.0 la = 1.0 / beta
1515
1516 6 83303.0 13883.8 1.2 X_dot_d = _safe_lin_pred(X, d)
1517
1518 # Try progressively shorter line search steps.
1519 6 53.0 8.8 0.0 for k in range(20):
1520 6 21.0 3.5 0.0 la *= beta # starts with la=1
1521 6 86.0 14.3 0.0 coef_wd = coef + la * d
1522
1523 # The simple version of the next line is:
1524 # mu_wd = link.inverse(_safe_lin_pred(X, coef_wd))
1525 # but because coef_wd can be factored as
1526 # coef_wd = coef + la * d
1527 # we can rewrite to only perform one dot product with the data
1528 # matrix per loop which is substantially faster
1529 6 123883.0 20647.2 1.8 mu_wd = link.inverse(eta + la * X_dot_d)
1530
1531 # TODO - optimize: for Tweedie that isn't one of the special cases
1532 # (gaussian, poisson, gamma), family.deviance is quite slow! Can we
1533 # fix that somehow?
1534 12 64803.0 5400.2 0.9 Fwd = 0.5 * family.deviance(y, mu_wd, weights) + linalg.norm(
1535 6 78.0 13.0 0.0 P1 * coef_wd[idx:], ord=1
1536 )
1537 6 19.0 3.2 0.0 if P2.ndim == 1:
1538 6 164.0 27.3 0.0 Fwd += 0.5 * ((coef_wd[idx:] * P2) @ coef_wd[idx:])
1539 else:
1540 Fwd += 0.5 * (coef_wd[idx:] @ (P2 @ coef_wd[idx:]))
1541 6 27.0 4.5 0.0 if Fwd - Fw <= sigma * la * bound:
1542 6 19.0 3.2 0.0 break
1543
1544 # Fw in the next iteration will be equal to Fwd this iteration.
1545 6 18.0 3.0 0.0 Fw = Fwd
1546
1547 # update coefficients
1548 6 76.0 12.7 0.0 coef += la * d
1549
1550 # calculate eta, mu, score, Fisher matrix for next iteration
1551 12 5453705.0 454475.4 78.5 eta, mu, score, fisher = family._eta_mu_score_fisher(
1552 6 17.0 2.8 0.0 coef=coef,
1553 6 17.0 2.8 0.0 phi=1,
1554 6 17.0 2.8 0.0 X=X,
1555 6 16.0 2.7 0.0 y=y,
1556 6 18.0 3.0 0.0 weights=weights,
1557 6 17.0 2.8 0.0 link=link,
1558 6 17.0 2.8 0.0 diag_fisher=diag_fisher,
1559 )
1560
1561 # stopping criterion for outer loop
1562 # sum_i(|minimum-norm of subgrad of F(w)_i|)
1563 # fp_wP2 = f'(w) + w*P2
1564 # Note: eta, mu and score are already updated
1565 # this also updates the inner tolerance for the next loop!
1566 6 1286.0 214.3 0.0 mn_subgrad_norm = calc_mn_subgrad_norm()
1567 6 29.0 4.8 0.0 if mn_subgrad_norm <= tol:
1568 1 3.0 3.0 0.0 converged = True
1569 1 3.0 3.0 0.0 break
1570
1571 5 27.0 5.4 0.0 inner_tol = calc_inner_tol(mn_subgrad_norm)
1572 # end of outer loop
1573
1574 1 3.0 3.0 0.0 if not converged:
1575 warnings.warn(
1576 "Coordinate descent failed to converge. Increase"
1577 " the maximum number of iterations max_iter"
1578 " (currently {})".format(max_iter),
1579 ConvergenceWarning,
1580 )
1581
1582 1 3.0 3.0 0.0 return coef, n_iter, n_cycles
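The loop over k in range(20) above is a backtracking (Armijo-style) line search on the penalized objective. A hedged sketch of that pattern in isolation; F_of is a hypothetical callable evaluating the objective, and beta/sigma mirror the (0.5, 0.01) constants in _cd_solver:

    def line_search(F_of, w, d, Fw, bound, beta=0.5, sigma=0.01, max_steps=20):
        # try step sizes 1, beta, beta**2, ... until sufficient decrease is reached
        la = 1.0
        Fwd = F_of(w + la * d)
        for _ in range(max_steps):
            if Fwd - Fw <= sigma * la * bound:
                break
            la *= beta
            Fwd = F_of(w + la * d)
        return la, Fwd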
Total time: 8.55864 s
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py
Function: fit at line 1884
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1884 @profile
1885 def fit(self, X, y, sample_weight=None):
1886 """Fit a Generalized Linear Model.
1887
1888 Parameters
1889 ----------
1890 X : {array-like, sparse matrix}, shape (n_samples, n_features)
1891 Training data.
1892
1893 y : array-like, shape (n_samples,)
1894 Target values.
1895
1896 sample_weight : {None, array-like}, shape (n_samples,),\
1897 optional (default=None)
1898 Individual weights w_i for each sample. Note that for an
1899 Exponential Dispersion Model (EDM), one has
1900 Var[Y_i]=phi/w_i * v(mu).
1901 If Y_i ~ EDM(mu, phi/w_i), then
1902 sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a
1903 weighted average with weights=sample_weight.
1904
1905 Returns
1906 -------
1907 self : returns an instance of self.
1908 """
1909 # 99.8% of time is in the _cd_solver function
1910 #######################################################################
1911 # 1. input validation #
1912 #######################################################################
1913 # 1.1 validate arguments of __init__ ##################################
1914 # Guarantee that self._family_instance is an instance of class
1915 # ExponentialDispersionModel
1916 1 56.0 56.0 0.0 if isinstance(self.family, ExponentialDispersionModel):
1917 self._family_instance = self.family
1918 else:
1919 1 5.0 5.0 0.0 if self.family == "normal":
1920 self._family_instance = NormalDistribution()
1921 1 5.0 5.0 0.0 elif self.family == "poisson":
1922 1 27.0 27.0 0.0 self._family_instance = PoissonDistribution()
1923 elif self.family == "gamma":
1924 self._family_instance = GammaDistribution()
1925 elif self.family == "inverse.gaussian":
1926 self._family_instance = InverseGaussianDistribution()
1927 elif self.family == "binomial":
1928 self._family_instance = BinomialDistribution()
1929 else:
1930 raise ValueError(
1931 "The family must be an instance of class"
1932 " ExponentialDispersionModel or an element of"
1933 " ['normal', 'poisson', 'gamma', 'inverse.gaussian', "
1934 "'binomial']; got (family={})".format(self.family)
1935 )
1936
1937 # Guarantee that self._link_instance is set to an instance of
1938 # class Link
1939 1 16.0 16.0 0.0 if isinstance(self.link, Link):
1940 self._link_instance = self.link
1941 else:
1942 1 5.0 5.0 0.0 if self.link == "auto":
1943 1 7.0 7.0 0.0 if isinstance(self._family_instance, TweedieDistribution):
1944 1 6.0 6.0 0.0 if self._family_instance.power <= 0:
1945 self._link_instance = IdentityLink()
1946 1 5.0 5.0 0.0 if self._family_instance.power >= 1:
1947 1 8.0 8.0 0.0 self._link_instance = LogLink()
1948 elif isinstance(self._family_instance, GeneralizedHyperbolicSecant):
1949 self._link_instance = IdentityLink()
1950 elif isinstance(self._family_instance, BinomialDistribution):
1951 self._link_instance = LogitLink()
1952 else:
1953 raise ValueError(
1954 "No default link known for the "
1955 "specified distribution family. Please "
1956 "set link manually, i.e. not to 'auto'; "
1957 "got (link='auto', family={}".format(self.family)
1958 )
1959 elif self.link == "identity":
1960 self._link_instance = IdentityLink()
1961 elif self.link == "log":
1962 self._link_instance = LogLink()
1963 elif self.link == "logit":
1964 self._link_instance = LogitLink()
1965 else:
1966 raise ValueError(
1967 "The link must be an instance of class Link or "
1968 "an element of ['auto', 'identity', 'log', 'logit']; "
1969 "got (link={})".format(self.link)
1970 )
1971
1972 1 15.0 15.0 0.0 if not isinstance(self.alpha, numbers.Number) or self.alpha < 0:
1973 raise ValueError(
1974 "Penalty term must be a non-negative number;"
1975 " got (alpha={})".format(self.alpha)
1976 )
1977 3 15.0 5.0 0.0 if (
1978 1 5.0 5.0 0.0 not isinstance(self.l1_ratio, numbers.Number)
1979 1 5.0 5.0 0.0 or self.l1_ratio < 0
1980 1 5.0 5.0 0.0 or self.l1_ratio > 1
1981 ):
1982 raise ValueError(
1983 "l1_ratio must be a number in interval [0, 1];"
1984 " got (l1_ratio={})".format(self.l1_ratio)
1985 )
1986 1 6.0 6.0 0.0 if not isinstance(self.fit_intercept, bool):
1987 raise ValueError(
1988 "The argument fit_intercept must be bool;"
1989 " got {}".format(self.fit_intercept)
1990 )
1991 1 5.0 5.0 0.0 if self.solver not in ["auto", "irls", "lbfgs", "newton-cg", "cd"]:
1992 raise ValueError(
1993 "GeneralizedLinearRegressor supports only solvers"
1994 " 'auto', 'irls', 'lbfgs', 'newton-cg' and 'cd';"
1995 " got {}".format(self.solver)
1996 )
1997 1 4.0 4.0 0.0 solver = self.solver
1998 1 5.0 5.0 0.0 if self.solver == "auto":
1999 1 5.0 5.0 0.0 if self.l1_ratio == 0:
2000 solver = "irls"
2001 else:
2002 1 5.0 5.0 0.0 solver = "cd"
2003 1 4.0 4.0 0.0 if self.alpha > 0 and self.l1_ratio > 0 and solver not in ["cd"]:
2004 raise ValueError(
2005 "The chosen solver (solver={}) can't deal "
2006 "with L1 penalties, which are included with "
2007 "(alpha={}) and (l1_ratio={}).".format(
2008 solver, self.alpha, self.l1_ratio
2009 )
2010 )
2011 1 5.0 5.0 0.0 if not isinstance(self.max_iter, int) or self.max_iter <= 0:
2012 raise ValueError(
2013 "Maximum number of iteration must be a positive "
2014 "integer;"
2015 " got (max_iter={!r})".format(self.max_iter)
2016 )
2017 1 6.0 6.0 0.0 if not isinstance(self.tol, numbers.Number) or self.tol <= 0:
2018 raise ValueError(
2019 "Tolerance for stopping criteria must be "
2020 "positive; got (tol={!r})".format(self.tol)
2021 )
2022 1 5.0 5.0 0.0 if not isinstance(self.warm_start, bool):
2023 raise ValueError(
2024 "The argument warm_start must be bool;"
2025 " got {}".format(self.warm_start)
2026 )
2027 1 4.0 4.0 0.0 if self.selection not in ["cyclic", "random"]:
2028 raise ValueError(
2029 "The argument selection must be 'cyclic' or "
2030 "'random'; got (selection={})".format(self.selection)
2031 )
2032 1 316.0 316.0 0.0 random_state = check_random_state(self.random_state)
2033 1 6.0 6.0 0.0 if not isinstance(self.diag_fisher, bool):
2034 raise ValueError(
2035 "The argument diag_fisher must be bool;"
2036 " got {}".format(self.diag_fisher)
2037 )
2038 1 5.0 5.0 0.0 if not isinstance(self.copy_X, bool):
2039 raise ValueError(
2040 "The argument copy_X must be bool;" " got {}".format(self.copy_X)
2041 )
2042 1 5.0 5.0 0.0 if not isinstance(self.check_input, bool):
2043 raise ValueError(
2044 "The argument check_input must be bool; got "
2045 "(check_input={})".format(self.check_input)
2046 )
2047
2048 1 4.0 4.0 0.0 family = self._family_instance
2049 1 5.0 5.0 0.0 link = self._link_instance
2050
2051 # 1.2 validate arguments of fit #######################################
2052 1 6.0 6.0 0.0 _dtype = [np.float64, np.float32]
2053 1 5.0 5.0 0.0 if solver == "cd":
2054 1 5.0 5.0 0.0 _stype = ["csc"]
2055 else:
2056 _stype = ["csc", "csr"]
2057 2 260252.0 130126.0 3.0 X, y = check_X_y(
2058 1 4.0 4.0 0.0 X,
2059 1 4.0 4.0 0.0 y,
2060 1 5.0 5.0 0.0 accept_sparse=_stype,
2061 1 5.0 5.0 0.0 dtype=_dtype,
2062 1 5.0 5.0 0.0 y_numeric=True,
2063 1 5.0 5.0 0.0 multi_output=False,
2064 1 5.0 5.0 0.0 copy=self.copy_X,
2065 )
2066 # Without converting y to float, deviance might raise
2067 # ValueError: Integers to negative integer powers are not allowed.
2068 # Also, y must not be sparse.
2069 1 10.0 10.0 0.0 y = np.asarray(y, dtype=np.float64)
2070
2071 1 1742.0 1742.0 0.0 weights = _check_weights(sample_weight, y.shape[0])
2072
2073 1 7.0 7.0 0.0 n_samples, n_features = X.shape
2074
2075 # 1.3 arguments to take special care ##################################
2076 # P1, P2, start_params
2077 1 6.0 6.0 0.0 if isinstance(self.P1, str) and self.P1 == "identity":
2078 1 25.0 25.0 0.0 P1 = np.ones(n_features)
2079 else:
2080 P1 = np.atleast_1d(self.P1)
2081 try:
2082 P1 = P1.astype(np.float64, casting="safe", copy=False)
2083 except TypeError:
2084 raise TypeError(
2085 "The given P1 cannot be converted to a numeric"
2086 "array; got (P1.dtype={}).".format(P1.dtype)
2087 )
2088 if (P1.ndim != 1) or (P1.shape[0] != n_features):
2089 raise ValueError(
2090 "P1 must be either 'identity' or a 1d array "
2091 "with the length of X.shape[1]; "
2092 "got (P1.shape[0]={}), "
2093 "needed (X.shape[1]={}).".format(P1.shape[0], n_features)
2094 )
2095 # If X is sparse, make P2 sparse, too.
2096 1 5.0 5.0 0.0 if isinstance(self.P2, str) and self.P2 == "identity":
2097 1 8.0 8.0 0.0 if sparse.issparse(X):
2098 P2 = (
2099 sparse.dia_matrix(
2100 (np.ones(n_features), 0), shape=(n_features, n_features)
2101 )
2102 ).tocsc()
2103 else:
2104 1 10.0 10.0 0.0 P2 = np.ones(n_features)
2105 else:
2106 P2 = check_array(
2107 self.P2, copy=True, accept_sparse=_stype, dtype=_dtype, ensure_2d=False
2108 )
2109 if P2.ndim == 1:
2110 P2 = np.asarray(P2)
2111 if P2.shape[0] != n_features:
2112 raise ValueError(
2113 "P2 should be a 1d array of shape "
2114 "(n_features,) with "
2115 "n_features=X.shape[1]; "
2116 "got (P2.shape=({},)), needed ({},)".format(
2117 P2.shape[0], X.shape[1]
2118 )
2119 )
2120 if sparse.issparse(X):
2121 P2 = (
2122 sparse.dia_matrix((P2, 0), shape=(n_features, n_features))
2123 ).tocsc()
2124 elif (
2125 P2.ndim == 2
2126 and P2.shape[0] == P2.shape[1]
2127 and P2.shape[0] == X.shape[1]
2128 ):
2129 if sparse.issparse(X):
2130 P2 = sparse.csc_matrix(P2)
2131 else:
2132 raise ValueError(
2133 "P2 must be either None or an array of shape "
2134 "(n_features, n_features) with "
2135 "n_features=X.shape[1]; "
2136 "got (P2.shape=({0}, {1})), needed ({2}, {2})".format(
2137 P2.shape[0], P2.shape[1], X.shape[1]
2138 )
2139 )
2140
2141 1 6.0 6.0 0.0 start_params = self.start_params
2142 1 5.0 5.0 0.0 if isinstance(start_params, str):
2143 1 5.0 5.0 0.0 if start_params not in ["guess", "zero"]:
2144 raise ValueError(
2145 "The argument start_params must be 'guess', "
2146 "'zero' or an array of correct length; "
2147 "got(start_params={})".format(start_params)
2148 )
2149 else:
2150 start_params = check_array(
2151 start_params,
2152 accept_sparse=False,
2153 force_all_finite=True,
2154 ensure_2d=False,
2155 dtype=_dtype,
2156 copy=True,
2157 )
2158 if (start_params.shape[0] != X.shape[1] + self.fit_intercept) or (
2159 start_params.ndim != 1
2160 ):
2161 raise ValueError(
2162 "Start values for parameters must have the"
2163 "right length and dimension; required (length"
2164 "={}, ndim=1); got (length={}, ndim={}).".format(
2165 X.shape[1] + self.fit_intercept,
2166 start_params.shape[0],
2167 start_params.ndim,
2168 )
2169 )
2170
2171 1 6.0 6.0 0.0 l1 = self.alpha * self.l1_ratio
2172 1 6.0 6.0 0.0 l2 = self.alpha * (1 - self.l1_ratio)
2173 # P1 and P2 are now for sure copies
2174 1 11.0 11.0 0.0 P1 = l1 * P1
2175 1 6.0 6.0 0.0 P2 = l2 * P2
2176 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2')
2177 # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric
2178 1 5.0 5.0 0.0 if P2.ndim == 2:
2179 if sparse.issparse(P2):
2180 if sparse.isspmatrix_csc(P2):
2181 P2 = 0.5 * (P2 + P2.transpose()).tocsc()
2182 else:
2183 P2 = 0.5 * (P2 + P2.transpose()).tocsr()
2184 else:
2185 P2 = 0.5 * (P2 + P2.T)
2186
2187 # For coordinate descent, if X is sparse, P2 must also be csc
2188 1 5.0 5.0 0.0 if solver == "cd" and sparse.issparse(X):
2189 P2 = sparse.csc_matrix(P2)
2190
2191 # 1.4 additional validations ##########################################
2192 1 5.0 5.0 0.0 if self.check_input:
2193 1 1086.0 1086.0 0.0 if not np.all(family.in_y_range(y)):
2194 raise ValueError(
2195 "Some value(s) of y are out of the valid "
2196 "range for family {}".format(family.__class__.__name__)
2197 )
2198 # check if P1 has only non-negative values, negative values might
2199 # indicate group lasso in the future.
2200 1 6.0 6.0 0.0 if not isinstance(self.P1, str): # if self.P1 != 'identity':
2201 if not np.all(P1 >= 0):
2202 raise ValueError("P1 must not have negative values.")
2203 # check if P2 is positive semidefinite
2204 # np.linalg.cholesky(P2) 'only' asserts positive definite
2205 1 5.0 5.0 0.0 if not isinstance(self.P2, str): # self.P2 != 'identity'
2206 # due to numerical precision, we allow eigenvalues to be a
2207 # tiny bit negative
2208 epsneg = -10 * np.finfo(P2.dtype).epsneg
2209 if P2.ndim == 1 or P2.shape[0] == 1:
2210 p2 = P2
2211 if sparse.issparse(P2):
2212 p2 = P2.toarray()
2213 if not np.all(p2 >= 0):
2214 raise ValueError(
2215 "1d array P2 must not have negative " "values."
2216 )
2217 elif sparse.issparse(P2):
2218 # for sparse matrices, not all eigenvals can be computed
2219 # efficiently, use only half of n_features
2220 # k = how many eigenvals to compute
2221 k = np.min([10, n_features // 10 + 1])
2222 sigma = -1000 * epsneg # start searching near this value
2223 which = "SA" # find smallest algebraic eigenvalues first
2224 eigenvalues = splinalg.eigsh(
2225 P2, k=k, sigma=sigma, which=which, return_eigenvectors=False
2226 )
2227 if not np.all(eigenvalues >= epsneg):
2228 raise ValueError("P2 must be positive semi-definite.")
2229 else:
2230 if not np.all(linalg.eigvalsh(P2) >= epsneg):
2231 raise ValueError("P2 must be positive semi-definite.")
2232 # TODO: if alpha=0 check that X is not rank deficient
2233 # TODO: what else to check?
2234
2235 #######################################################################
2236 # 2a. rescaling of weights (sample_weight) #
2237 #######################################################################
2238 # IMPORTANT NOTE: Since we want to minimize
2239 # 1/(2*sum(sample_weight)) * deviance + L1 + L2,
2240 # deviance = sum(sample_weight * unit_deviance),
2241 # we rescale weights such that sum(weights) = 1 and this becomes
2242 # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance)
2243 1 351.0 351.0 0.0 weights_sum = np.sum(weights)
2244 1 1916.0 1916.0 0.0 weights = weights / weights_sum
2245
2246 #######################################################################
2247 # 2b. potentially rescale predictors
2248 #######################################################################
2249 1 7.0 7.0 0.0 if self.standardize:
2250 1 356277.0 356277.0 4.2 X, P1, P2, col_means, col_stds = _standardize(X, P1, P2, weights)
2251
2252 #######################################################################
2253 # 3. initialization of coef = (intercept_, coef_) #
2254 #######################################################################
2255 # Note: Since phi=self.dispersion_ does not enter the estimation
2256 # of mu_i=E[y_i], set it to 1.
2257
2258 # set start values for coef
2259 1 11.0 11.0 0.0 if self.warm_start and hasattr(self, "coef_"):
2260 coef = self.coef_
2261 intercept = self.intercept_
2262 if self.fit_intercept:
2263 coef = np.concatenate((np.array([intercept]), coef))
2264 if self.standardize:
2265 _standardize_warm_start(coef, col_means, col_stds)
2266 1 9.0 9.0 0.0 elif isinstance(start_params, str):
2267 1 8.0 8.0 0.0 if start_params == "guess":
2268 # Set mu=starting_mu of the family and do one Newton step
2269 # If solver=cd use cd, else irls
2270 1 7813.0 7813.0 0.1 mu = family.starting_mu(y, weights=weights)
2271 1 21668.0 21668.0 0.3 eta = link.link(mu) # linear predictor
2272 1 11.0 11.0 0.0 if solver in ["cd", "lbfgs", "newton-cg"]:
2273 # see function _cd_solver
2274 1 14785.0 14785.0 0.2 sigma_inv = 1 / family.variance(mu, phi=1, weights=weights)
2275 1 17775.0 17775.0 0.2 d1 = link.inverse_derivative(eta)
2276 1 7121.0 7121.0 0.1 temp = sigma_inv * d1 * (y - mu)
2277 1 11.0 11.0 0.0 if self.fit_intercept:
2278 1 11419.0 11419.0 0.1 score = np.concatenate(([temp.sum()], temp @ X))
2279 else:
2280 score = temp @ X # same as X.T @ temp
2281
2282 1 1965.0 1965.0 0.0 d2_sigma_inv = d1 * d1 * sigma_inv
2283 1 9.0 9.0 0.0 diag_fisher = self.diag_fisher
2284 1 7.0 7.0 0.0 if diag_fisher:
2285 fisher = d2_sigma_inv
2286 else:
2287 2 766539.0 383269.5 9.0 fisher = _safe_sandwich_dot(
2288 1 7.0 7.0 0.0 X, d2_sigma_inv, intercept=self.fit_intercept
2289 )
2290 # set up space for search direction d for inner loop
2291 1 9.0 9.0 0.0 if self.fit_intercept:
2292 1 20.0 20.0 0.0 coef = np.zeros(n_features + 1)
2293 else:
2294 coef = np.zeros(n_features)
2295 1 65.0 65.0 0.0 d = np.zeros_like(coef)
2296 # initial stopping tolerance of inner loop
2297 # use L1-norm of minimum of norm of subgradient of F
2298 # use less restrictive tolerance for initial guess
2299 1 122.0 122.0 0.0 inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1)
2300 1 114.0 114.0 0.0 inner_tol = 4 * linalg.norm(inner_tol, ord=1)
2301 # just one outer loop = Newton step
2302 1 9.0 9.0 0.0 n_cycles = 0
2303 2 6689.0 3344.5 0.1 d, coef_P2, n_cycles, inner_tol = _cd_cycle(
2304 1 8.0 8.0 0.0 d,
2305 1 7.0 7.0 0.0 X,
2306 1 7.0 7.0 0.0 coef,
2307 1 8.0 8.0 0.0 score,
2308 1 8.0 8.0 0.0 fisher,
2309 1 8.0 8.0 0.0 P1,
2310 1 8.0 8.0 0.0 P2,
2311 1 8.0 8.0 0.0 n_cycles,
2312 1 8.0 8.0 0.0 inner_tol,
2313 1 8.0 8.0 0.0 max_inner_iter=1000,
2314 1 9.0 9.0 0.0 selection=self.selection,
2315 1 8.0 8.0 0.0 random_state=random_state,
2316 1 8.0 8.0 0.0 diag_fisher=self.diag_fisher,
2317 )
2318 1 13.0 13.0 0.0 coef += d # for simplicity no line search here
2319 else:
2320 # See _irls_solver
2321 # h'(eta)
2322 hp = link.inverse_derivative(eta)
2323 # working weights W, in principle a diagonal matrix
2324 # therefore here just as 1d array
2325 W = hp ** 2 / family.variance(mu, phi=1, weights=weights)
2326 # working observations
2327 z = eta + (y - mu) / hp
2328 # solve A*coef = b
2329 # A = X' W X + l2 P2, b = X' W z
2330 coef = _irls_step(X, W, P2, z, fit_intercept=self.fit_intercept)
2331 else: # start_params == 'zero'
2332 if self.fit_intercept:
2333 coef = np.zeros(n_features + 1)
2334 coef[0] = link.link(np.average(y, weights=weights))
2335 else:
2336 coef = np.zeros(n_features)
2337 else: # assign given array as start values
2338 coef = start_params
2339 if self.standardize:
2340 _standardize_warm_start(coef, col_means, col_stds)
2341
2342 #######################################################################
2343 # 4. fit #
2344 #######################################################################
2345 # algorithms for optimization
2346 # TODO: Parallelize it?
2347
2348 # 4.1 IRLS ############################################################
2349 # Note: we already set P2 = l2*P2, see above
2350 # Note: we already symmetrized P2 = 1/2 (P2 + P2')
2351 1 8.0 8.0 0.0 if solver == "irls":
2352 coef, self.n_iter_ = _irls_solver(
2353 coef=coef,
2354 X=X,
2355 y=y,
2356 weights=weights,
2357 P2=P2,
2358 fit_intercept=self.fit_intercept,
2359 family=family,
2360 link=link,
2361 max_iter=self.max_iter,
2362 tol=self.tol,
2363 )
2364
2365 # 4.2 L-BFGS ##########################################################
2366 1 8.0 8.0 0.0 elif solver == "lbfgs":
2367
2368 def func(coef, X, y, weights, P2, family, link):
2369 mu, devp = family._mu_deviance_derivative(coef, X, y, weights, link)
2370 dev = family.deviance(y, mu, weights)
2371 intercept = coef.size == X.shape[1] + 1
2372 idx = 1 if intercept else 0 # offset if coef[0] is intercept
2373 if P2.ndim == 1:
2374 L2 = P2 * coef[idx:]
2375 else:
2376 L2 = P2 @ coef[idx:]
2377 obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2)
2378 objp = 0.5 * devp
2379 objp[idx:] += L2
2380 return obj, objp
2381
2382 args = (X, y, weights, P2, family, link)
2383 coef, loss, info = fmin_l_bfgs_b(
2384 func,
2385 coef,
2386 fprime=None,
2387 args=args,
2388 iprint=(self.verbose > 0) - 1,
2389 pgtol=self.tol,
2390 maxiter=self.max_iter,
2391 factr=1e3,
2392 )
2393 if info["warnflag"] == 1:
2394 warnings.warn(
2395 "lbfgs failed to converge." " Increase the number of iterations.",
2396 ConvergenceWarning,
2397 )
2398 elif info["warnflag"] == 2:
2399 warnings.warn("lbfgs failed for the reason: {}".format(info["task"]))
2400 self.n_iter_ = info["nit"]
2401
2402 # 4.3 Newton-CG #######################################################
2403 # We use again the fisher matrix instead of the hessian. More
2404 # precisely, expected hessian of deviance.
2405 1 8.0 8.0 0.0 elif solver == "newton-cg":
2406
2407 def func(coef, X, y, weights, P2, family, link):
2408 intercept = coef.size == X.shape[1] + 1
2409 idx = 1 if intercept else 0 # offset if coef[0] is intercept
2410 if P2.ndim == 1:
2411 L2 = coef[idx:] @ (P2 * coef[idx:])
2412 else:
2413 L2 = coef[idx:] @ (P2 @ coef[idx:])
2414 mu = link.inverse(_safe_lin_pred(X, coef))
2415 return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2
2416
2417 def grad(coef, X, y, weights, P2, family, link):
2418 mu, devp = family._mu_deviance_derivative(coef, X, y, weights, link)
2419 intercept = coef.size == X.shape[1] + 1
2420 idx = 1 if intercept else 0 # offset if coef[0] is intercept
2421 if P2.ndim == 1:
2422 L2 = P2 * coef[idx:]
2423 else:
2424 L2 = P2 @ coef[idx:]
2425 objp = 0.5 * devp
2426 objp[idx:] += L2
2427 return objp
2428
2429 def grad_hess(coef, X, y, weights, P2, family, link):
2430 intercept = coef.size == X.shape[1] + 1
2431 idx = 1 if intercept else 0 # offset if coef[0] is intercept
2432 if P2.ndim == 1:
2433 L2 = P2 * coef[idx:]
2434 else:
2435 L2 = P2 @ coef[idx:]
2436 eta = _safe_lin_pred(X, coef)
2437 mu = link.inverse(eta)
2438 d1 = link.inverse_derivative(eta)
2439 temp = d1 * family.deviance_derivative(y, mu, weights)
2440 if intercept:
2441 grad = np.concatenate(([0.5 * temp.sum()], 0.5 * temp @ X + L2))
2442 else:
2443 grad = 0.5 * temp @ X + L2 # same as 0.5* X.T @ temp + L2
2444
2445 # expected hessian = fisher = X.T @ diag_matrix @ X
2446 # calculate only diag_matrix
2447 diag = d1 ** 2 / family.variance(mu, phi=1, weights=weights)
2448 if intercept:
2449 h0i = np.concatenate(([diag.sum()], diag @ X))
2450
2451 def Hs(coef):
2452 # return (0.5 * fisher + P2) @ coef
2453 # ret = 0.5 * (X.T @ (diag * (X @ coef)))
2454 ret = 0.5 * ((diag * (X @ coef[idx:])) @ X)
2455 if P2.ndim == 1:
2456 ret += P2 * coef[idx:]
2457 else:
2458 ret += P2 @ coef[idx:]
2459 if intercept:
2460 ret = np.concatenate(
2461 ([0.5 * (h0i @ coef)], ret + 0.5 * coef[0] * h0i[1:])
2462 )
2463 return ret
2464
2465 return grad, Hs
2466
2467 args = (X, y, weights, P2, family, link)
2468 coef, self.n_iter_ = newton_cg(
2469 grad_hess,
2470 func,
2471 grad,
2472 coef,
2473 args=args,
2474 maxiter=self.max_iter,
2475 tol=self.tol,
2476 )
2477
2478 # 4.4 coordinate descent ##############################################
2479 # Note: we already set P1 = l1*P1, see above
2480 # Note: we already set P2 = l2*P2, see above
2481 # Note: we already symmetrized P2 = 1/2 (P2 + P2')
2482 1 8.0 8.0 0.0 elif solver == "cd":
2483 2 6952782.0 3476391.0 81.2 coef, self.n_iter_, self._n_cycles = _cd_solver(
2484 1 8.0 8.0 0.0 coef=coef,
2485 1 8.0 8.0 0.0 X=X,
2486 1 8.0 8.0 0.0 y=y,
2487 1 9.0 9.0 0.0 weights=weights,
2488 1 9.0 9.0 0.0 P1=P1,
2489 1 8.0 8.0 0.0 P2=P2,
2490 1 8.0 8.0 0.0 fit_intercept=self.fit_intercept,
2491 1 8.0 8.0 0.0 family=family,
2492 1 8.0 8.0 0.0 link=link,
2493 1 8.0 8.0 0.0 max_iter=self.max_iter,
2494 1 8.0 8.0 0.0 tol=self.tol,
2495 1 8.0 8.0 0.0 selection=self.selection,
2496 1 8.0 8.0 0.0 random_state=random_state,
2497 1 8.0 8.0 0.0 diag_fisher=self.diag_fisher,
2498 1 8.0 8.0 0.0 copy_X=self.copy_X,
2499 )
2500
2501 #######################################################################
2502 # 5a. handle intercept
2503 #######################################################################
2504 1 12.0 12.0 0.0 if self.fit_intercept:
2505 1 12.0 12.0 0.0 self.intercept_ = coef[0]
2506 1 13.0 13.0 0.0 self.coef_ = coef[1:]
2507 else:
2508 # set intercept to zero as the other linear models do
2509 self.intercept_ = 0.0
2510 self.coef_ = coef
2511
2512 #######################################################################
2513 # 5a. undo standardization
2514 #######################################################################
2515 1 9.0 9.0 0.0 if self.standardize:
2516 # TODO: Make sure this doesn't copy X
2517 2 126923.0 63461.5 1.5 X, self.intercept_, self.coef_ = _unstandardize(
2518 1 9.0 9.0 0.0 X, col_means, col_stds, self.intercept_, self.coef_
2519 )
2520
2521 1 13.0 13.0 0.0 if self.fit_dispersion in ["chisqr", "deviance"]:
2522 # attention because of rescaling of weights
2523 self.dispersion_ = self.estimate_phi(X, y, weights) * weights_sum
2524
2525 1 7.0 7.0 0.0 return self
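
The func/grad/grad_hess helpers in the listing above work with the L2-penalized half deviance, obj = 0.5 * deviance + 0.5 * w' P2 w. Below is a minimal, self-contained sketch of that objective for a Gaussian family with identity link (so 0.5 * deviance reduces to a weighted sum of squares), together with a finite-difference check of the gradient. All names here are illustrative; this is not the profiled code.

import numpy as np

rng = np.random.default_rng(0)
n, p = 50, 4
X = rng.normal(size=(n, p))
y = rng.normal(size=n)
weights = np.full(n, 1.0 / n)   # rescaled so that sum(weights) == 1
P2 = 0.1 * np.ones(p)           # diagonal L2 penalty in its 1d form

def toy_objective(w):
    resid = y - X @ w
    return 0.5 * np.sum(weights * resid ** 2) + 0.5 * w @ (P2 * w)

def toy_gradient(w):
    resid = y - X @ w
    return -(X.T @ (weights * resid)) + P2 * w

# finite-difference check of the gradient
w0 = rng.normal(size=p)
eps = 1e-6
fd = np.array([
    (toy_objective(w0 + eps * e) - toy_objective(w0 - eps * e)) / (2 * eps)
    for e in np.eye(p)
])
print(np.allclose(fd, toy_gradient(w0), atol=1e-5))   # True: gradient matches
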
Dim of data: (1000000, 107)
Sparseness: 0.683
Size of data frame: 0.80 GB
*********************************
alpha=1.0, l1_ratio=0.5, sparse=True
Creating sparse matrix...
Using new implementation...
0.09618600747421668 0 0
0.034404141330196404 1 1
0.011721161752972374 2 2
0.0034913993816849454 3 3
0.0007158709705561006 4 4
5.3700190453114465e-05 5 5
44.3958 seconds
gini=0.1111423209994666
mean outcome: 0.04787563746463126
mean prediction: 0.046850885751455344
Wrote profile results to glm_qc.py.lprof
Timer unit: 1e-06 s
Total time: 42.2249 s
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py
Function: _safe_sandwich_dot at line 105
Line # Hits Time Per Hit % Time Line Contents
==============================================================
105 @profile
106 def _safe_sandwich_dot(X, d, intercept=False):
107 """Compute sandwich product X.T @ diag(d) @ X.
108
   109                                                   With ``intercept=True``, X is treated as if a column of ones were appended as
   110                                                   the first column of X.
   111                                                   X can be sparse, d must be an ndarray. Always returns an ndarray."""
112 8 44.0 5.5 0.0 if sparse.issparse(X):
113 # TODO: There's a bit of room to accelerate this by avoiding the
114 # allocation of a new sparse matrix every time we pass through this
115 # step. The X.multiply creates a new matrix. Avoiding the allocation
116 # and updating the rows in place accelerates this line by ~20%.
117 temp = X.transpose() @ X.multiply(d[:, np.newaxis])
118 # for older versions of numpy and scipy, temp may be a np.matrix
119 temp = _safe_toarray(temp)
120 8 15.0 1.9 0.0 elif type(X) is ColScaledSpMat:
121 8 41538672.0 5192334.0 98.4 term1 = X.mat.transpose() @ X.mat.multiply(d[:, np.newaxis])
122 8 341964.0 42745.5 0.8 term2 = X.mat.transpose().dot(d)[:, np.newaxis] * X.shift
123 8 34.0 4.2 0.0 term3 = term2.T
124 8 2622.0 327.8 0.0 term4 = (X.shift.T * X.shift) * d.sum()
125 8 3603.0 450.4 0.0 temp = term1 + term2 + term3 + term4
126 8 86.0 10.8 0.0 temp = _safe_toarray(temp)
127 else:
128 temp = (X.T * d) @ X
129 8 7.0 0.9 0.0 if intercept:
130 8 11.0 1.4 0.0 dim = X.shape[1] + 1
131 8 10.0 1.2 0.0 if type(X) is ColScaledSpMat:
132 8 6.0 0.8 0.0 order = "F"
133 elif sparse.issparse(X):
134 order = "F" if sparse.isspmatrix_csc(X) else "C"
135 else:
136 order = "F" if X.flags["F_CONTIGUOUS"] else "C"
137 8 42.0 5.2 0.0 res = np.empty((dim, dim), dtype=max(X.dtype, d.dtype), order=order)
138 8 2157.0 269.6 0.0 res[0, 0] = d.sum()
139 8 335305.0 41913.1 0.8 res[1:, 0] = d @ X
140 8 83.0 10.4 0.0 res[0, 1:] = res[1:, 0]
141 8 233.0 29.1 0.0 res[1:, 1:] = temp
142 else:
143 res = temp
144 8 7.0 0.9 0.0 return res
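
The dominant cost above is the sandwich product X.T @ diag(d) @ X. A minimal numpy/scipy sketch of the same computation, comparing the dense expression with the sparse row-scaling trick used in the profiled code (variable names here are illustrative only):

import numpy as np
import scipy.sparse as sparse

rng = np.random.default_rng(0)
X = rng.normal(size=(6, 3))
X[X < 0.5] = 0.0                     # make the toy matrix somewhat sparse
d = rng.uniform(0.1, 1.0, size=6)

dense = (X.T * d) @ X                # same as X.T @ np.diag(d) @ X

Xs = sparse.csc_matrix(X)
via_multiply = (Xs.transpose() @ Xs.multiply(d[:, np.newaxis])).toarray()

print(np.allclose(dense, via_multiply))   # True
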
Total time: 37.7536 s
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py
Function: _eta_mu_score_fisher at line 730
Line # Hits Time Per Hit % Time Line Contents
==============================================================
730 @profile
731 def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, diag_fisher=False):
732 """Compute linear predictor, mean, score function and fisher matrix.
733
734 It calculates the linear predictor, the mean, score function
735 (derivative of log-likelihood) and Fisher information matrix
736 all in one go as function of `coef` (:math:`w`) and the data.
737
738 Parameters
739 ----------
740 diag_fisher : boolean, optional (default=False)
741 If ``True``, returns only an array d such that
742 fisher = X.T @ np.diag(d) @ X.
743
744 Returns
745 -------
746 (eta, mu, score, fisher) : tuple with 4 elements
747 The 4 elements are:
748
749 * eta: ndarray, shape (X.shape[0],)
750 * mu: ndarray, shape (X.shape[0],)
751 * score: ndarray, shape (X.shape[0],)
752 * fisher:
753
754 * If diag_fisher is ``False``, the full fisher matrix,
755 an array of shape (X.shape[1], X.shape[1])
   756                                                              * If diag_fisher is ``True``, an array of shape (X.shape[0],)
757 """
758 7 15.0 2.1 0.0 intercept = coef.size == X.shape[1] + 1
759 # eta = linear predictor
760 7 292686.0 41812.3 0.8 eta = _safe_lin_pred(X, coef)
761 7 71816.0 10259.4 0.2 mu = link.inverse(eta)
762
763 # FOR TWEEDIE: sigma_inv = weights / (mu ** p) during optimization bc phi = 1
764 7 61441.0 8777.3 0.2 sigma_inv = 1.0 / self.variance(mu, phi=phi, weights=weights)
765 7 70751.0 10107.3 0.2 d1 = link.inverse_derivative(eta) # = h'(eta)
766 # Alternatively:
767 # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g
768 # d1 = 1./link.derivative(mu)
769 7 11318.0 1616.9 0.0 d1_sigma_inv = d1 * sigma_inv
770 7 17883.0 2554.7 0.0 temp = d1_sigma_inv * (y - mu)
771 7 13.0 1.9 0.0 if intercept:
772 7 297231.0 42461.6 0.8 score = np.concatenate(([temp.sum()], temp @ X))
773 else:
774 score = temp @ X
775
776 7 12631.0 1804.4 0.0 d2_sigma_inv = d1 * d1_sigma_inv
777 7 13.0 1.9 0.0 if diag_fisher:
778 fisher_matrix = d2_sigma_inv
779 else:
780 7 36917769.0 5273967.0 97.8 fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, intercept=intercept)
781 7 9.0 1.3 0.0 return eta, mu, score, fisher_matrix
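
For a Poisson family with log link and phi = 1 (the case benchmarked here), the quantities returned above reduce to a few elementwise operations plus one matrix product for the score. A minimal sketch with illustrative names (not the library code):

import numpy as np

rng = np.random.default_rng(0)
n, p = 8, 3
X = rng.normal(size=(n, p))
coef = rng.normal(size=p)
y = rng.poisson(1.0, size=n).astype(float)
weights = np.full(n, 1.0 / n)

eta = X @ coef                    # linear predictor
mu = np.exp(eta)                  # inverse log link
sigma_inv = weights / mu          # 1 / Var with phi = 1 (Poisson: Var = phi * mu / w)
d1 = mu                           # h'(eta) = exp(eta) for the log link
score = (d1 * sigma_inv * (y - mu)) @ X
fisher_diag = d1 ** 2 * sigma_inv # Fisher matrix = X.T @ np.diag(fisher_diag) @ X

print(score.shape, fisher_diag.shape)
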
Total time: 0.019522 s
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py
Function: _cd_cycle at line 1131
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1131 @profile
1132 def _cd_cycle(
1133 d,
1134 X,
1135 coef,
1136 score,
1137 fisher,
1138 P1,
1139 P2,
1140 n_cycles,
1141 inner_tol,
1142 max_inner_iter=1000,
1143 selection="cyclic",
1144 random_state=None,
1145 diag_fisher=False,
1146 ):
1147 """Compute inner loop of coordinate descent, i.e. cycles through features.
1148
1149 Minimization of 1-d subproblems::
1150
1151 min_z q(d+z*e_j) - q(d)
1152 = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1
1153
1154 A = f'(w) + d*H(w) + (w+d)*P2
1155 B = H+P2
1156 Note: f'=-score and H=fisher are updated at the end of outer iteration.
1157 """
1158 # TODO: use sparsity (coefficient already 0 due to L1 penalty)
1159 # => active set of features for featurelist, see paper
1160 # of Improved GLMNET or Gap Safe Screening Rules
1161 # https://arxiv.org/abs/1611.05780
1162 7 15.0 2.1 0.1 n_samples, n_features = X.shape
1163 7 17.0 2.4 0.1 intercept = coef.size == X.shape[1] + 1
1164 7 10.0 1.4 0.1 idx = 1 if intercept else 0 # offset if coef[0] is intercept
1165 # ES: This is weird. Should this be B = fisher.copy()? Why would you do this?
1166 7 7.0 1.0 0.0 B = fisher
1167 7 10.0 1.4 0.1 if P2.ndim == 1:
1168 coef_P2 = coef[idx:] * P2
1169 if not diag_fisher:
1170 idiag = np.arange(start=idx, stop=B.shape[0])
1171 # B[np.diag_indices_from(B)] += P2
1172 B[(idiag, idiag)] += P2
1173 else:
1174 7 1828.0 261.1 9.4 coef_P2 = coef[idx:] @ P2
1175 7 12.0 1.7 0.1 if not diag_fisher:
1176 7 23.0 3.3 0.1 if sparse.issparse(P2):
1177 7 602.0 86.0 3.1 B[idx:, idx:] += P2.toarray()
1178 else:
1179 B[idx:, idx:] += P2
1180 7 28.0 4.0 0.1 A = -score
1181 7 37.0 5.3 0.2 A[idx:] += coef_P2
1182 # A += d @ (H+P2) but so far d=0
1183 # inner loop
1184 7 21.0 3.0 0.1 for inner_iter in range(1, max_inner_iter + 1):
1185 7 9.0 1.3 0.0 inner_iter += 1
1186 7 7.0 1.0 0.0 n_cycles += 1
1187 # cycle through features, update intercept separately at the end
1188 7 8.0 1.1 0.0 if selection == "random":
1189 7 152.0 21.7 0.8 featurelist = random_state.permutation(n_features)
1190 else:
1191 featurelist = np.arange(n_features)
1192 756 833.0 1.1 4.3 for j in featurelist:
1193 # minimize_z: a z + 1/2 b z^2 + c |d+z|
1194 # a = A_j
1195 # b = B_jj > 0
1196 # c = |P1_j| = P1_j > 0, see 1.3
1197 # d = w_j + d_j
1198 # cf. https://arxiv.org/abs/0708.1485 Eqs. (3) - (4)
1199 # with beta = z+d, beta_hat = d-a/b and gamma = c/b
1200 # z = 1/b * S(bd-a,c) - d
1201 # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding
1202 749 962.0 1.3 4.9 jdx = j + idx # index for arrays containing entries for intercept
1203 749 851.0 1.1 4.4 a = A[jdx]
1204 749 688.0 0.9 3.5 if diag_fisher:
1205 # Note: fisher is ndarray of shape (n_samples,) => no idx
1206 # Calculate Bj = B[j, :] = B[:, j] as it is needed later anyway
1207 Bj = np.zeros_like(A)
1208 if intercept:
1209 Bj[0] = fisher.sum()
1210 if sparse.issparse(X):
1211 Bj[idx:] = _safe_toarray(
1212 X[:, j].transpose() @ X.multiply(fisher[:, np.newaxis])
1213 ).ravel()
1214 else:
1215 Bj[idx:] = (fisher * X[:, j]) @ X
1216
1217 if P2.ndim == 1:
1218 Bj[idx:] += P2[j]
1219 else:
1220 if sparse.issparse(P2):
1221 # slice columns as P2 is csc
1222 Bj[idx:] += P2[:, j].toarray().ravel()
1223 else:
1224 Bj[idx:] += P2[:, j]
1225 b = Bj[jdx]
1226 else:
1227 749 878.0 1.2 4.5 b = B[jdx, jdx]
1228
1229 # those ten lines are what it is all about
1230 749 913.0 1.2 4.7 if b <= 0:
1231 z = 0
1232 749 1028.0 1.4 5.3 elif P1[j] == 0:
1233 z = -a / b
1234 749 1434.0 1.9 7.3 elif a + P1[j] < b * (coef[jdx] + d[jdx]):
1235 z = -(a + P1[j]) / b
1236 749 1370.0 1.8 7.0 elif a - P1[j] > b * (coef[jdx] + d[jdx]):
1237 z = -(a - P1[j]) / b
1238 else:
1239 749 1096.0 1.5 5.6 z = -(coef[jdx] + d[jdx])
1240
1241 # update direction d
1242 749 1004.0 1.3 5.1 d[jdx] += z
1243 # update A because d_j is now d_j+z
1244 # A = f'(w) + d*H(w) + (w+d)*P2
1245 # => A += (H+P2)*e_j z = B_j * z
1246 # Note: B is symmetric B = B.transpose
1247 749 681.0 0.9 3.5 if diag_fisher:
1248 # Bj = B[:, j] calculated above, still valid
1249 A += Bj * z
1250 else:
1251 # B is symmetric, C- or F-contiguous, but never sparse
1252 749 878.0 1.2 4.5 if B.flags["F_CONTIGUOUS"]:
1253 # slice columns like for sparse csc
1254 749 3088.0 4.1 15.8 A += B[:, jdx] * z
1255 else: # B.flags['C_CONTIGUOUS'] might be true
1256 # slice rows
1257 A += B[jdx, :] * z
1258 # end of cycle over features
1259 # update intercept
1260 7 7.0 1.0 0.0 if intercept:
1261 7 7.0 1.0 0.0 if diag_fisher:
1262 Bj = np.zeros_like(A)
1263 Bj[0] = fisher.sum()
1264 Bj[1:] = fisher @ X
1265 b = Bj[0]
1266 else:
1267 7 13.0 1.9 0.1 b = B[0, 0]
1268 7 15.0 2.1 0.1 z = 0 if b <= 0 else -A[0] / b
1269 7 11.0 1.6 0.1 d[0] += z
1270 7 6.0 0.9 0.0 if diag_fisher:
1271 A += Bj * z
1272 else:
1273 7 8.0 1.1 0.0 if B.flags["F_CONTIGUOUS"]:
1274 7 30.0 4.3 0.2 A += B[:, 0] * z
1275 else:
1276 A += B[0, :] * z
1277 # end of complete cycle
1278 # stopping criterion for inner loop
1279 # sum_i(|minimum of norm of subgrad of q(d)_i|)
1280 # subgrad q(d) = A + subgrad ||P1*(w+d)||_1
1281 7 442.0 63.1 2.3 mn_subgrad = _min_norm_sugrad(coef=coef + d, grad=A, P2=None, P1=P1)
1282 7 454.0 64.9 2.3 mn_subgrad = linalg.norm(mn_subgrad, ord=1)
1283 7 13.0 1.9 0.1 if mn_subgrad <= inner_tol:
1284 7 8.0 1.1 0.0 if inner_iter == 1:
1285 inner_tol = inner_tol / 4.0
1286 7 9.0 1.3 0.0 break
1287 # end of inner loop
1288 7 9.0 1.3 0.0 return d, coef_P2, n_cycles, inner_tol
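
The ten lines flagged above ("those ten lines are what it is all about") solve the one-dimensional subproblem min_z a*z + 1/2*b*z^2 + c*|w + z| by soft thresholding, where w plays the role of coef[jdx] + d[jdx] and c the role of P1[j]. A standalone sketch of that update rule (hypothetical helper name, same branch structure as the profiled code):

def soft_threshold_update(a, b, c, w):
    """Minimizer z of a*z + 0.5*b*z**2 + c*abs(w + z) for b > 0, c >= 0."""
    if b <= 0:
        return 0.0
    if c == 0:
        return -a / b                 # plain Newton step, no L1 penalty
    if a + c < b * w:
        return -(a + c) / b
    if a - c > b * w:
        return -(a - c) / b
    return -w                         # coordinate is shrunk exactly to zero

print(soft_threshold_update(a=2.0, b=4.0, c=0.0, w=0.0))   # -0.5
print(soft_threshold_update(a=2.0, b=4.0, c=3.0, w=0.0))   #  0.0 (L1 keeps it at zero)
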
Total time: 38.1615 s
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py
Function: _cd_solver at line 1291
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1291 @profile
1292 def _cd_solver(
1293 coef,
1294 X,
1295 y,
1296 weights,
1297 P1,
1298 P2,
1299 fit_intercept,
1300 family,
1301 link,
1302 max_iter=100,
1303 max_inner_iter=1000,
1304 tol=1e-4,
  1305                                               selection="cyclic",
1306 random_state=None,
1307 diag_fisher=False,
1308 copy_X=True,
1309 ):
1310 """Solve GLM with L1 and L2 penalty by coordinate descent algorithm.
1311
1312 The objective being minimized in the coefficients w=coef is::
1313
1314 F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1
1315
1316 An Improved GLMNET for L1-regularized Logistic Regression:
1317
1318 1. Find optimal descent direction d by minimizing
1319 min_d F(w+d) = min_d F(w+d) - F(w)
1320 2. Quadratic approximation of F(w+d)-F(w) = q(d):
1321 using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives:
1322 q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d
1323 + ||P1*(w+d)||_1 - ||P1*w||_1
1324 Then minimize q(d): min_d q(d)
1325 3. Coordinate descent by updating coordinate j (d -> d+z*e_j):
1326 min_z q(d+z*e_j)
1327 = min_z q(d+z*e_j) - q(d)
1328 = min_z A_j z + 1/2 B_jj z^2
1329 + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1
1330 A = f'(w) + d*H(w) + (w+d)*P2
1331 B = H + P2
1332
1333 Repeat steps 1-3 until convergence.
1334 Note: Use Fisher matrix instead of Hessian for H.
1335 Note: f' = -score, H = Fisher matrix
1336
1337 Parameters
1338 ----------
1339 coef : ndarray, shape (c,)
1340 If fit_intercept=False, shape c=X.shape[1].
1341 If fit_intercept=True, then c=X.shape[1] + 1.
1342
1343 X : {ndarray, csc sparse matrix}, shape (n_samples, n_features)
1344 Training data (with intercept included if present). If not sparse,
1345 pass directly as Fortran-contiguous data to avoid
1346 unnecessary memory duplication.
1347
1348 y : ndarray, shape (n_samples,)
1349 Target values.
1350
1351 weights: ndarray, shape (n_samples,)
1352 Sample weights with which the deviance is weighted. The weights must
  1353                                                   be normalized and sum to 1.
1354
1355 P1 : {ndarray}, shape (n_features,)
1356 The L1-penalty vector (=diagonal matrix)
1357
1358 P2 : {ndarray, csc sparse matrix}, shape (n_features, n_features)
1359 The L2-penalty matrix or vector (=diagonal matrix). If a matrix is
1360 passed, it must be symmetric. If X is sparse, P2 must also be sparse.
1361
1362 fit_intercept : boolean, optional (default=True)
1363 Specifies if a constant (a.k.a. bias or intercept) should be
1364 added to the linear predictor (X*coef+intercept).
1365
1366 family : ExponentialDispersionModel
1367
1368 link : Link
1369
1370 max_iter : int, optional (default=100)
  1371                                                   Maximum number of outer (Newton) iterations.
1372
1373 max_inner_iter : int, optional (default=1000)
1374 Maximum number of iterations in each inner loop, i.e. max number of
1375 cycles over all features per inner loop.
1376
1377 tol : float, optional (default=1e-4)
1378 Convergence criterion is
1379 sum_i(|minimum of norm of subgrad of objective_i|)<=tol.
1380
1381 selection : str, optional (default='cyclic')
  1382                                                   If 'random', randomly choose features in the inner loop.
1383
1384 random_state : {int, RandomState instance, None}, optional (default=None)
1385
1386 diag_fisher : boolean, optional (default=False)
1387 ``False`` calculates full fisher matrix, ``True`` only diagonal matrix
1388 s.t. fisher = X.T @ diag @ X. This saves storage but needs more
1389 matrix-vector multiplications.
1390
1391 copy_X : boolean, optional (default=True)
1392 If ``True``, X will be copied; else, it may be overwritten.
1393
1394 Returns
1395 -------
1396 coef : ndarray, shape (c,)
1397 If fit_intercept=False, shape c=X.shape[1].
1398 If fit_intercept=True, then c=X.shape[1] + 1.
1399
1400 n_iter : number of outer iterations = newton iterations
1401
1402 n_cycles : number of cycles over features
1403
1404 References
1405 ----------
1406 Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin
1407 An Improved GLMNET for L1-regularized Logistic Regression,
1408 Journal of Machine Learning Research 13 (2012) 1999-2030
1409 https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf
1410 """
1411 1 3.0 3.0 0.0 if type(X) is not ColScaledSpMat:
1412 # TODO: because copy_X is being passed here and is also being passed in
1413 # check_X_y in fit(...), X is being copied twice. Check if this is true.
1414 X = check_array(
1415 X, "csc", dtype=[np.float64, np.float32], order="F", copy=copy_X
1416 )
1417 1 3.0 3.0 0.0 if P2.ndim == 2:
1418 2 457.0 228.5 0.0 P2 = check_array(
1419 1 4.0 4.0 0.0 P2, "csc", dtype=[np.float64, np.float32], order="F", copy=copy_X
1420 )
1421 1 5.0 5.0 0.0 if sparse.issparse(X):
1422 if not sparse.isspmatrix_csc(P2):
1423 raise ValueError(
1424 "If X is sparse, P2 must also be sparse csc"
1425 "format. Got P2 not sparse."
1426 )
1427 1 12.0 12.0 0.0 random_state = check_random_state(random_state)
1428 # Note: we already set P2 = l2*P2, P1 = l1*P1
1429 # Note: we already symmetrized P2 = 1/2 (P2 + P2')
1430 1 3.0 3.0 0.0 n_iter = 0 # number of outer iterations
1431 1 2.0 2.0 0.0 n_cycles = 0 # number of (complete) cycles over features
1432 1 1.0 1.0 0.0 converged = False
1433 1 2.0 2.0 0.0 n_samples, n_features = X.shape
1434 1 2.0 2.0 0.0 idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept
1435 # line search parameters
1436 1 2.0 2.0 0.0 (beta, sigma) = (0.5, 0.01)
1437 # some precalculations
1438 # Note: For diag_fisher=False, fisher = X.T @ fisher @ X and fisher is a
1439 # 1d array representing a diagonal matrix.
1440 2 5409710.0 2704855.0 14.2 eta, mu, score, fisher = family._eta_mu_score_fisher(
1441 1 2.0 2.0 0.0 coef=coef, phi=1, X=X, y=y, weights=weights, link=link, diag_fisher=diag_fisher
1442 )
1443 # set up space for search direction d for inner loop
1444 1 31.0 31.0 0.0 d = np.zeros_like(coef)
1445
1446 # minimum subgradient norm
1447 1 3.0 3.0 0.0 def calc_mn_subgrad_norm():
1448 return linalg.norm(
1449 _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1), ord=1
1450 )
1451
1452 # the ratio of inner _cd_cycle tolerance to the minimum subgradient norm
1453 # This wasn't explored in the newGLMNET paper linked above.
1454 # That paper essentially uses inner_tol_ratio = 1.0, but using a slightly
1455 # lower value is much faster.
1456 # By comparison, the original GLMNET paper uses inner_tol = tol.
1457 # So, inner_tol_ratio < 1 is sort of a compromise between the two papers.
1458 # The value should probably be between 0.01 and 0.5. 0.1 works well for many problems
1459 1 2.0 2.0 0.0 inner_tol_ratio = 0.1
1460
1461 1 2.0 2.0 0.0 def calc_inner_tol(mn_subgrad_norm):
1462 # Another potential rule limits the inner tol to be no smaller than tol
1463 # return max(mn_subgrad_norm * inner_tol_ratio, tol)
1464 return mn_subgrad_norm * inner_tol_ratio
1465
1466 # initial stopping tolerance of inner loop
1467 # use L1-norm of minimum of norm of subgradient of F
1468 1 417.0 417.0 0.0 inner_tol = calc_inner_tol(calc_mn_subgrad_norm())
1469
1470 1 2.0 2.0 0.0 Fw = None
1471
1472 # outer loop
1473 6 12.0 2.0 0.0 while n_iter < max_iter:
1474 6 2301.0 383.5 0.0 print(inner_tol, n_iter, n_cycles)
1475 6 19.0 3.2 0.0 n_iter += 1
1476 # initialize search direction d (to be optimized) with zero
1477 6 35.0 5.8 0.0 d.fill(0)
1478 # inner loop = _cd_cycle
1479 12 27764.0 2313.7 0.1 d, coef_P2, n_cycles, inner_tol = _cd_cycle(
1480 6 13.0 2.2 0.0 d,
1481 6 13.0 2.2 0.0 X,
1482 6 14.0 2.3 0.0 coef,
1483 6 13.0 2.2 0.0 score,
1484 6 13.0 2.2 0.0 fisher,
1485 6 14.0 2.3 0.0 P1,
1486 6 13.0 2.2 0.0 P2,
1487 6 13.0 2.2 0.0 n_cycles,
1488 6 13.0 2.2 0.0 inner_tol,
1489 6 15.0 2.5 0.0 max_inner_iter=max_inner_iter,
1490 6 15.0 2.5 0.0 selection=selection,
1491 6 15.0 2.5 0.0 random_state=random_state,
1492 6 13.0 2.2 0.0 diag_fisher=diag_fisher,
1493 )
1494
1495 # line search by sequence beta^k, k=0, 1, ..
1496 # F(w + lambda d) - F(w) <= lambda * bound
1497 # bound = sigma * (f'(w)*d + w*P2*d
1498 # +||P1 (w+d)||_1 - ||P1 w||_1)
1499 6 247.0 41.2 0.0 P1w_1 = linalg.norm(P1 * coef[idx:], ord=1)
1500 6 202.0 33.7 0.0 P1wd_1 = linalg.norm(P1 * (coef + d)[idx:], ord=1)
1501 # Note: coef_P2 already calculated and still valid
1502 6 86.0 14.3 0.0 bound = sigma * (-(score @ d) + coef_P2 @ d[idx:] + P1wd_1 - P1w_1)
1503
1504 # In the first iteration, we must compute Fw explicitly.
1505 # In later iterations, we just use Fwd from the previous iteration
1506 # as set after the line search loop below.
1507 6 12.0 2.0 0.0 if Fw is None:
1508 1 2.0 2.0 0.0 Fw = (
1509 3 8018.0 2672.7 0.0 0.5 * family.deviance(y, mu, weights)
1510 1 17.0 17.0 0.0 + 0.5 * (coef_P2 @ coef[idx:])
1511 1 2.0 2.0 0.0 + P1w_1
1512 )
1513
1514 6 25.0 4.2 0.0 la = 1.0 / beta
1515
1516 6 247723.0 41287.2 0.6 X_dot_d = _safe_lin_pred(X, d)
1517
1518 # Try progressively shorter line search steps.
1519 6 49.0 8.2 0.0 for k in range(20):
1520 6 19.0 3.2 0.0 la *= beta # starts with la=1
1521 6 76.0 12.7 0.0 coef_wd = coef + la * d
1522
1523 # The simple version of the next line is:
1524 # mu_wd = link.inverse(_safe_lin_pred(X, coef_wd))
1525 # but because coef_wd can be factored as
1526 # coef_wd = coef + la * d
1527 # we can rewrite to only perform one dot product with the data
1528 # matrix per loop which is substantially faster
1529 6 69558.0 11593.0 0.2 mu_wd = link.inverse(eta + la * X_dot_d)
1530
1531 # TODO - optimize: for Tweedie that isn't one of the special cases
1532 # (gaussian, poisson, gamma), family.deviance is quite slow! Can we
1533 # fix that somehow?
1534 12 46967.0 3913.9 0.1 Fwd = 0.5 * family.deviance(y, mu_wd, weights) + linalg.norm(
1535 6 61.0 10.2 0.0 P1 * coef_wd[idx:], ord=1
1536 )
1537 6 20.0 3.3 0.0 if P2.ndim == 1:
1538 Fwd += 0.5 * ((coef_wd[idx:] * P2) @ coef_wd[idx:])
1539 else:
1540 6 514.0 85.7 0.0 Fwd += 0.5 * (coef_wd[idx:] @ (P2 @ coef_wd[idx:]))
1541 6 23.0 3.8 0.0 if Fwd - Fw <= sigma * la * bound:
1542 6 12.0 2.0 0.0 break
1543
1544 # Fw in the next iteration will be equal to Fwd this iteration.
1545 6 13.0 2.2 0.0 Fw = Fwd
1546
1547 # update coefficients
1548 6 77.0 12.8 0.0 coef += la * d
1549
1550 # calculate eta, mu, score, Fisher matrix for next iteration
1551 12 32344270.0 2695355.8 84.8 eta, mu, score, fisher = family._eta_mu_score_fisher(
1552 6 12.0 2.0 0.0 coef=coef,
1553 6 11.0 1.8 0.0 phi=1,
1554 6 12.0 2.0 0.0 X=X,
1555 6 12.0 2.0 0.0 y=y,
1556 6 12.0 2.0 0.0 weights=weights,
1557 6 12.0 2.0 0.0 link=link,
1558 6 12.0 2.0 0.0 diag_fisher=diag_fisher,
1559 )
1560
1561 # stopping criterion for outer loop
1562 # sum_i(|minimum-norm of subgrad of F(w)_i|)
1563 # fp_wP2 = f'(w) + w*P2
1564 # Note: eta, mu and score are already updated
1565 # this also updates the inner tolerance for the next loop!
1566 6 2366.0 394.3 0.0 mn_subgrad_norm = calc_mn_subgrad_norm()
1567 6 24.0 4.0 0.0 if mn_subgrad_norm <= tol:
1568 1 2.0 2.0 0.0 converged = True
1569 1 2.0 2.0 0.0 break
1570
1571 5 18.0 3.6 0.0 inner_tol = calc_inner_tol(mn_subgrad_norm)
1572 # end of outer loop
1573
1574 1 2.0 2.0 0.0 if not converged:
1575 warnings.warn(
1576 "Coordinate descent failed to converge. Increase"
1577 " the maximum number of iterations max_iter"
1578 " (currently {})".format(max_iter),
1579 ConvergenceWarning,
1580 )
1581
1582 1 2.0 2.0 0.0 return coef, n_iter, n_cycles
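
The line search in the outer loop above factors the candidate predictor as eta + la * (X @ d), so only one matrix-vector product is needed per Newton step rather than one per backtracking trial. A minimal sketch of that Armijo-style backtracking for a Gaussian deviance with no penalty terms (illustrative names, not the gist's code):

import numpy as np

rng = np.random.default_rng(0)
n, p = 40, 3
X = rng.normal(size=(n, p))
y = rng.normal(size=n)
coef = np.zeros(p)
d = np.linalg.lstsq(X, y - X @ coef, rcond=None)[0]   # some descent direction

def half_deviance(mu):
    return 0.5 * np.mean((y - mu) ** 2)               # weights = 1/n

beta, sigma = 0.5, 0.01
eta = X @ coef
X_dot_d = X @ d                        # computed once, reused for every trial step
Fw = half_deviance(eta)
bound = -(X.T @ (y - eta) / n) @ d     # directional derivative f'(w) @ d (no penalties here)

la = 1.0 / beta
for _ in range(20):
    la *= beta                         # la = 1, 0.5, 0.25, ...
    Fwd = half_deviance(eta + la * X_dot_d)
    if Fwd - Fw <= sigma * la * bound: # sufficient-decrease condition
        break
print(la, Fw, Fwd)
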
Total time: 44.3836 s
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py
Function: fit at line 1884
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1884 @profile
1885 def fit(self, X, y, sample_weight=None):
1886 """Fit a Generalized Linear Model.
1887
1888 Parameters
1889 ----------
1890 X : {array-like, sparse matrix}, shape (n_samples, n_features)
1891 Training data.
1892
1893 y : array-like, shape (n_samples,)
1894 Target values.
1895
1896 sample_weight : {None, array-like}, shape (n_samples,),\
1897 optional (default=None)
1898 Individual weights w_i for each sample. Note that for an
1899 Exponential Dispersion Model (EDM), one has
1900 Var[Y_i]=phi/w_i * v(mu).
1901 If Y_i ~ EDM(mu, phi/w_i), then
1902 sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a
1903 weighted average with weights=sample_weight.
1904
1905 Returns
1906 -------
1907 self : returns an instance of self.
1908 """
1909 # 99.8% of time is in the _cd_solver function
1910 #######################################################################
1911 # 1. input validation #
1912 #######################################################################
1913 # 1.1 validate arguments of __init__ ##################################
1914 # Guarantee that self._family_instance is an instance of class
1915 # ExponentialDispersionModel
1916 1 57.0 57.0 0.0 if isinstance(self.family, ExponentialDispersionModel):
1917 self._family_instance = self.family
1918 else:
1919 1 5.0 5.0 0.0 if self.family == "normal":
1920 self._family_instance = NormalDistribution()
1921 1 5.0 5.0 0.0 elif self.family == "poisson":
1922 1 25.0 25.0 0.0 self._family_instance = PoissonDistribution()
1923 elif self.family == "gamma":
1924 self._family_instance = GammaDistribution()
1925 elif self.family == "inverse.gaussian":
1926 self._family_instance = InverseGaussianDistribution()
1927 elif self.family == "binomial":
1928 self._family_instance = BinomialDistribution()
1929 else:
1930 raise ValueError(
1931 "The family must be an instance of class"
1932 " ExponentialDispersionModel or an element of"
1933 " ['normal', 'poisson', 'gamma', 'inverse.gaussian', "
1934 "'binomial']; got (family={})".format(self.family)
1935 )
1936
1937 # Guarantee that self._link_instance is set to an instance of
1938 # class Link
1939 1 15.0 15.0 0.0 if isinstance(self.link, Link):
1940 self._link_instance = self.link
1941 else:
1942 1 5.0 5.0 0.0 if self.link == "auto":
1943 1 7.0 7.0 0.0 if isinstance(self._family_instance, TweedieDistribution):
1944 1 6.0 6.0 0.0 if self._family_instance.power <= 0:
1945 self._link_instance = IdentityLink()
1946 1 5.0 5.0 0.0 if self._family_instance.power >= 1:
1947 1 8.0 8.0 0.0 self._link_instance = LogLink()
1948 elif isinstance(self._family_instance, GeneralizedHyperbolicSecant):
1949 self._link_instance = IdentityLink()
1950 elif isinstance(self._family_instance, BinomialDistribution):
1951 self._link_instance = LogitLink()
1952 else:
1953 raise ValueError(
1954 "No default link known for the "
1955 "specified distribution family. Please "
1956 "set link manually, i.e. not to 'auto'; "
1957 "got (link='auto', family={}".format(self.family)
1958 )
1959 elif self.link == "identity":
1960 self._link_instance = IdentityLink()
1961 elif self.link == "log":
1962 self._link_instance = LogLink()
1963 elif self.link == "logit":
1964 self._link_instance = LogitLink()
1965 else:
1966 raise ValueError(
1967 "The link must be an instance of class Link or "
1968 "an element of ['auto', 'identity', 'log', 'logit']; "
1969 "got (link={})".format(self.link)
1970 )
1971
1972 1 14.0 14.0 0.0 if not isinstance(self.alpha, numbers.Number) or self.alpha < 0:
1973 raise ValueError(
1974 "Penalty term must be a non-negative number;"
1975 " got (alpha={})".format(self.alpha)
1976 )
1977 3 15.0 5.0 0.0 if (
1978 1 5.0 5.0 0.0 not isinstance(self.l1_ratio, numbers.Number)
1979 1 5.0 5.0 0.0 or self.l1_ratio < 0
1980 1 5.0 5.0 0.0 or self.l1_ratio > 1
1981 ):
1982 raise ValueError(
1983 "l1_ratio must be a number in interval [0, 1];"
1984 " got (l1_ratio={})".format(self.l1_ratio)
1985 )
1986 1 5.0 5.0 0.0 if not isinstance(self.fit_intercept, bool):
1987 raise ValueError(
1988 "The argument fit_intercept must be bool;"
1989 " got {}".format(self.fit_intercept)
1990 )
1991 1 5.0 5.0 0.0 if self.solver not in ["auto", "irls", "lbfgs", "newton-cg", "cd"]:
1992 raise ValueError(
1993 "GeneralizedLinearRegressor supports only solvers"
1994 " 'auto', 'irls', 'lbfgs', 'newton-cg' and 'cd';"
1995 " got {}".format(self.solver)
1996 )
1997 1 4.0 4.0 0.0 solver = self.solver
1998 1 5.0 5.0 0.0 if self.solver == "auto":
1999 1 5.0 5.0 0.0 if self.l1_ratio == 0:
2000 solver = "irls"
2001 else:
2002 1 5.0 5.0 0.0 solver = "cd"
2003 1 5.0 5.0 0.0 if self.alpha > 0 and self.l1_ratio > 0 and solver not in ["cd"]:
2004 raise ValueError(
2005 "The chosen solver (solver={}) can't deal "
2006 "with L1 penalties, which are included with "
2007 "(alpha={}) and (l1_ratio={}).".format(
2008 solver, self.alpha, self.l1_ratio
2009 )
2010 )
2011 1 6.0 6.0 0.0 if not isinstance(self.max_iter, int) or self.max_iter <= 0:
2012 raise ValueError(
  2013                                                           "Maximum number of iterations must be a positive "
2014 "integer;"
2015 " got (max_iter={!r})".format(self.max_iter)
2016 )
2017 1 6.0 6.0 0.0 if not isinstance(self.tol, numbers.Number) or self.tol <= 0:
2018 raise ValueError(
2019 "Tolerance for stopping criteria must be "
2020 "positive; got (tol={!r})".format(self.tol)
2021 )
2022 1 5.0 5.0 0.0 if not isinstance(self.warm_start, bool):
2023 raise ValueError(
2024 "The argument warm_start must be bool;"
2025 " got {}".format(self.warm_start)
2026 )
2027 1 4.0 4.0 0.0 if self.selection not in ["cyclic", "random"]:
2028 raise ValueError(
2029 "The argument selection must be 'cyclic' or "
2030 "'random'; got (selection={})".format(self.selection)
2031 )
2032 1 379.0 379.0 0.0 random_state = check_random_state(self.random_state)
2033 1 6.0 6.0 0.0 if not isinstance(self.diag_fisher, bool):
2034 raise ValueError(
2035 "The argument diag_fisher must be bool;"
2036 " got {}".format(self.diag_fisher)
2037 )
2038 1 5.0 5.0 0.0 if not isinstance(self.copy_X, bool):
2039 raise ValueError(
2040 "The argument copy_X must be bool;" " got {}".format(self.copy_X)
2041 )
2042 1 5.0 5.0 0.0 if not isinstance(self.check_input, bool):
2043 raise ValueError(
2044 "The argument check_input must be bool; got "
2045 "(check_input={})".format(self.check_input)
2046 )
2047
2048 1 5.0 5.0 0.0 family = self._family_instance
2049 1 5.0 5.0 0.0 link = self._link_instance
2050
2051 # 1.2 validate arguments of fit #######################################
2052 1 6.0 6.0 0.0 _dtype = [np.float64, np.float32]
2053 1 5.0 5.0 0.0 if solver == "cd":
2054 1 4.0 4.0 0.0 _stype = ["csc"]
2055 else:
2056 _stype = ["csc", "csr"]
2057 2 122584.0 61292.0 0.3 X, y = check_X_y(
2058 1 4.0 4.0 0.0 X,
2059 1 5.0 5.0 0.0 y,
2060 1 5.0 5.0 0.0 accept_sparse=_stype,
2061 1 4.0 4.0 0.0 dtype=_dtype,
2062 1 5.0 5.0 0.0 y_numeric=True,
2063 1 4.0 4.0 0.0 multi_output=False,
2064 1 5.0 5.0 0.0 copy=self.copy_X,
2065 )
2066 # Without converting y to float, deviance might raise
2067 # ValueError: Integers to negative integer powers are not allowed.
2068 # Also, y must not be sparse.
2069 1 10.0 10.0 0.0 y = np.asarray(y, dtype=np.float64)
2070
2071 1 1781.0 1781.0 0.0 weights = _check_weights(sample_weight, y.shape[0])
2072
2073 1 9.0 9.0 0.0 n_samples, n_features = X.shape
2074
2075 # 1.3 arguments to take special care ##################################
2076 # P1, P2, start_params
2077 1 7.0 7.0 0.0 if isinstance(self.P1, str) and self.P1 == "identity":
2078 1 26.0 26.0 0.0 P1 = np.ones(n_features)
2079 else:
2080 P1 = np.atleast_1d(self.P1)
2081 try:
2082 P1 = P1.astype(np.float64, casting="safe", copy=False)
2083 except TypeError:
2084 raise TypeError(
2085 "The given P1 cannot be converted to a numeric"
2086 "array; got (P1.dtype={}).".format(P1.dtype)
2087 )
2088 if (P1.ndim != 1) or (P1.shape[0] != n_features):
2089 raise ValueError(
2090 "P1 must be either 'identity' or a 1d array "
2091 "with the length of X.shape[1]; "
2092 "got (P1.shape[0]={}), "
2093 "needed (X.shape[1]={}).".format(P1.shape[0], n_features)
2094 )
2095 # If X is sparse, make P2 sparse, too.
2096 1 5.0 5.0 0.0 if isinstance(self.P2, str) and self.P2 == "identity":
2097 1 7.0 7.0 0.0 if sparse.issparse(X):
2098 1 281.0 281.0 0.0 P2 = (
2099 2 173.0 86.5 0.0 sparse.dia_matrix(
2100 1 10.0 10.0 0.0 (np.ones(n_features), 0), shape=(n_features, n_features)
2101 )
2102 ).tocsc()
2103 else:
2104 P2 = np.ones(n_features)
2105 else:
2106 P2 = check_array(
2107 self.P2, copy=True, accept_sparse=_stype, dtype=_dtype, ensure_2d=False
2108 )
2109 if P2.ndim == 1:
2110 P2 = np.asarray(P2)
2111 if P2.shape[0] != n_features:
2112 raise ValueError(
2113 "P2 should be a 1d array of shape "
2114 "(n_features,) with "
2115 "n_features=X.shape[1]; "
2116 "got (P2.shape=({},)), needed ({},)".format(
2117 P2.shape[0], X.shape[1]
2118 )
2119 )
2120 if sparse.issparse(X):
2121 P2 = (
2122 sparse.dia_matrix((P2, 0), shape=(n_features, n_features))
2123 ).tocsc()
2124 elif (
2125 P2.ndim == 2
2126 and P2.shape[0] == P2.shape[1]
2127 and P2.shape[0] == X.shape[1]
2128 ):
2129 if sparse.issparse(X):
2130 P2 = sparse.csc_matrix(P2)
2131 else:
2132 raise ValueError(
2133 "P2 must be either None or an array of shape "
2134 "(n_features, n_features) with "
2135 "n_features=X.shape[1]; "
2136 "got (P2.shape=({0}, {1})), needed ({2}, {2})".format(
2137 P2.shape[0], P2.shape[1], X.shape[1]
2138 )
2139 )
2140
2141 1 6.0 6.0 0.0 start_params = self.start_params
2142 1 6.0 6.0 0.0 if isinstance(start_params, str):
2143 1 5.0 5.0 0.0 if start_params not in ["guess", "zero"]:
2144 raise ValueError(
2145 "The argument start_params must be 'guess', "
2146 "'zero' or an array of correct length; "
2147 "got(start_params={})".format(start_params)
2148 )
2149 else:
2150 start_params = check_array(
2151 start_params,
2152 accept_sparse=False,
2153 force_all_finite=True,
2154 ensure_2d=False,
2155 dtype=_dtype,
2156 copy=True,
2157 )
2158 if (start_params.shape[0] != X.shape[1] + self.fit_intercept) or (
2159 start_params.ndim != 1
2160 ):
2161 raise ValueError(
2162 "Start values for parameters must have the"
2163 "right length and dimension; required (length"
2164 "={}, ndim=1); got (length={}, ndim={}).".format(
2165 X.shape[1] + self.fit_intercept,
2166 start_params.shape[0],
2167 start_params.ndim,
2168 )
2169 )
2170
2171 1 5.0 5.0 0.0 l1 = self.alpha * self.l1_ratio
2172 1 6.0 6.0 0.0 l2 = self.alpha * (1 - self.l1_ratio)
2173 # P1 and P2 are now for sure copies
2174 1 8.0 8.0 0.0 P1 = l1 * P1
2175 1 146.0 146.0 0.0 P2 = l2 * P2
2176 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2')
2177 # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric
2178 1 5.0 5.0 0.0 if P2.ndim == 2:
2179 1 6.0 6.0 0.0 if sparse.issparse(P2):
2180 1 6.0 6.0 0.0 if sparse.isspmatrix_csc(P2):
2181 1 811.0 811.0 0.0 P2 = 0.5 * (P2 + P2.transpose()).tocsc()
2182 else:
2183 P2 = 0.5 * (P2 + P2.transpose()).tocsr()
2184 else:
2185 P2 = 0.5 * (P2 + P2.T)
2186
2187 # For coordinate descent, if X is sparse, P2 must also be csc
2188 1 7.0 7.0 0.0 if solver == "cd" and sparse.issparse(X):
2189 1 80.0 80.0 0.0 P2 = sparse.csc_matrix(P2)
2190
2191 # 1.4 additional validations ##########################################
2192 1 5.0 5.0 0.0 if self.check_input:
2193 1 1319.0 1319.0 0.0 if not np.all(family.in_y_range(y)):
2194 raise ValueError(
2195 "Some value(s) of y are out of the valid "
2196 "range for family {}".format(family.__class__.__name__)
2197 )
2198 # check if P1 has only non-negative values, negative values might
2199 # indicate group lasso in the future.
2200 1 6.0 6.0 0.0 if not isinstance(self.P1, str): # if self.P1 != 'identity':
2201 if not np.all(P1 >= 0):
2202 raise ValueError("P1 must not have negative values.")
2203 # check if P2 is positive semidefinite
2204 # np.linalg.cholesky(P2) 'only' asserts positive definite
2205 1 5.0 5.0 0.0 if not isinstance(self.P2, str): # self.P2 != 'identity'
2206 # due to numerical precision, we allow eigenvalues to be a
2207 # tiny bit negative
2208 epsneg = -10 * np.finfo(P2.dtype).epsneg
2209 if P2.ndim == 1 or P2.shape[0] == 1:
2210 p2 = P2
2211 if sparse.issparse(P2):
2212 p2 = P2.toarray()
2213 if not np.all(p2 >= 0):
2214 raise ValueError(
2215 "1d array P2 must not have negative " "values."
2216 )
2217 elif sparse.issparse(P2):
2218 # for sparse matrices, not all eigenvals can be computed
2219 # efficiently, use only half of n_features
2220 # k = how many eigenvals to compute
2221 k = np.min([10, n_features // 10 + 1])
2222 sigma = -1000 * epsneg # start searching near this value
2223 which = "SA" # find smallest algebraic eigenvalues first
2224 eigenvalues = splinalg.eigsh(
2225 P2, k=k, sigma=sigma, which=which, return_eigenvectors=False
2226 )
2227 if not np.all(eigenvalues >= epsneg):
2228 raise ValueError("P2 must be positive semi-definite.")
2229 else:
2230 if not np.all(linalg.eigvalsh(P2) >= epsneg):
2231 raise ValueError("P2 must be positive semi-definite.")
2232 # TODO: if alpha=0 check that X is not rank deficient
2233 # TODO: what else to check?
2234
2235 #######################################################################
2236 # 2a. rescaling of weights (sample_weight) #
2237 #######################################################################
2238 # IMPORTANT NOTE: Since we want to minimize
2239 # 1/(2*sum(sample_weight)) * deviance + L1 + L2,
2240 # deviance = sum(sample_weight * unit_deviance),
2241 # we rescale weights such that sum(weights) = 1 and this becomes
2242 # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance)
2243 1 428.0 428.0 0.0 weights_sum = np.sum(weights)
2244 1 1972.0 1972.0 0.0 weights = weights / weights_sum
2245
2246 #######################################################################
2247 # 2b. potentially rescale predictors
2248 #######################################################################
2249 1 7.0 7.0 0.0 if self.standardize:
2250 1 479789.0 479789.0 1.1 X, P1, P2, col_means, col_stds = _standardize(X, P1, P2, weights)
2251
2252 #######################################################################
2253 # 3. initialization of coef = (intercept_, coef_) #
2254 #######################################################################
2255 # Note: Since phi=self.dispersion_ does not enter the estimation
2256 # of mu_i=E[y_i], set it to 1.
2257
2258 # set start values for coef
2259 1 26.0 26.0 0.0 if self.warm_start and hasattr(self, "coef_"):
2260 coef = self.coef_
2261 intercept = self.intercept_
2262 if self.fit_intercept:
2263 coef = np.concatenate((np.array([intercept]), coef))
2264 if self.standardize:
2265 _standardize_warm_start(coef, col_means, col_stds)
2266 1 7.0 7.0 0.0 elif isinstance(start_params, str):
2267 1 5.0 5.0 0.0 if start_params == "guess":
2268 # Set mu=starting_mu of the family and do one Newton step
2269 # If solver=cd use cd, else irls
2270 1 4731.0 4731.0 0.0 mu = family.starting_mu(y, weights=weights)
2271 1 12563.0 12563.0 0.0 eta = link.link(mu) # linear predictor
2272 1 8.0 8.0 0.0 if solver in ["cd", "lbfgs", "newton-cg"]:
2273 # see function _cd_solver
2274 1 10930.0 10930.0 0.0 sigma_inv = 1 / family.variance(mu, phi=1, weights=weights)
2275 1 10093.0 10093.0 0.0 d1 = link.inverse_derivative(eta)
2276 1 5497.0 5497.0 0.0 temp = sigma_inv * d1 * (y - mu)
2277 1 8.0 8.0 0.0 if self.fit_intercept:
2278 1 42518.0 42518.0 0.1 score = np.concatenate(([temp.sum()], temp @ X))
2279 else:
2280 score = temp @ X # same as X.T @ temp
2281
2282 1 1980.0 1980.0 0.0 d2_sigma_inv = d1 * d1 * sigma_inv
2283 1 7.0 7.0 0.0 diag_fisher = self.diag_fisher
2284 1 5.0 5.0 0.0 if diag_fisher:
2285 fisher = d2_sigma_inv
2286 else:
2287 2 5307565.0 2653782.5 12.0 fisher = _safe_sandwich_dot(
2288 1 6.0 6.0 0.0 X, d2_sigma_inv, intercept=self.fit_intercept
2289 )
2290 # set up space for search direction d for inner loop
2291 1 8.0 8.0 0.0 if self.fit_intercept:
2292 1 12.0 12.0 0.0 coef = np.zeros(n_features + 1)
2293 else:
2294 coef = np.zeros(n_features)
2295 1 43.0 43.0 0.0 d = np.zeros_like(coef)
2296 # initial stopping tolerance of inner loop
2297 # use L1-norm of minimum of norm of subgradient of F
2298 # use less restrictive tolerance for initial guess
2299 1 340.0 340.0 0.0 inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1)
2300 1 84.0 84.0 0.0 inner_tol = 4 * linalg.norm(inner_tol, ord=1)
2301 # just one outer loop = Newton step
2302 1 6.0 6.0 0.0 n_cycles = 0
2303 2 4408.0 2204.0 0.0 d, coef_P2, n_cycles, inner_tol = _cd_cycle(
2304 1 5.0 5.0 0.0 d,
2305 1 6.0 6.0 0.0 X,
2306 1 5.0 5.0 0.0 coef,
2307 1 5.0 5.0 0.0 score,
2308 1 5.0 5.0 0.0 fisher,
2309 1 6.0 6.0 0.0 P1,
2310 1 6.0 6.0 0.0 P2,
2311 1 6.0 6.0 0.0 n_cycles,
2312 1 5.0 5.0 0.0 inner_tol,
2313 1 5.0 5.0 0.0 max_inner_iter=1000,
2314 1 6.0 6.0 0.0 selection=self.selection,
2315 1 6.0 6.0 0.0 random_state=random_state,
2316 1 6.0 6.0 0.0 diag_fisher=self.diag_fisher,
2317 )
2318 1 10.0 10.0 0.0 coef += d # for simplicity no line search here
2319 else:
2320 # See _irls_solver
2321 # h'(eta)
2322 hp = link.inverse_derivative(eta)
2323 # working weights W, in principle a diagonal matrix
2324 # therefore here just as 1d array
2325 W = hp ** 2 / family.variance(mu, phi=1, weights=weights)
2326 # working observations
2327 z = eta + (y - mu) / hp
2328 # solve A*coef = b
2329 # A = X' W X + l2 P2, b = X' W z
2330 coef = _irls_step(X, W, P2, z, fit_intercept=self.fit_intercept)
2331 else: # start_params == 'zero'
2332 if self.fit_intercept:
2333 coef = np.zeros(n_features + 1)
2334 coef[0] = link.link(np.average(y, weights=weights))
2335 else:
2336 coef = np.zeros(n_features)
2337 else: # assign given array as start values
2338 coef = start_params
2339 if self.standardize:
2340 _standardize_warm_start(coef, col_means, col_stds)
2341
2342 #######################################################################
2343 # 4. fit #
2344 #######################################################################
2345 # algorithms for optimization
2346 # TODO: Parallelize it?
2347
2348 # 4.1 IRLS ############################################################
2349 # Note: we already set P2 = l2*P2, see above
2350 # Note: we already symmetrized P2 = 1/2 (P2 + P2')
2351 1 6.0 6.0 0.0 if solver == "irls":
2352 coef, self.n_iter_ = _irls_solver(
2353 coef=coef,
2354 X=X,
2355 y=y,
2356 weights=weights,
2357 P2=P2,
2358 fit_intercept=self.fit_intercept,
2359 family=family,
2360 link=link,
2361 max_iter=self.max_iter,
2362 tol=self.tol,
2363 )
2364
2365 # 4.2 L-BFGS ##########################################################
2366 1 5.0 5.0 0.0 elif solver == "lbfgs":
2367
2368 def func(coef, X, y, weights, P2, family, link):
2369 mu, devp = family._mu_deviance_derivative(coef, X, y, weights, link)
2370 dev = family.deviance(y, mu, weights)
2371 intercept = coef.size == X.shape[1] + 1
2372 idx = 1 if intercept else 0 # offset if coef[0] is intercept
2373 if P2.ndim == 1:
2374 L2 = P2 * coef[idx:]
2375 else:
2376 L2 = P2 @ coef[idx:]
2377 obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2)
2378 objp = 0.5 * devp
2379 objp[idx:] += L2
2380 return obj, objp
2381
2382 args = (X, y, weights, P2, family, link)
2383 coef, loss, info = fmin_l_bfgs_b(
2384 func,
2385 coef,
2386 fprime=None,
2387 args=args,
2388 iprint=(self.verbose > 0) - 1,
2389 pgtol=self.tol,
2390 maxiter=self.max_iter,
2391 factr=1e3,
2392 )
2393 if info["warnflag"] == 1:
2394 warnings.warn(
2395 "lbfgs failed to converge." " Increase the number of iterations.",
2396 ConvergenceWarning,
2397 )
2398 elif info["warnflag"] == 2:
2399 warnings.warn("lbfgs failed for the reason: {}".format(info["task"]))
2400 self.n_iter_ = info["nit"]
2401
2402 # 4.3 Newton-CG #######################################################
2403 # We use again the fisher matrix instead of the hessian. More
2404 # precisely, expected hessian of deviance.
2405 1 6.0 6.0 0.0 elif solver == "newton-cg":
2406
2407 def func(coef, X, y, weights, P2, family, link):
2408 intercept = coef.size == X.shape[1] + 1
2409 idx = 1 if intercept else 0 # offset if coef[0] is intercept
2410 if P2.ndim == 1:
2411 L2 = coef[idx:] @ (P2 * coef[idx:])
2412 else:
2413 L2 = coef[idx:] @ (P2 @ coef[idx:])
2414 mu = link.inverse(_safe_lin_pred(X, coef))
2415 return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2
2416
2417 def grad(coef, X, y, weights, P2, family, link):
2418 mu, devp = family._mu_deviance_derivative(coef, X, y, weights, link)
2419 intercept = coef.size == X.shape[1] + 1
2420 idx = 1 if intercept else 0 # offset if coef[0] is intercept
2421 if P2.ndim == 1:
2422 L2 = P2 * coef[idx:]
2423 else:
2424 L2 = P2 @ coef[idx:]
2425 objp = 0.5 * devp
2426 objp[idx:] += L2
2427 return objp
2428
2429 def grad_hess(coef, X, y, weights, P2, family, link):
2430 intercept = coef.size == X.shape[1] + 1
2431 idx = 1 if intercept else 0 # offset if coef[0] is intercept
2432 if P2.ndim == 1:
2433 L2 = P2 * coef[idx:]
2434 else:
2435 L2 = P2 @ coef[idx:]
2436 eta = _safe_lin_pred(X, coef)
2437 mu = link.inverse(eta)
2438 d1 = link.inverse_derivative(eta)
2439 temp = d1 * family.deviance_derivative(y, mu, weights)
2440 if intercept:
2441 grad = np.concatenate(([0.5 * temp.sum()], 0.5 * temp @ X + L2))
2442 else:
2443 grad = 0.5 * temp @ X + L2 # same as 0.5* X.T @ temp + L2
2444
2445 # expected hessian = fisher = X.T @ diag_matrix @ X
2446 # calculate only diag_matrix
2447 diag = d1 ** 2 / family.variance(mu, phi=1, weights=weights)
2448 if intercept:
2449 h0i = np.concatenate(([diag.sum()], diag @ X))
2450
2451 def Hs(coef):
2452 # return (0.5 * fisher + P2) @ coef
2453 # ret = 0.5 * (X.T @ (diag * (X @ coef)))
2454 ret = 0.5 * ((diag * (X @ coef[idx:])) @ X)
2455 if P2.ndim == 1:
2456 ret += P2 * coef[idx:]
2457 else:
2458 ret += P2 @ coef[idx:]
2459 if intercept:
2460 ret = np.concatenate(
2461 ([0.5 * (h0i @ coef)], ret + 0.5 * coef[0] * h0i[1:])
2462 )
2463 return ret
2464
2465 return grad, Hs
2466
2467 args = (X, y, weights, P2, family, link)
2468 coef, self.n_iter_ = newton_cg(
2469 grad_hess,
2470 func,
2471 grad,
2472 coef,
2473 args=args,
2474 maxiter=self.max_iter,
2475 tol=self.tol,
2476 )
2477
2478 # 4.4 coordinate descent ##############################################
2479 # Note: we already set P1 = l1*P1, see above
2480 # Note: we already set P2 = l2*P2, see above
2481 # Note: we already symmetrized P2 = 1/2 (P2 + P2')
2482 1 6.0 6.0 0.0 elif solver == "cd":
2483 2 38164237.0 19082118.5 86.0 coef, self.n_iter_, self._n_cycles = _cd_solver(
2484 1 6.0 6.0 0.0 coef=coef,
2485 1 5.0 5.0 0.0 X=X,
2486 1 5.0 5.0 0.0 y=y,
2487 1 5.0 5.0 0.0 weights=weights,
2488 1 6.0 6.0 0.0 P1=P1,
2489 1 6.0 6.0 0.0 P2=P2,
2490 1 7.0 7.0 0.0 fit_intercept=self.fit_intercept,
2491 1 6.0 6.0 0.0 family=family,
2492 1 5.0 5.0 0.0 link=link,
2493 1 5.0 5.0 0.0 max_iter=self.max_iter,
2494 1 6.0 6.0 0.0 tol=self.tol,
2495 1 6.0 6.0 0.0 selection=self.selection,
2496 1 6.0 6.0 0.0 random_state=random_state,
2497 1 6.0 6.0 0.0 diag_fisher=self.diag_fisher,
2498 1 5.0 5.0 0.0 copy_X=self.copy_X,
2499 )
2500
2501 #######################################################################
2502 # 5a. handle intercept
2503 #######################################################################
2504 1 8.0 8.0 0.0 if self.fit_intercept:
2505 1 8.0 8.0 0.0 self.intercept_ = coef[0]
2506 1 8.0 8.0 0.0 self.coef_ = coef[1:]
2507 else:
2508 # set intercept to zero as the other linear models do
2509 self.intercept_ = 0.0
2510 self.coef_ = coef
2511
2512 #######################################################################
2513 # 5a. undo standardization
2514 #######################################################################
2515 1 7.0 7.0 0.0 if self.standardize:
2516 # TODO: Make sure this doesn't copy X
2517 2 208084.0 104042.0 0.5 X, self.intercept_, self.coef_ = _unstandardize(
2518 1 6.0 6.0 0.0 X, col_means, col_stds, self.intercept_, self.coef_
2519 )
2520
2521 1 10.0 10.0 0.0 if self.fit_dispersion in ["chisqr", "deviance"]:
2522 # attention because of rescaling of weights
2523 self.dispersion_ = self.estimate_phi(X, y, weights) * weights_sum
2524
2525 1 5.0 5.0 0.0 return self
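
Earlier in fit() (sections 1.3 and 2 above), alpha and l1_ratio are folded into the penalty structures: P1 is scaled by l1 = alpha * l1_ratio, P2 by l2 = alpha * (1 - l1_ratio), and P2 is symmetrized because only 1/2 * (P2 + P2') enters w' P2 w. A minimal dense sketch of that bookkeeping (illustrative only; the profiled code keeps P2 sparse when X is sparse):

import numpy as np

alpha, l1_ratio, n_features = 1.0, 0.5, 4
P1 = np.ones(n_features)              # "identity" L1 penalty weights
P2 = np.eye(n_features)               # "identity" L2 penalty matrix

P1 = (alpha * l1_ratio) * P1          # l1 * P1
P2 = (alpha * (1 - l1_ratio)) * P2    # l2 * P2
P2 = 0.5 * (P2 + P2.T)                # symmetrize (a no-op for the identity)

w = np.array([0.5, -1.0, 0.0, 2.0])
penalty = np.sum(P1 * np.abs(w)) + 0.5 * w @ (P2 @ w)
print(penalty)                        # ||P1 * w||_1 + 1/2 * w' P2 w
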