Compare GLM implementations
# Setup
#
# You need to clone
# - https://github.axa.com/martin-rohbeck-external/k-claims-modelling
# - https://github.axa.com/jan-tilly/glmnet_python
# - https://github.axa.com/jan-tilly/glm_benchmarks
# and then run (adjust paths to repos accordingly)
# conda run -n ${PROJECT_FLAVOR} mamba install -y fortran-compiler
# conda run -n ${PROJECT_FLAVOR} pip install --no-use-pep517 --disable-pip-version-check -e ~/code/k-claims-modelling
# conda run -n ${PROJECT_FLAVOR} --cwd ~/code/glmnet_python python setup.py install
# conda run -n ${PROJECT_FLAVOR} pip install --no-use-pep517 --disable-pip-version-check -e ~/code/glm_benchmarks
import os
from time import perf_counter

import glm_benchmarks.sklearn_fork as sklearn_qc
import numpy as np
import pandas as pd
import scipy.sparse
from glmnet_python import glmnet, glmnetPredict
from npp.config import get_logger

import k_claims_modelling.external.sklearn.linear_model.glm as sklearn
from k_claims_modelling.evaluation.gini import compute_gini
from k_claims_modelling.modelling.regressors import exposure_correction

logger = get_logger()
data_dir = "/home/shared_folders/pricing/claims_modelling/test_data_for_profiling_glms"
# %%
# load data
df = pd.read_parquet(os.path.join(data_dir, "outcomes.parquet"))
X = pd.read_parquet(os.path.join(data_dir, "X.parquet"))
# subsample
idx = df.sample(n=1_000_000).index
df = df.loc[idx].reset_index(drop=True)
X = X.loc[idx].reset_index(drop=True)
print(f"Dim of data: {X.shape}")
print(f"Sparseness: {(np.abs(X.values) < 1e-10).mean():.3f}")
print(f"Size of data frame: {X.memory_usage().sum() / float(1<<30):.2f} GB")
# %%
y = df["sanzkh02"].to_numpy()
exposure = df["je"].to_numpy()
train_set = (df["sample"] == "train").to_numpy()
test_set = (df["sample"] == "test").to_numpy()
future_set = (df["sample"] == "future").to_numpy()
offset = df["offset_kh_sach_frequenz"].to_numpy()
y_adj, sample_weight = exposure_correction(
    power=1, y=y, exposure=exposure, offset=offset
)
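# exposure_correction comes from the internal k-claims-modelling package. It
# presumably folds exposure and offset into an adjusted outcome plus sample
# weights, so that a weighted Poisson fit on y_adj matches a fit of y with
# log(exposure) + offset as an offset term. A hypothetical sketch of that
# identity for power=1 (illustration only, not the actual implementation):
#     sample_weight = exposure * np.exp(offset)
#     y_adj = y / sample_weight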
# %%
model_args = dict(
    family="poisson",
    max_iter=100,
    random_state=12345,
    copy_X=True,
    selection="random",
    tol=1e-5,
)
# %%
for alpha, l1_ratio, sparse in (
    (0.01, 0.00, False),
    (1.00, 0.50, False),
    (0.01, 0.00, True),
    (1.00, 0.50, True),
):
    print("")
    print("*********************************")
    print(f"{alpha=}, {l1_ratio=}, {sparse=}")
    print("")
    if sparse:
        print("Creating sparse matrix...")
        Xtrain = scipy.sparse.csc_matrix(X[train_set])
    else:
        Xtrain = X[train_set].to_numpy()
    # new implementation
    print("")
    print("Using new implementation...")
    tic = perf_counter()
    poisson_regressor_qc = sklearn_qc.GeneralizedLinearRegressor(
        standardize=True, alpha=alpha, l1_ratio=l1_ratio, **model_args
    ).fit(X=Xtrain, y=y_adj[train_set], sample_weight=sample_weight[train_set])
    toc = perf_counter()
    print(f"{toc-tic:.4f} seconds")
    y_pred_qc = poisson_regressor_qc.predict(X[test_set]) * sample_weight[test_set]
    gini = compute_gini(
        y_true=y[test_set], y_score=y_pred_qc, exposure=exposure[test_set]
    )
    print(f"{gini=}")
    print(f"mean outcome: {y[test_set].sum() / exposure[test_set].sum()}")
    print(f"mean prediction: {y_pred_qc.sum() / exposure[test_set].sum()}")
    # glmnet
    print("")
    print("Using glmnet...")
    tic = perf_counter()
    poisson_regressor_glmnet = glmnet(
        x=Xtrain,
        y=y_adj[train_set].copy(),
        weights=sample_weight[train_set],
        standardize=True,
        lambdau=np.array([alpha]),
        alpha=l1_ratio,
        family=model_args["family"],
        thresh=model_args["tol"],
    )
    toc = perf_counter()
    print(f"{toc-tic:.4f} seconds")
    y_pred_glmnet = (
        np.exp(glmnetPredict(poisson_regressor_glmnet, X[test_set]).flatten())
        * sample_weight[test_set]
    )
    gini = compute_gini(
        y_true=y[test_set], y_score=y_pred_glmnet, exposure=exposure[test_set]
    )
    print(f"{gini=}")
    print(f"mean outcome: {y[test_set].sum() / exposure[test_set].sum()}")
    print(f"mean prediction: {y_pred_glmnet.sum() / exposure[test_set].sum()}")
    # compare predictions
    print(f"all close? {np.allclose(y_pred_qc, y_pred_glmnet, atol=1e-04)}")
    # old implementation
    print("")
    print("Using old implementation... (no standardization)")
    tic = perf_counter()
    poisson_regressor = sklearn.GeneralizedLinearRegressor(
        alpha=alpha, l1_ratio=l1_ratio, **model_args
    ).fit(X=Xtrain, y=y_adj[train_set], sample_weight=sample_weight[train_set])
    toc = perf_counter()
    print(f"{toc-tic:.4f} seconds")
    y_pred = poisson_regressor.predict(X[test_set]) * sample_weight[test_set]
    gini = compute_gini(y_true=y[test_set], y_score=y_pred, exposure=exposure[test_set])
    print(f"{gini=}")
    print(f"mean outcome: {y[test_set].sum() / exposure[test_set].sum()}")
    print(f"mean prediction: {y_pred.sum() / exposure[test_set].sum()}")
    # compare predictions
    print(f"all close? {np.allclose(y_pred_qc, y_pred, atol=1e-04)}")
# %%
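# Hypothetical, self-contained variant of the comparison on synthetic Poisson
# data, for readers without access to the internal repos and data above. It
# uses the public scikit-learn PoissonRegressor (pure L2, i.e. the
# l1_ratio=0.0 case) against glmnet_python. The penalty scalings are assumed
# to line up with glmnet's lambda when the mixing weight is 0; treat residual
# disagreement as a parameterization or tolerance difference, not a bug.
from sklearn.linear_model import PoissonRegressor

rng = np.random.default_rng(12345)
n, p = 10_000, 10
X_syn = rng.standard_normal((n, p))
y_syn = rng.poisson(np.exp(X_syn @ rng.normal(scale=0.1, size=p))).astype(float)
pen = 0.01
sk = PoissonRegressor(alpha=pen, max_iter=1000, tol=1e-8).fit(X_syn, y_syn)
gn = glmnet(
    x=X_syn.copy(),
    y=y_syn.copy(),
    family="poisson",
    standardize=False,
    lambdau=np.array([pen]),
    alpha=0.0,
    thresh=1e-10,
)
pred_sk = sk.predict(X_syn)
pred_gn = np.exp(glmnetPredict(gn, X_syn).flatten())
print(f"max abs difference: {np.abs(pred_sk - pred_gn).max():.2e}")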
Dim of data: (10000000, 107)
Sparseness: 0.683
Size of data frame: 7.97 GB
*********************************
alpha=0.01, l1_ratio=0.0, sparse=False
Using new implementation...
41.0648 seconds
gini=0.26368907871297714
mean outcome: 0.04747663840483652
mean prediction: 0.047279319092816983
Using glmnet...
91.3531 seconds
gini=0.27551655391490293
mean outcome: 0.04747663840483652
mean prediction: 0.04728485394366472
all close? False
Using old implementation... (no standardization)
36.6776 seconds
gini=0.25560893366357507
mean outcome: 0.04747663840483652
mean prediction: 0.047348574848387524
all close? False
*********************************
alpha=1.0, l1_ratio=0.5, sparse=False
Using new implementation...
0.09614459466725817 0 0
0.034378559002931665 1 1
0.01170238383401215 2 2
0.0034775112012216183 3 3
0.000708324797992112 4 4
5.2231794878223296e-05 5 5
56.4671 seconds
gini=0.11972711376286291
mean outcome: 0.04747663840483652
mean prediction: 0.04729607839402956
Using glmnet...
7.4578 seconds
gini=0.12023107566353006
mean outcome: 0.04747663840483652
mean prediction: 0.04729181545853376
all close? True
Using old implementation... (no standardization)
0.9614459466725815 0 0
0.34378559002931663 1 1
0.11702383834012149 2 2
0.03477511201221618 3 3
0.00708324797992112 4 4
0.0005223179487822329 5 5
52.6287 seconds
gini=0.11972711376286291
mean outcome: 0.04747663840483652
mean prediction: 0.04729607839402956
all close? True
*********************************
alpha=0.01, l1_ratio=0.0, sparse=True
Creating sparse matrix...
Using new implementation...
338.1123 seconds
gini=0.263689078359197
mean outcome: 0.04747663840483652
mean prediction: 0.047279319092816886
Using glmnet...
57.4690 seconds
gini=0.2755165538902576
mean outcome: 0.04747663840483652
mean prediction: 0.0472848539436662
all close? False
Using old implementation... (no standardization)
291.9076 seconds
gini=0.2636890785530665
mean outcome: 0.04747663840483652
mean prediction: 0.04727931909281683
all close? True
*********************************
alpha=1.0, l1_ratio=0.5, sparse=True
Creating sparse matrix...
Using new implementation...
0.09614459466725817 0 0
0.034378559002931665 1 1
0.01170238383401215 2 2
0.0034775112012216183 3 3
0.000708324797992112 4 4
5.2231794878223296e-05 5 5
456.6564 seconds
gini=0.11972711376286291
mean outcome: 0.04747663840483652
mean prediction: 0.04729607839402956
Using glmnet...
3.9307 seconds
gini=0.12011246482376858
mean outcome: 0.04747663840483652
mean prediction: 0.04729181545858199
all close? True
Using old implementation... (no standardization)
1.344542742914321 0 0
0.6710968536517165 1 1
0.11795095198240992 2 2
0.03510388281768133 3 3
0.007180409077755724 4 4
0.0005352931468411203 5 5
389.8593 seconds
gini=0.12033554412788987
mean outcome: 0.04747663840483652
mean prediction: 0.04729629084703527
all close? True
Dim of data: (1000000, 107)
Sparseness: 0.683
Size of data frame: 0.80 GB
*********************************
alpha=0.01, l1_ratio=0.0, sparse=False
Using new implementation...
4.1740 seconds
gini=0.263818790086771
mean outcome: 0.04676540361757755
mean prediction: 0.04670313684179153
Using glmnet...
7.9436 seconds
gini=0.2724562578231543
mean outcome: 0.04676540361757755
mean prediction: 0.04674076928845567
all close? False
Using old implementation... (no standardization)
3.6724 seconds
gini=0.2532641359545414
mean outcome: 0.04676540361757755
mean prediction: 0.04619259199982138
all close? False
*********************************
alpha=1.0, l1_ratio=0.5, sparse=False
Using new implementation...
0.09619816120501168 0 0
0.03441165041414234 1 1
0.011726676309588263 2 2
0.003495483932480208 3 3
0.000718099939306369 4 4
5.41389992328904e-05 5 5
5.6504 seconds
gini=0.12005884445047671
mean outcome: 0.04676540361757755
mean prediction: 0.04666568650088688
Using glmnet...
0.6818 seconds
gini=0.12027297891459444
mean outcome: 0.04676540361757755
mean prediction: 0.046661043630974286
all close? True
Using old implementation... (no standardization)
0.9619816120501168 0 0
0.3441165041414234 1 1
0.11726676309588263 2 2
0.034954839324802076 3 3
0.0071809993930636905 4 4
0.000541389992328904 5 5
5.2115 seconds
gini=0.12005884445047671
mean outcome: 0.04676540361757755
mean prediction: 0.04666568650088688
all close? True
*********************************
alpha=0.01, l1_ratio=0.0, sparse=True
Creating sparse matrix...
Using new implementation...
32.5009 seconds
gini=0.2638187936317561
mean outcome: 0.04676540361757755
mean prediction: 0.046703136841791504
Using glmnet...
4.2908 seconds
gini=0.2724562578231521
mean outcome: 0.04676540361757755
mean prediction: 0.046740769288456385
all close? False
Using old implementation... (no standardization)
28.1692 seconds
gini=0.2638187961688558
mean outcome: 0.04676540361757755
mean prediction: 0.04670313684179154
all close? True
*********************************
alpha=1.0, l1_ratio=0.5, sparse=True
Creating sparse matrix...
Using new implementation...
0.09619816120501168 0 0
0.03441165041414234 1 1
0.011726676309588263 2 2
0.003495483932480208 3 3
0.000718099939306369 4 4
5.41389992328904e-05 5 5
42.4812 seconds
gini=0.12005884445047671
mean outcome: 0.04676540361757755
mean prediction: 0.04666568650088688
Using glmnet...
0.3295 seconds
gini=0.11954649640739994
mean outcome: 0.04676540361757755
mean prediction: 0.04666104363097935
all close? True
Using old implementation... (no standardization)
1.3457867135381405 0 0
0.6713103191660521 1 1
0.11819264109487121 2 2
0.03528348997508118 3 3
0.0072786327343033605 4 4
0.0005546862828521083 5 5
36.9609 seconds
gini=0.11950432517998748
mean outcome: 0.04676540361757755
mean prediction: 0.04666591512425464
all close? True
Dim of data: (1000000, 107)
Sparseness: 0.683
Size of data frame: 0.80 GB
*********************************
alpha=1.0, l1_ratio=0.5, sparse=False
Using new implementation...
0.09615708450804317 0 0
0.034386273780650345 1 1
0.011708045105922351 2 2
0.003481694975107442 3 3
0.000710592744205067 4 4
5.2670328567151794e-05 5 5
8.5725 seconds
gini=0.11747903153166883
mean outcome: 0.04873558735427359
mean prediction: 0.04714181671456767
Wrote profile results to glm_qc.py.lprof
Timer unit: 1e-06 s
Total time: 6.57312 s
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py
Function: _safe_sandwich_dot at line 105
Line # Hits Time Per Hit % Time Line Contents
==============================================================
105 @profile
106 def _safe_sandwich_dot(X, d, intercept=False):
107 """Compute sandwich product X.T @ diag(d) @ X.
108
109 With ``intercept=True``, X is treated as if a column of ones were
110 appended as the first column of X.
111 X can be sparse, d must be an ndarray. Always returns an ndarray."""
112 8 74.0 9.2 0.0 if sparse.issparse(X):
113 # TODO: There's a bit of room to accelerate this by avoiding the
114 # allocation of a new sparse matrix every time we pass through this
115 # step. The X.multiply creates a new matrix. Avoiding the allocation
116 # and updating the rows in place accelerates this line by ~20%.
117 temp = X.transpose() @ X.multiply(d[:, np.newaxis])
118 # for older versions of numpy and scipy, temp may be a np.matrix
119 temp = _safe_toarray(temp)
120 8 22.0 2.8 0.0 elif type(X) is ColScaledSpMat:
121 term1 = X.mat.transpose() @ X.mat.multiply(d[:, np.newaxis])
122 term2 = X.mat.transpose().dot(d)[:, np.newaxis] * X.shift
123 term3 = term2.T
124 term4 = (X.shift.T * X.shift) * d.sum()
125 temp = term1 + term2 + term3 + term4
126 temp = _safe_toarray(temp)
127 else:
128 8 6477524.0 809690.5 98.5 temp = (X.T * d) @ X
129 8 40.0 5.0 0.0 if intercept:
130 8 38.0 4.8 0.0 dim = X.shape[1] + 1
131 8 31.0 3.9 0.0 if type(X) is ColScaledSpMat:
132 order = "F"
133 8 79.0 9.9 0.0 elif sparse.issparse(X):
134 order = "F" if sparse.isspmatrix_csc(X) else "C"
135 else:
136 8 36.0 4.5 0.0 order = "F" if X.flags["F_CONTIGUOUS"] else "C"
137 8 159.0 19.9 0.0 res = np.empty((dim, dim), dtype=max(X.dtype, d.dtype), order=order)
138 8 4656.0 582.0 0.1 res[0, 0] = d.sum()
139 8 90118.0 11264.8 1.4 res[1:, 0] = d @ X
140 8 56.0 7.0 0.0 res[0, 1:] = res[1:, 0]
141 8 272.0 34.0 0.0 res[1:, 1:] = temp
142 else:
143 res = temp
144 8 12.0 1.5 0.0 return res
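The dense sandwich product on source line 128 dominates this function (98.5%
of its time). A quick numpy sanity check of the identity that line exploits,
for illustration only:

import numpy as np
X = np.arange(12.0).reshape(4, 3)
d = np.array([1.0, 0.5, 2.0, 0.1])
assert np.allclose((X.T * d) @ X, X.T @ np.diag(d) @ X)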
Total time: 6.3131 s
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py
Function: _eta_mu_score_fisher at line 730
Line # Hits Time Per Hit % Time Line Contents
==============================================================
730 @profile
731 def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, diag_fisher=False):
732 """Compute linear predictor, mean, score function and Fisher matrix.
733
734 It calculates the linear predictor, the mean, the score function
735 (derivative of the log-likelihood) and the Fisher information matrix
736 all in one go as a function of `coef` (:math:`w`) and the data.
737
738 Parameters
739 ----------
740 diag_fisher : boolean, optional (default=False)
741 If ``True``, returns only an array d such that
742 fisher = X.T @ np.diag(d) @ X.
743
744 Returns
745 -------
746 (eta, mu, score, fisher) : tuple with 4 elements
747 The 4 elements are:
748
749 * eta: ndarray, shape (X.shape[0],)
750 * mu: ndarray, shape (X.shape[0],)
751 * score: ndarray, shape (X.shape[0],)
752 * fisher:
753
754 * If diag_fisher is ``False``, the full Fisher matrix,
755 an array of shape (X.shape[1], X.shape[1])
756 * If diag_fisher is ``True``, an array of shape (X.shape[0])
757 """
758 7 22.0 3.1 0.0 intercept = coef.size == X.shape[1] + 1
759 # eta = linear predictor
760 7 87047.0 12435.3 1.4 eta = _safe_lin_pred(X, coef)
761 7 109902.0 15700.3 1.7 mu = link.inverse(eta)
762
763 # FOR TWEEDIE: sigma_inv = weights / (mu ** p) during optimization bc phi = 1
764 7 77727.0 11103.9 1.2 sigma_inv = 1.0 / self.variance(mu, phi=phi, weights=weights)
765 7 107635.0 15376.4 1.7 d1 = link.inverse_derivative(eta) # = h'(eta)
766 # Alternatively:
767 # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g
768 # d1 = 1./link.derivative(mu)
769 7 12412.0 1773.1 0.2 d1_sigma_inv = d1 * sigma_inv
770 7 19982.0 2854.6 0.3 temp = d1_sigma_inv * (y - mu)
771 7 19.0 2.7 0.0 if intercept:
772 7 78613.0 11230.4 1.2 score = np.concatenate(([temp.sum()], temp @ X))
773 else:
774 score = temp @ X
775
776 7 12680.0 1811.4 0.2 d2_sigma_inv = d1 * d1_sigma_inv
777 7 17.0 2.4 0.0 if diag_fisher:
778 fisher_matrix = d2_sigma_inv
779 else:
780 7 5807028.0 829575.4 92.0 fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, intercept=intercept)
781 7 12.0 1.7 0.0 return eta, mu, score, fisher_matrix
Total time: 0.02796 s
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py
Function: _cd_cycle at line 1131
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1131 @profile
1132 def _cd_cycle(
1133 d,
1134 X,
1135 coef,
1136 score,
1137 fisher,
1138 P1,
1139 P2,
1140 n_cycles,
1141 inner_tol,
1142 max_inner_iter=1000,
1143 selection="cyclic",
1144 random_state=None,
1145 diag_fisher=False,
1146 ):
1147 """Compute inner loop of coordinate descent, i.e. cycles through features.
1148
1149 Minimization of 1-d subproblems::
1150
1151 min_z q(d+z*e_j) - q(d)
1152 = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1
1153
1154 A = f'(w) + d*H(w) + (w+d)*P2
1155 B = H+P2
1156 Note: f'=-score and H=fisher are updated at the end of outer iteration.
1157 """
1158 # TODO: use sparsity (coefficient already 0 due to L1 penalty)
1159 # => active set of features for featurelist, see paper
1160 # of Improved GLMNET or Gap Safe Screening Rules
1161 # https://arxiv.org/abs/1611.05780
1162 7 24.0 3.4 0.1 n_samples, n_features = X.shape
1163 7 22.0 3.1 0.1 intercept = coef.size == X.shape[1] + 1
1164 7 10.0 1.4 0.0 idx = 1 if intercept else 0 # offset if coef[0] is intercept
1165 # ES: This is weird. Should this be B = fisher.copy()? Why would you do this?
1166 7 10.0 1.4 0.0 B = fisher
1167 7 14.0 2.0 0.1 if P2.ndim == 1:
1168 7 93.0 13.3 0.3 coef_P2 = coef[idx:] * P2
1169 7 10.0 1.4 0.0 if not diag_fisher:
1170 7 92.0 13.1 0.3 idiag = np.arange(start=idx, stop=B.shape[0])
1171 # B[np.diag_indices_from(B)] += P2
1172 7 198.0 28.3 0.7 B[(idiag, idiag)] += P2
1173 else:
1174 coef_P2 = coef[idx:] @ P2
1175 if not diag_fisher:
1176 if sparse.issparse(P2):
1177 B[idx:, idx:] += P2.toarray()
1178 else:
1179 B[idx:, idx:] += P2
1180 7 32.0 4.6 0.1 A = -score
1181 7 44.0 6.3 0.2 A[idx:] += coef_P2
1182 # A += d @ (H+P2) but so far d=0
1183 # inner loop
1184 7 33.0 4.7 0.1 for inner_iter in range(1, max_inner_iter + 1):
1185 7 13.0 1.9 0.0 inner_iter += 1
1186 7 11.0 1.6 0.0 n_cycles += 1
1187 # cycle through features, update intercept separately at the end
1188 7 15.0 2.1 0.1 if selection == "random":
1189 7 232.0 33.1 0.8 featurelist = random_state.permutation(n_features)
1190 else:
1191 featurelist = np.arange(n_features)
1192 756 1432.0 1.9 5.1 for j in featurelist:
1193 # minimize_z: a z + 1/2 b z^2 + c |d+z|
1194 # a = A_j
1195 # b = B_jj > 0
1196 # c = |P1_j| = P1_j > 0, see 1.3
1197 # d = w_j + d_j
1198 # cf. https://arxiv.org/abs/0708.1485 Eqs. (3) - (4)
1199 # with beta = z+d, beta_hat = d-a/b and gamma = c/b
1200 # z = 1/b * S(bd-a,c) - d
1201 # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding
1202 749 1523.0 2.0 5.4 jdx = j + idx # index for arrays containing entries for intercept
1203 749 1355.0 1.8 4.8 a = A[jdx]
1204 749 1101.0 1.5 3.9 if diag_fisher:
1205 # Note: fisher is ndarray of shape (n_samples,) => no idx
1206 # Calculate Bj = B[j, :] = B[:, j] as it is needed later anyway
1207 Bj = np.zeros_like(A)
1208 if intercept:
1209 Bj[0] = fisher.sum()
1210 if sparse.issparse(X):
1211 Bj[idx:] = _safe_toarray(
1212 X[:, j].transpose() @ X.multiply(fisher[:, np.newaxis])
1213 ).ravel()
1214 else:
1215 Bj[idx:] = (fisher * X[:, j]) @ X
1216
1217 if P2.ndim == 1:
1218 Bj[idx:] += P2[j]
1219 else:
1220 if sparse.issparse(P2):
1221 # slice columns as P2 is csc
1222 Bj[idx:] += P2[:, j].toarray().ravel()
1223 else:
1224 Bj[idx:] += P2[:, j]
1225 b = Bj[jdx]
1226 else:
1227 749 1421.0 1.9 5.1 b = B[jdx, jdx]
1228
1229 # those ten lines are what it is all about
1230 749 1513.0 2.0 5.4 if b <= 0:
1231 z = 0
1232 749 1674.0 2.2 6.0 elif P1[j] == 0:
1233 z = -a / b
1234 749 2334.0 3.1 8.3 elif a + P1[j] < b * (coef[jdx] + d[jdx]):
1235 z = -(a + P1[j]) / b
1236 749 2238.0 3.0 8.0 elif a - P1[j] > b * (coef[jdx] + d[jdx]):
1237 z = -(a - P1[j]) / b
1238 else:
1239 749 1744.0 2.3 6.2 z = -(coef[jdx] + d[jdx])
1240
1241 # update direction d
1242 749 1634.0 2.2 5.8 d[jdx] += z
1243 # update A because d_j is now d_j+z
1244 # A = f'(w) + d*H(w) + (w+d)*P2
1245 # => A += (H+P2)*e_j z = B_j * z
1246 # Note: B is symmetric B = B.transpose
1247 749 1117.0 1.5 4.0 if diag_fisher:
1248 # Bj = B[:, j] calculated above, still valid
1249 A += Bj * z
1250 else:
1251 # B is symmetric, C- or F-contiguous, but never sparse
1252 749 1478.0 2.0 5.3 if B.flags["F_CONTIGUOUS"]:
1253 # slice columns like for sparse csc
1254 749 5177.0 6.9 18.5 A += B[:, jdx] * z
1255 else: # B.flags['C_CONTIGUOUS'] might be true
1256 # slice rows
1257 A += B[jdx, :] * z
1258 # end of cycle over features
1259 # update intercept
1260 7 12.0 1.7 0.0 if intercept:
1261 7 9.0 1.3 0.0 if diag_fisher:
1262 Bj = np.zeros_like(A)
1263 Bj[0] = fisher.sum()
1264 Bj[1:] = fisher @ X
1265 b = Bj[0]
1266 else:
1267 7 15.0 2.1 0.1 b = B[0, 0]
1268 7 24.0 3.4 0.1 z = 0 if b <= 0 else -A[0] / b
1269 7 16.0 2.3 0.1 d[0] += z
1270 7 11.0 1.6 0.0 if diag_fisher:
1271 A += Bj * z
1272 else:
1273 7 12.0 1.7 0.0 if B.flags["F_CONTIGUOUS"]:
1274 7 47.0 6.7 0.2 A += B[:, 0] * z
1275 else:
1276 A += B[0, :] * z
1277 # end of complete cycle
1278 # stopping criterion for inner loop
1279 # sum_i(|minimum of norm of subgrad of q(d)_i|)
1280 # subgrad q(d) = A + subgrad ||P1*(w+d)||_1
1281 7 571.0 81.6 2.0 mn_subgrad = _min_norm_sugrad(coef=coef + d, grad=A, P2=None, P1=P1)
1282 7 605.0 86.4 2.2 mn_subgrad = linalg.norm(mn_subgrad, ord=1)
1283 7 14.0 2.0 0.1 if mn_subgrad <= inner_tol:
1284 7 11.0 1.6 0.0 if inner_iter == 1:
1285 inner_tol = inner_tol / 4.0
1286 7 9.0 1.3 0.0 break
1287 # end of inner loop
1288 7 10.0 1.4 0.0 return d, coef_P2, n_cycles, inner_tol
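For reference, the closed-form minimizer that the cycle above applies per
coordinate (the branch at source lines 1230-1239), restated as a standalone
function; a sketch for illustration only:

def cd_1d_update(a, b, c, wd):
    """Minimize a*z + 0.5*b*z**2 + c*abs(wd + z) over z, with c = P1_j >= 0."""
    if b <= 0:
        return 0.0
    if c == 0:
        return -a / b
    if a + c < b * wd:
        return -(a + c) / b
    if a - c > b * wd:
        return -(a - c) / b
    return -wd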
Total time: 6.94774 s
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py
Function: _cd_solver at line 1291
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1291 @profile
1292 def _cd_solver(
1293 coef,
1294 X,
1295 y,
1296 weights,
1297 P1,
1298 P2,
1299 fit_intercept,
1300 family,
1301 link,
1302 max_iter=100,
1303 max_inner_iter=1000,
1304 tol=1e-4,
1305 selection="cyclic",
1306 random_state=None,
1307 diag_fisher=False,
1308 copy_X=True,
1309 ):
1310 """Solve GLM with L1 and L2 penalty by coordinate descent algorithm.
1311
1312 The objective being minimized in the coefficients w=coef is::
1313
1314 F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1
1315
1316 An Improved GLMNET for L1-regularized Logistic Regression:
1317
1318 1. Find optimal descent direction d by minimizing
1319 min_d F(w+d) = min_d F(w+d) - F(w)
1320 2. Quadratic approximation of F(w+d)-F(w) = q(d):
1321 using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives:
1322 q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d
1323 + ||P1*(w+d)||_1 - ||P1*w||_1
1324 Then minimize q(d): min_d q(d)
1325 3. Coordinate descent by updating coordinate j (d -> d+z*e_j):
1326 min_z q(d+z*e_j)
1327 = min_z q(d+z*e_j) - q(d)
1328 = min_z A_j z + 1/2 B_jj z^2
1329 + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1
1330 A = f'(w) + d*H(w) + (w+d)*P2
1331 B = H + P2
1332
1333 Repeat steps 1-3 until convergence.
1334 Note: Use Fisher matrix instead of Hessian for H.
1335 Note: f' = -score, H = Fisher matrix
1336
1337 Parameters
1338 ----------
1339 coef : ndarray, shape (c,)
1340 If fit_intercept=False, shape c=X.shape[1].
1341 If fit_intercept=True, then c=X.shape[1] + 1.
1342
1343 X : {ndarray, csc sparse matrix}, shape (n_samples, n_features)
1344 Training data (with intercept included if present). If not sparse,
1345 pass directly as Fortran-contiguous data to avoid
1346 unnecessary memory duplication.
1347
1348 y : ndarray, shape (n_samples,)
1349 Target values.
1350
1351 weights: ndarray, shape (n_samples,)
1352 Sample weights with which the deviance is weighted. The weights must
1353 be normalized and sum to 1.
1354
1355 P1 : {ndarray}, shape (n_features,)
1356 The L1-penalty vector (=diagonal matrix)
1357
1358 P2 : {ndarray, csc sparse matrix}, shape (n_features, n_features)
1359 The L2-penalty matrix or vector (=diagonal matrix). If a matrix is
1360 passed, it must be symmetric. If X is sparse, P2 must also be sparse.
1361
1362 fit_intercept : boolean, optional (default=True)
1363 Specifies if a constant (a.k.a. bias or intercept) should be
1364 added to the linear predictor (X*coef+intercept).
1365
1366 family : ExponentialDispersionModel
1367
1368 link : Link
1369
1370 max_iter : int, optional (default=100)
1371 Maximum number of outer (Newton) iterations.
1372
1373 max_inner_iter : int, optional (default=1000)
1374 Maximum number of iterations in each inner loop, i.e. max number of
1375 cycles over all features per inner loop.
1376
1377 tol : float, optional (default=1e-4)
1378 Convergence criterion is
1379 sum_i(|minimum of norm of subgrad of objective_i|)<=tol.
1380
1381 selection : str, optional (default='cyclic')
1382 If 'random', randomly choose features in inner loop.
1383
1384 random_state : {int, RandomState instance, None}, optional (default=None)
1385
1386 diag_fisher : boolean, optional (default=False)
1387 ``False`` calculates full Fisher matrix, ``True`` only diagonal matrix
1388 s.t. fisher = X.T @ diag @ X. This saves storage but needs more
1389 matrix-vector multiplications.
1390
1391 copy_X : boolean, optional (default=True)
1392 If ``True``, X will be copied; else, it may be overwritten.
1393
1394 Returns
1395 -------
1396 coef : ndarray, shape (c,)
1397 If fit_intercept=False, shape c=X.shape[1].
1398 If fit_intercept=True, then c=X.shape[1] + 1.
1399
1400 n_iter : number of outer iterations = Newton iterations
1401
1402 n_cycles : number of cycles over features
1403
1404 References
1405 ----------
1406 Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin
1407 An Improved GLMNET for L1-regularized Logistic Regression,
1408 Journal of Machine Learning Research 13 (2012) 1999-2030
1409 https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf
1410 """
1411 1 4.0 4.0 0.0 if type(X) is not ColScaledSpMat:
1412 # TODO: because copy_X is being passed here and is also being passed in
1413 # check_X_y in fit(...), X is being copied twice. Check if this is true.
1414 2 279886.0 139943.0 4.0 X = check_array(
1415 1 4.0 4.0 0.0 X, "csc", dtype=[np.float64, np.float32], order="F", copy=copy_X
1416 )
1417 1 7.0 7.0 0.0 if P2.ndim == 2:
1418 P2 = check_array(
1419 P2, "csc", dtype=[np.float64, np.float32], order="F", copy=copy_X
1420 )
1421 1 9.0 9.0 0.0 if sparse.issparse(X):
1422 if not sparse.isspmatrix_csc(P2):
1423 raise ValueError(
1424 "If X is sparse, P2 must also be sparse csc "
1425 "format. Got P2 not sparse."
1426 )
1427 1 30.0 30.0 0.0 random_state = check_random_state(random_state)
1428 # Note: we already set P2 = l2*P2, P1 = l1*P1
1429 # Note: we already symmetrized P2 = 1/2 (P2 + P2')
1430 1 2.0 2.0 0.0 n_iter = 0 # number of outer iterations
1431 1 2.0 2.0 0.0 n_cycles = 0 # number of (complete) cycles over features
1432 1 2.0 2.0 0.0 converged = False
1433 1 2.0 2.0 0.0 n_samples, n_features = X.shape
1434 1 2.0 2.0 0.0 idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept
1435 # line search parameters
1436 1 2.0 2.0 0.0 (beta, sigma) = (0.5, 0.01)
1437 # some precalculations
1438 # Note: For diag_fisher=False, fisher = X.T @ fisher @ X and fisher is a
1439 # 1d array representing a diagonal matrix.
1440 2 859869.0 429934.5 12.4 eta, mu, score, fisher = family._eta_mu_score_fisher(
1441 1 3.0 3.0 0.0 coef=coef, phi=1, X=X, y=y, weights=weights, link=link, diag_fisher=diag_fisher
1442 )
1443 # set up space for search direction d for inner loop
1444 1 50.0 50.0 0.0 d = np.zeros_like(coef)
1445
1446 # minimum subgradient norm
1447 1 4.0 4.0 0.0 def calc_mn_subgrad_norm():
1448 return linalg.norm(
1449 _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1), ord=1
1450 )
1451
1452 # the ratio of inner _cd_cycle tolerance to the minimum subgradient norm
1453 # This wasn't explored in the newGLMNET paper linked above.
1454 # That paper essentially uses inner_tol_ratio = 1.0, but using a slightly
1455 # lower value is much faster.
1456 # By comparison, the original GLMNET paper uses inner_tol = tol.
1457 # So, inner_tol_ratio < 1 is sort of a compromise between the two papers.
1458 # The value should probably be between 0.01 and 0.5. 0.1 works well for many problems
1459 1 3.0 3.0 0.0 inner_tol_ratio = 0.1
1460
1461 1 3.0 3.0 0.0 def calc_inner_tol(mn_subgrad_norm):
1462 # Another potential rule limits the inner tol to be no smaller than tol
1463 # return max(mn_subgrad_norm * inner_tol_ratio, tol)
1464 return mn_subgrad_norm * inner_tol_ratio
1465
1466 # initial stopping tolerance of inner loop
1467 # use L1-norm of minimum of norm of subgradient of F
1468 1 197.0 197.0 0.0 inner_tol = calc_inner_tol(calc_mn_subgrad_norm())
1469
1470 1 3.0 3.0 0.0 Fw = None
1471
1472 # outer loop
1473 6 19.0 3.2 0.0 while n_iter < max_iter:
1474 6 23751.0 3958.5 0.3 print(inner_tol, n_iter, n_cycles)
1475 6 45.0 7.5 0.0 n_iter += 1
1476 # initialize search direction d (to be optimized) with zero
1477 6 1800.0 300.0 0.0 d.fill(0)
1478 # inner loop = _cd_cycle
1479 12 41985.0 3498.8 0.6 d, coef_P2, n_cycles, inner_tol = _cd_cycle(
1480 6 18.0 3.0 0.0 d,
1481 6 18.0 3.0 0.0 X,
1482 6 19.0 3.2 0.0 coef,
1483 6 19.0 3.2 0.0 score,
1484 6 17.0 2.8 0.0 fisher,
1485 6 20.0 3.3 0.0 P1,
1486 6 892.0 148.7 0.0 P2,
1487 6 19.0 3.2 0.0 n_cycles,
1488 6 17.0 2.8 0.0 inner_tol,
1489 6 19.0 3.2 0.0 max_inner_iter=max_inner_iter,
1490 6 17.0 2.8 0.0 selection=selection,
1491 6 18.0 3.0 0.0 random_state=random_state,
1492 6 19.0 3.2 0.0 diag_fisher=diag_fisher,
1493 )
1494
1495 # line search by sequence beta^k, k=0, 1, ..
1496 # F(w + lambda d) - F(w) <= lambda * bound
1497 # bound = sigma * (f'(w)*d + w*P2*d
1498 # +||P1 (w+d)||_1 - ||P1 w||_1)
1499 6 303.0 50.5 0.0 P1w_1 = linalg.norm(P1 * coef[idx:], ord=1)
1500 6 282.0 47.0 0.0 P1wd_1 = linalg.norm(P1 * (coef + d)[idx:], ord=1)
1501 # Note: coef_P2 already calculated and still valid
1502 6 109.0 18.2 0.0 bound = sigma * (-(score @ d) + coef_P2 @ d[idx:] + P1wd_1 - P1w_1)
1503
1504 # In the first iteration, we must compute Fw explicitly.
1505 # In later iterations, we just use Fwd from the previous iteration
1506 # as set after the line search loop below.
1507 6 18.0 3.0 0.0 if Fw is None:
1508 1 3.0 3.0 0.0 Fw = (
1509 3 10457.0 3485.7 0.2 0.5 * family.deviance(y, mu, weights)
1510 1 21.0 21.0 0.0 + 0.5 * (coef_P2 @ coef[idx:])
1511 1 3.0 3.0 0.0 + P1w_1
1512 )
1513
1514 6 24.0 4.0 0.0 la = 1.0 / beta
1515
1516 6 83303.0 13883.8 1.2 X_dot_d = _safe_lin_pred(X, d)
1517
1518 # Try progressively shorter line search steps.
1519 6 53.0 8.8 0.0 for k in range(20):
1520 6 21.0 3.5 0.0 la *= beta # starts with la=1
1521 6 86.0 14.3 0.0 coef_wd = coef + la * d
1522
1523 # The simple version of the next line is:
1524 # mu_wd = link.inverse(_safe_lin_pred(X, coef_wd))
1525 # but because coef_wd can be factored as
1526 # coef_wd = coef + la * d
1527 # we can rewrite to only perform one dot product with the data
1528 # matrix per loop which is substantially faster
1529 6 123883.0 20647.2 1.8 mu_wd = link.inverse(eta + la * X_dot_d)
1530
1531 # TODO - optimize: for Tweedie that isn't one of the special cases
1532 # (gaussian, poisson, gamma), family.deviance is quite slow! Can we
1533 # fix that somehow?
1534 12 64803.0 5400.2 0.9 Fwd = 0.5 * family.deviance(y, mu_wd, weights) + linalg.norm(
1535 6 78.0 13.0 0.0 P1 * coef_wd[idx:], ord=1
1536 )
1537 6 19.0 3.2 0.0 if P2.ndim == 1:
1538 6 164.0 27.3 0.0 Fwd += 0.5 * ((coef_wd[idx:] * P2) @ coef_wd[idx:])
1539 else:
1540 Fwd += 0.5 * (coef_wd[idx:] @ (P2 @ coef_wd[idx:]))
1541 6 27.0 4.5 0.0 if Fwd - Fw <= sigma * la * bound:
1542 6 19.0 3.2 0.0 break
1543
1544 # Fw in the next iteration will be equal to Fwd this iteration.
1545 6 18.0 3.0 0.0 Fw = Fwd
1546
1547 # update coefficients
1548 6 76.0 12.7 0.0 coef += la * d
1549
1550 # calculate eta, mu, score, Fisher matrix for next iteration
1551 12 5453705.0 454475.4 78.5 eta, mu, score, fisher = family._eta_mu_score_fisher(
1552 6 17.0 2.8 0.0 coef=coef,
1553 6 17.0 2.8 0.0 phi=1,
1554 6 17.0 2.8 0.0 X=X,
1555 6 16.0 2.7 0.0 y=y,
1556 6 18.0 3.0 0.0 weights=weights,
1557 6 17.0 2.8 0.0 link=link,
1558 6 17.0 2.8 0.0 diag_fisher=diag_fisher,
1559 )
1560
1561 # stopping criterion for outer loop
1562 # sum_i(|minimum-norm of subgrad of F(w)_i|)
1563 # fp_wP2 = f'(w) + w*P2
1564 # Note: eta, mu and score are already updated
1565 # this also updates the inner tolerance for the next loop!
1566 6 1286.0 214.3 0.0 mn_subgrad_norm = calc_mn_subgrad_norm()
1567 6 29.0 4.8 0.0 if mn_subgrad_norm <= tol:
1568 1 3.0 3.0 0.0 converged = True
1569 1 3.0 3.0 0.0 break
1570
1571 5 27.0 5.4 0.0 inner_tol = calc_inner_tol(mn_subgrad_norm)
1572 # end of outer loop
1573
1574 1 3.0 3.0 0.0 if not converged:
1575 warnings.warn(
1576 "Coordinate descent failed to converge. Increase"
1577 " the maximum number of iterations max_iter"
1578 " (currently {})".format(max_iter),
1579 ConvergenceWarning,
1580 )
1581
1582 1 3.0 3.0 0.0 return coef, n_iter, n_cycles
Total time: 8.55864 s
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py
Function: fit at line 1884
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1884 @profile
1885 def fit(self, X, y, sample_weight=None):
1886 """Fit a Generalized Linear Model.
1887
1888 Parameters
1889 ----------
1890 X : {array-like, sparse matrix}, shape (n_samples, n_features)
1891 Training data.
1892
1893 y : array-like, shape (n_samples,)
1894 Target values.
1895
1896 sample_weight : {None, array-like}, shape (n_samples,),\
1897 optional (default=None)
1898 Individual weights w_i for each sample. Note that for an
1899 Exponential Dispersion Model (EDM), one has
1900 Var[Y_i]=phi/w_i * v(mu).
1901 If Y_i ~ EDM(mu, phi/w_i), then
1902 sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a
1903 weighted average with weights=sample_weight.
1904
1905 Returns
1906 -------
1907 self : returns an instance of self.
1908 """
1909 # 99.8% of time is in the _cd_solver function
1910 #######################################################################
1911 # 1. input validation #
1912 #######################################################################
1913 # 1.1 validate arguments of __init__ ##################################
1914 # Guarantee that self._family_instance is an instance of class
1915 # ExponentialDispersionModel
1916 1 56.0 56.0 0.0 if isinstance(self.family, ExponentialDispersionModel):
1917 self._family_instance = self.family
1918 else:
1919 1 5.0 5.0 0.0 if self.family == "normal":
1920 self._family_instance = NormalDistribution()
1921 1 5.0 5.0 0.0 elif self.family == "poisson":
1922 1 27.0 27.0 0.0 self._family_instance = PoissonDistribution()
1923 elif self.family == "gamma":
1924 self._family_instance = GammaDistribution()
1925 elif self.family == "inverse.gaussian":
1926 self._family_instance = InverseGaussianDistribution()
1927 elif self.family == "binomial":
1928 self._family_instance = BinomialDistribution()
1929 else:
1930 raise ValueError(
1931 "The family must be an instance of class"
1932 " ExponentialDispersionModel or an element of"
1933 " ['normal', 'poisson', 'gamma', 'inverse.gaussian', "
1934 "'binomial']; got (family={})".format(self.family)
1935 )
1936
1937 # Guarantee that self._link_instance is set to an instance of
1938 # class Link
1939 1 16.0 16.0 0.0 if isinstance(self.link, Link):
1940 self._link_instance = self.link
1941 else:
1942 1 5.0 5.0 0.0 if self.link == "auto":
1943 1 7.0 7.0 0.0 if isinstance(self._family_instance, TweedieDistribution):
1944 1 6.0 6.0 0.0 if self._family_instance.power <= 0:
1945 self._link_instance = IdentityLink()
1946 1 5.0 5.0 0.0 if self._family_instance.power >= 1:
1947 1 8.0 8.0 0.0 self._link_instance = LogLink()
1948 elif isinstance(self._family_instance, GeneralizedHyperbolicSecant):
1949 self._link_instance = IdentityLink()
1950 elif isinstance(self._family_instance, BinomialDistribution):
1951 self._link_instance = LogitLink()
1952 else:
1953 raise ValueError(
1954 "No default link known for the "
1955 "specified distribution family. Please "
1956 "set link manually, i.e. not to 'auto'; "
1957 "got (link='auto', family={}".format(self.family)
1958 )
1959 elif self.link == "identity":
1960 self._link_instance = IdentityLink()
1961 elif self.link == "log":
1962 self._link_instance = LogLink()
1963 elif self.link == "logit":
1964 self._link_instance = LogitLink()
1965 else:
1966 raise ValueError(
1967 "The link must be an instance of class Link or "
1968 "an element of ['auto', 'identity', 'log', 'logit']; "
1969 "got (link={})".format(self.link)
1970 )
1971
1972 1 15.0 15.0 0.0 if not isinstance(self.alpha, numbers.Number) or self.alpha < 0:
1973 raise ValueError(
1974 "Penalty term must be a non-negative number;"
1975 " got (alpha={})".format(self.alpha)
1976 )
1977 3 15.0 5.0 0.0 if (
1978 1 5.0 5.0 0.0 not isinstance(self.l1_ratio, numbers.Number)
1979 1 5.0 5.0 0.0 or self.l1_ratio < 0
1980 1 5.0 5.0 0.0 or self.l1_ratio > 1
1981 ):
1982 raise ValueError(
1983 "l1_ratio must be a number in interval [0, 1];"
1984 " got (l1_ratio={})".format(self.l1_ratio)
1985 )
1986 1 6.0 6.0 0.0 if not isinstance(self.fit_intercept, bool):
1987 raise ValueError(
1988 "The argument fit_intercept must be bool;"
1989 " got {}".format(self.fit_intercept)
1990 )
1991 1 5.0 5.0 0.0 if self.solver not in ["auto", "irls", "lbfgs", "newton-cg", "cd"]:
1992 raise ValueError(
1993 "GeneralizedLinearRegressor supports only solvers"
1994 " 'auto', 'irls', 'lbfgs', 'newton-cg' and 'cd';"
1995 " got {}".format(self.solver)
1996 )
1997 1 4.0 4.0 0.0 solver = self.solver
1998 1 5.0 5.0 0.0 if self.solver == "auto":
1999 1 5.0 5.0 0.0 if self.l1_ratio == 0:
2000 solver = "irls"
2001 else:
2002 1 5.0 5.0 0.0 solver = "cd"
2003 1 4.0 4.0 0.0 if self.alpha > 0 and self.l1_ratio > 0 and solver not in ["cd"]:
2004 raise ValueError(
2005 "The chosen solver (solver={}) can't deal "
2006 "with L1 penalties, which are included with "
2007 "(alpha={}) and (l1_ratio={}).".format(
2008 solver, self.alpha, self.l1_ratio
2009 )
2010 )
2011 1 5.0 5.0 0.0 if not isinstance(self.max_iter, int) or self.max_iter <= 0:
2012 raise ValueError(
2013 "Maximum number of iterations must be a positive "
2014 "integer;"
2015 " got (max_iter={!r})".format(self.max_iter)
2016 )
2017 1 6.0 6.0 0.0 if not isinstance(self.tol, numbers.Number) or self.tol <= 0:
2018 raise ValueError(
2019 "Tolerance for stopping criteria must be "
2020 "positive; got (tol={!r})".format(self.tol)
2021 )
2022 1 5.0 5.0 0.0 if not isinstance(self.warm_start, bool):
2023 raise ValueError(
2024 "The argument warm_start must be bool;"
2025 " got {}".format(self.warm_start)
2026 )
2027 1 4.0 4.0 0.0 if self.selection not in ["cyclic", "random"]:
2028 raise ValueError(
2029 "The argument selection must be 'cyclic' or "
2030 "'random'; got (selection={})".format(self.selection)
2031 )
2032 1 316.0 316.0 0.0 random_state = check_random_state(self.random_state)
2033 1 6.0 6.0 0.0 if not isinstance(self.diag_fisher, bool):
2034 raise ValueError(
2035 "The argument diag_fisher must be bool;"
2036 " got {}".format(self.diag_fisher)
2037 )
2038 1 5.0 5.0 0.0 if not isinstance(self.copy_X, bool):
2039 raise ValueError(
2040 "The argument copy_X must be bool; got {}".format(self.copy_X)
2041 )
2042 1 5.0 5.0 0.0 if not isinstance(self.check_input, bool):
2043 raise ValueError(
2044 "The argument check_input must be bool; got "
2045 "(check_input={})".format(self.check_input)
2046 )
2047
2048 1 4.0 4.0 0.0 family = self._family_instance
2049 1 5.0 5.0 0.0 link = self._link_instance
2050
2051 # 1.2 validate arguments of fit #######################################
2052 1 6.0 6.0 0.0 _dtype = [np.float64, np.float32]
2053 1 5.0 5.0 0.0 if solver == "cd":
2054 1 5.0 5.0 0.0 _stype = ["csc"]
2055 else:
2056 _stype = ["csc", "csr"]
2057 2 260252.0 130126.0 3.0 X, y = check_X_y(
2058 1 4.0 4.0 0.0 X,
2059 1 4.0 4.0 0.0 y,
2060 1 5.0 5.0 0.0 accept_sparse=_stype,
2061 1 5.0 5.0 0.0 dtype=_dtype,
2062 1 5.0 5.0 0.0 y_numeric=True,
2063 1 5.0 5.0 0.0 multi_output=False,
2064 1 5.0 5.0 0.0 copy=self.copy_X,
2065 )
2066 # Without converting y to float, deviance might raise
2067 # ValueError: Integers to negative integer powers are not allowed.
2068 # Also, y must not be sparse.
2069 1 10.0 10.0 0.0 y = np.asarray(y, dtype=np.float64)
2070
2071 1 1742.0 1742.0 0.0 weights = _check_weights(sample_weight, y.shape[0])
2072
2073 1 7.0 7.0 0.0 n_samples, n_features = X.shape
2074
2075 # 1.3 arguments to take special care ##################################
2076 # P1, P2, start_params
2077 1 6.0 6.0 0.0 if isinstance(self.P1, str) and self.P1 == "identity":
2078 1 25.0 25.0 0.0 P1 = np.ones(n_features)
2079 else:
2080 P1 = np.atleast_1d(self.P1)
2081 try:
2082 P1 = P1.astype(np.float64, casting="safe", copy=False)
2083 except TypeError:
2084 raise TypeError(
2085 "The given P1 cannot be converted to a numeric "
2086 "array; got (P1.dtype={}).".format(P1.dtype)
2087 )
2088 if (P1.ndim != 1) or (P1.shape[0] != n_features):
2089 raise ValueError(
2090 "P1 must be either 'identity' or a 1d array "
2091 "with the length of X.shape[1]; "
2092 "got (P1.shape[0]={}), "
2093 "needed (X.shape[1]={}).".format(P1.shape[0], n_features)
2094 )
2095 # If X is sparse, make P2 sparse, too.
2096 1 5.0 5.0 0.0 if isinstance(self.P2, str) and self.P2 == "identity":
2097 1 8.0 8.0 0.0 if sparse.issparse(X):
2098 P2 = (
2099 sparse.dia_matrix(
2100 (np.ones(n_features), 0), shape=(n_features, n_features)
2101 )
2102 ).tocsc()
2103 else:
2104 1 10.0 10.0 0.0 P2 = np.ones(n_features)
2105 else:
2106 P2 = check_array(
2107 self.P2, copy=True, accept_sparse=_stype, dtype=_dtype, ensure_2d=False
2108 )
2109 if P2.ndim == 1:
2110 P2 = np.asarray(P2)
2111 if P2.shape[0] != n_features:
2112 raise ValueError(
2113 "P2 should be a 1d array of shape "
2114 "(n_features,) with "
2115 "n_features=X.shape[1]; "
2116 "got (P2.shape=({},)), needed ({},)".format(
2117 P2.shape[0], X.shape[1]
2118 )
2119 )
2120 if sparse.issparse(X):
2121 P2 = (
2122 sparse.dia_matrix((P2, 0), shape=(n_features, n_features))
2123 ).tocsc()
2124 elif (
2125 P2.ndim == 2
2126 and P2.shape[0] == P2.shape[1]
2127 and P2.shape[0] == X.shape[1]
2128 ):
2129 if sparse.issparse(X):
2130 P2 = sparse.csc_matrix(P2)
2131 else:
2132 raise ValueError(
2133 "P2 must be either None or an array of shape "
2134 "(n_features, n_features) with "
2135 "n_features=X.shape[1]; "
2136 "got (P2.shape=({0}, {1})), needed ({2}, {2})".format(
2137 P2.shape[0], P2.shape[1], X.shape[1]
2138 )
2139 )
2140
2141 1 6.0 6.0 0.0 start_params = self.start_params
2142 1 5.0 5.0 0.0 if isinstance(start_params, str):
2143 1 5.0 5.0 0.0 if start_params not in ["guess", "zero"]:
2144 raise ValueError(
2145 "The argument start_params must be 'guess', "
2146 "'zero' or an array of correct length; "
2147 "got (start_params={})".format(start_params)
2148 )
2149 else:
2150 start_params = check_array(
2151 start_params,
2152 accept_sparse=False,
2153 force_all_finite=True,
2154 ensure_2d=False,
2155 dtype=_dtype,
2156 copy=True,
2157 )
2158 if (start_params.shape[0] != X.shape[1] + self.fit_intercept) or (
2159 start_params.ndim != 1
2160 ):
2161 raise ValueError(
2162 "Start values for parameters must have the "
2163 "right length and dimension; required (length"
2164 "={}, ndim=1); got (length={}, ndim={}).".format(
2165 X.shape[1] + self.fit_intercept,
2166 start_params.shape[0],
2167 start_params.ndim,
2168 )
2169 )
2170
2171 1 6.0 6.0 0.0 l1 = self.alpha * self.l1_ratio
2172 1 6.0 6.0 0.0 l2 = self.alpha * (1 - self.l1_ratio)
2173 # P1 and P2 are now for sure copies
2174 1 11.0 11.0 0.0 P1 = l1 * P1
2175 1 6.0 6.0 0.0 P2 = l2 * P2
2176 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2')
2177 # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric
2178 1 5.0 5.0 0.0 if P2.ndim == 2:
2179 if sparse.issparse(P2):
2180 if sparse.isspmatrix_csc(P2):
2181 P2 = 0.5 * (P2 + P2.transpose()).tocsc()
2182 else:
2183 P2 = 0.5 * (P2 + P2.transpose()).tocsr()
2184 else:
2185 P2 = 0.5 * (P2 + P2.T)
2186
2187 # For coordinate descent, if X is sparse, P2 must also be csc
2188 1 5.0 5.0 0.0 if solver == "cd" and sparse.issparse(X):
2189 P2 = sparse.csc_matrix(P2)
2190
2191 # 1.4 additional validations ##########################################
2192 1 5.0 5.0 0.0 if self.check_input:
2193 1 1086.0 1086.0 0.0 if not np.all(family.in_y_range(y)):
2194 raise ValueError(
2195 "Some value(s) of y are out of the valid "
2196 "range for family {}".format(family.__class__.__name__)
2197 )
2198 # check if P1 has only non-negative values, negative values might
2199 # indicate group lasso in the future.
2200 1 6.0 6.0 0.0 if not isinstance(self.P1, str): # if self.P1 != 'identity':
2201 if not np.all(P1 >= 0):
2202 raise ValueError("P1 must not have negative values.")
2203 # check if P2 is positive semidefinite
2204 # np.linalg.cholesky(P2) 'only' asserts positive definite
2205 1 5.0 5.0 0.0 if not isinstance(self.P2, str): # self.P2 != 'identity'
2206 # due to numerical precision, we allow eigenvalues to be a
2207 # tiny bit negative
2208 epsneg = -10 * np.finfo(P2.dtype).epsneg
2209 if P2.ndim == 1 or P2.shape[0] == 1:
2210 p2 = P2
2211 if sparse.issparse(P2):
2212 p2 = P2.toarray()
2213 if not np.all(p2 >= 0):
2214 raise ValueError(
2215 "1d array P2 must not have negative " "values."
2216 )
2217 elif sparse.issparse(P2):
2218 # for sparse matrices, not all eigenvals can be computed
2219 # efficiently, use only half of n_features
2220 # k = how many eigenvals to compute
2221 k = np.min([10, n_features // 10 + 1])
2222 sigma = -1000 * epsneg # start searching near this value
2223 which = "SA" # find smallest algebraic eigenvalues first
2224 eigenvalues = splinalg.eigsh(
2225 P2, k=k, sigma=sigma, which=which, return_eigenvectors=False
2226 )
2227 if not np.all(eigenvalues >= epsneg):
2228 raise ValueError("P2 must be positive semi-definite.")
2229 else:
2230 if not np.all(linalg.eigvalsh(P2) >= epsneg):
2231 raise ValueError("P2 must be positive semi-definite.")
2232 # TODO: if alpha=0 check that X is not rank deficient
2233 # TODO: what else to check?
2234
2235 #######################################################################
2236 # 2a. rescaling of weights (sample_weight) #
2237 #######################################################################
2238 # IMPORTANT NOTE: Since we want to minimize
2239 # 1/(2*sum(sample_weight)) * deviance + L1 + L2,
2240 # deviance = sum(sample_weight * unit_deviance),
2241 # we rescale weights such that sum(weights) = 1 and this becomes
2242 # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance)
2243 1 351.0 351.0 0.0 weights_sum = np.sum(weights)
2244 1 1916.0 1916.0 0.0 weights = weights / weights_sum
2245
2246 #######################################################################
2247 # 2b. potentially rescale predictors
2248 #######################################################################
2249 1 7.0 7.0 0.0 if self.standardize:
2250 1 356277.0 356277.0 4.2 X, P1, P2, col_means, col_stds = _standardize(X, P1, P2, weights)
2251
2252 #######################################################################
2253 # 3. initialization of coef = (intercept_, coef_) #
2254 #######################################################################
2255 # Note: Since phi=self.dispersion_ does not enter the estimation
2256 # of mu_i=E[y_i], set it to 1.
2257
2258 # set start values for coef
2259 1 11.0 11.0 0.0 if self.warm_start and hasattr(self, "coef_"):
2260 coef = self.coef_
2261 intercept = self.intercept_
2262 if self.fit_intercept:
2263 coef = np.concatenate((np.array([intercept]), coef))
2264 if self.standardize:
2265 _standardize_warm_start(coef, col_means, col_stds)
2266 1 9.0 9.0 0.0 elif isinstance(start_params, str):
2267 1 8.0 8.0 0.0 if start_params == "guess":
2268 # Set mu=starting_mu of the family and do one Newton step
2269 # If solver=cd use cd, else irls
2270 1 7813.0 7813.0 0.1 mu = family.starting_mu(y, weights=weights)
2271 1 21668.0 21668.0 0.3 eta = link.link(mu) # linear predictor
2272 1 11.0 11.0 0.0 if solver in ["cd", "lbfgs", "newton-cg"]:
2273 # see function _cd_solver
2274 1 14785.0 14785.0 0.2 sigma_inv = 1 / family.variance(mu, phi=1, weights=weights)
2275 1 17775.0 17775.0 0.2 d1 = link.inverse_derivative(eta)
2276 1 7121.0 7121.0 0.1 temp = sigma_inv * d1 * (y - mu)
2277 1 11.0 11.0 0.0 if self.fit_intercept:
2278 1 11419.0 11419.0 0.1 score = np.concatenate(([temp.sum()], temp @ X))
2279 else:
2280 score = temp @ X # same as X.T @ temp
2281
2282 1 1965.0 1965.0 0.0 d2_sigma_inv = d1 * d1 * sigma_inv
2283 1 9.0 9.0 0.0 diag_fisher = self.diag_fisher
2284 1 7.0 7.0 0.0 if diag_fisher:
2285 fisher = d2_sigma_inv
2286 else:
2287 2 766539.0 383269.5 9.0 fisher = _safe_sandwich_dot(
2288 1 7.0 7.0 0.0 X, d2_sigma_inv, intercept=self.fit_intercept
2289 )
2290 # set up space for search direction d for inner loop
2291 1 9.0 9.0 0.0 if self.fit_intercept:
2292 1 20.0 20.0 0.0 coef = np.zeros(n_features + 1)
2293 else:
2294 coef = np.zeros(n_features)
2295 1 65.0 65.0 0.0 d = np.zeros_like(coef)
2296 # initial stopping tolerance of inner loop
2297 # use L1-norm of minimum of norm of subgradient of F
2298 # use less restrictive tolerance for initial guess
2299 1 122.0 122.0 0.0 inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1)
2300 1 114.0 114.0 0.0 inner_tol = 4 * linalg.norm(inner_tol, ord=1)
2301 # just one outer loop = Newton step
2302 1 9.0 9.0 0.0 n_cycles = 0
2303 2 6689.0 3344.5 0.1 d, coef_P2, n_cycles, inner_tol = _cd_cycle(
2304 1 8.0 8.0 0.0 d,
2305 1 7.0 7.0 0.0 X,
2306 1 7.0 7.0 0.0 coef,
2307 1 8.0 8.0 0.0 score,
2308 1 8.0 8.0 0.0 fisher,
2309 1 8.0 8.0 0.0 P1,
2310 1 8.0 8.0 0.0 P2,
2311 1 8.0 8.0 0.0 n_cycles,
2312 1 8.0 8.0 0.0 inner_tol,
2313 1 8.0 8.0 0.0 max_inner_iter=1000,
2314 1 9.0 9.0 0.0 selection=self.selection,
2315 1 8.0 8.0 0.0 random_state=random_state,
2316 1 8.0 8.0 0.0 diag_fisher=self.diag_fisher,
2317 )
2318 1 13.0 13.0 0.0 coef += d # for simplicity no line search here
2319 else:
2320 # See _irls_solver
2321 # h'(eta)
2322 hp = link.inverse_derivative(eta)
2323 # working weights W, in principle a diagonal matrix
2324 # therefore here just as 1d array
2325 W = hp ** 2 / family.variance(mu, phi=1, weights=weights)
2326 # working observations
2327 z = eta + (y - mu) / hp
2328 # solve A*coef = b
2329 # A = X' W X + l2 P2, b = X' W z
2330 coef = _irls_step(X, W, P2, z, fit_intercept=self.fit_intercept)
2331 else: # start_params == 'zero'
2332 if self.fit_intercept:
2333 coef = np.zeros(n_features + 1)
2334 coef[0] = link.link(np.average(y, weights=weights))
2335 else:
2336 coef = np.zeros(n_features)
2337 else: # assign given array as start values
2338 coef = start_params
2339 if self.standardize:
2340 _standardize_warm_start(coef, col_means, col_stds)
2341
2342 #######################################################################
2343 # 4. fit #
2344 #######################################################################
2345 # algorithms for optimization
2346 # TODO: Parallelize it?
2347
2348 # 4.1 IRLS ############################################################ | |
2349 # Note: we already set P2 = l2*P2, see above | |
2350 # Note: we already symmetrized P2 = 1/2 (P2 + P2') | |
2351 1 8.0 8.0 0.0 if solver == "irls": | |
2352 coef, self.n_iter_ = _irls_solver( | |
2353 coef=coef, | |
2354 X=X, | |
2355 y=y, | |
2356 weights=weights, | |
2357 P2=P2, | |
2358 fit_intercept=self.fit_intercept, | |
2359 family=family, | |
2360 link=link, | |
2361 max_iter=self.max_iter, | |
2362 tol=self.tol, | |
2363 ) | |
2364 | |
2365 # 4.2 L-BFGS ########################################################## | |
2366 1 8.0 8.0 0.0 elif solver == "lbfgs": | |
2367 | |
2368 def func(coef, X, y, weights, P2, family, link): | |
2369 mu, devp = family._mu_deviance_derivative(coef, X, y, weights, link) | |
2370 dev = family.deviance(y, mu, weights) | |
2371 intercept = coef.size == X.shape[1] + 1 | |
2372 idx = 1 if intercept else 0 # offset if coef[0] is intercept | |
2373 if P2.ndim == 1: | |
2374 L2 = P2 * coef[idx:] | |
2375 else: | |
2376 L2 = P2 @ coef[idx:] | |
2377 obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2) | |
2378 objp = 0.5 * devp | |
2379 objp[idx:] += L2 | |
2380 return obj, objp | |
2381 | |
2382 args = (X, y, weights, P2, family, link) | |
2383 coef, loss, info = fmin_l_bfgs_b( | |
2384 func, | |
2385 coef, | |
2386 fprime=None, | |
2387 args=args, | |
2388 iprint=(self.verbose > 0) - 1, | |
2389 pgtol=self.tol, | |
2390 maxiter=self.max_iter, | |
2391 factr=1e3, | |
2392 ) | |
2393 if info["warnflag"] == 1: | |
2394 warnings.warn( | |
2395 "lbfgs failed to converge." " Increase the number of iterations.", | |
2396 ConvergenceWarning, | |
2397 ) | |
2398 elif info["warnflag"] == 2: | |
2399 warnings.warn("lbfgs failed for the reason: {}".format(info["task"])) | |
2400 self.n_iter_ = info["nit"] | |
2401 | |
2402 # 4.3 Newton-CG ####################################################### | |
2403 # We again use the Fisher matrix instead of the Hessian; more | |
2404 # precisely, the expected Hessian of the deviance. | |
2405 1 8.0 8.0 0.0 elif solver == "newton-cg": | |
2406 | |
2407 def func(coef, X, y, weights, P2, family, link): | |
2408 intercept = coef.size == X.shape[1] + 1 | |
2409 idx = 1 if intercept else 0 # offset if coef[0] is intercept | |
2410 if P2.ndim == 1: | |
2411 L2 = coef[idx:] @ (P2 * coef[idx:]) | |
2412 else: | |
2413 L2 = coef[idx:] @ (P2 @ coef[idx:]) | |
2414 mu = link.inverse(_safe_lin_pred(X, coef)) | |
2415 return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2 | |
2416 | |
2417 def grad(coef, X, y, weights, P2, family, link): | |
2418 mu, devp = family._mu_deviance_derivative(coef, X, y, weights, link) | |
2419 intercept = coef.size == X.shape[1] + 1 | |
2420 idx = 1 if intercept else 0 # offset if coef[0] is intercept | |
2421 if P2.ndim == 1: | |
2422 L2 = P2 * coef[idx:] | |
2423 else: | |
2424 L2 = P2 @ coef[idx:] | |
2425 objp = 0.5 * devp | |
2426 objp[idx:] += L2 | |
2427 return objp | |
2428 | |
2429 def grad_hess(coef, X, y, weights, P2, family, link): | |
2430 intercept = coef.size == X.shape[1] + 1 | |
2431 idx = 1 if intercept else 0 # offset if coef[0] is intercept | |
2432 if P2.ndim == 1: | |
2433 L2 = P2 * coef[idx:] | |
2434 else: | |
2435 L2 = P2 @ coef[idx:] | |
2436 eta = _safe_lin_pred(X, coef) | |
2437 mu = link.inverse(eta) | |
2438 d1 = link.inverse_derivative(eta) | |
2439 temp = d1 * family.deviance_derivative(y, mu, weights) | |
2440 if intercept: | |
2441 grad = np.concatenate(([0.5 * temp.sum()], 0.5 * temp @ X + L2)) | |
2442 else: | |
2443 grad = 0.5 * temp @ X + L2 # same as 0.5* X.T @ temp + L2 | |
2444 | |
2445 # expected hessian = fisher = X.T @ diag_matrix @ X | |
2446 # calculate only diag_matrix | |
2447 diag = d1 ** 2 / family.variance(mu, phi=1, weights=weights) | |
2448 if intercept: | |
2449 h0i = np.concatenate(([diag.sum()], diag @ X)) | |
2450 | |
2451 def Hs(coef): | |
2452 # return (0.5 * fisher + P2) @ coef | |
2453 # ret = 0.5 * (X.T @ (diag * (X @ coef))) | |
2454 ret = 0.5 * ((diag * (X @ coef[idx:])) @ X) | |
2455 if P2.ndim == 1: | |
2456 ret += P2 * coef[idx:] | |
2457 else: | |
2458 ret += P2 @ coef[idx:] | |
2459 if intercept: | |
2460 ret = np.concatenate( | |
2461 ([0.5 * (h0i @ coef)], ret + 0.5 * coef[0] * h0i[1:]) | |
2462 ) | |
2463 return ret | |
2464 | |
2465 return grad, Hs | |
2466 | |
2467 args = (X, y, weights, P2, family, link) | |
2468 coef, self.n_iter_ = newton_cg( | |
2469 grad_hess, | |
2470 func, | |
2471 grad, | |
2472 coef, | |
2473 args=args, | |
2474 maxiter=self.max_iter, | |
2475 tol=self.tol, | |
2476 ) | |
2477 | |
2478 # 4.4 coordinate descent ############################################## | |
2479 # Note: we already set P1 = l1*P1, see above | |
2480 # Note: we already set P2 = l2*P2, see above | |
2481 # Note: we already symmetrized P2 = 1/2 (P2 + P2') | |
2482 1 8.0 8.0 0.0 elif solver == "cd": | |
2483 2 6952782.0 3476391.0 81.2 coef, self.n_iter_, self._n_cycles = _cd_solver( | |
2484 1 8.0 8.0 0.0 coef=coef, | |
2485 1 8.0 8.0 0.0 X=X, | |
2486 1 8.0 8.0 0.0 y=y, | |
2487 1 9.0 9.0 0.0 weights=weights, | |
2488 1 9.0 9.0 0.0 P1=P1, | |
2489 1 8.0 8.0 0.0 P2=P2, | |
2490 1 8.0 8.0 0.0 fit_intercept=self.fit_intercept, | |
2491 1 8.0 8.0 0.0 family=family, | |
2492 1 8.0 8.0 0.0 link=link, | |
2493 1 8.0 8.0 0.0 max_iter=self.max_iter, | |
2494 1 8.0 8.0 0.0 tol=self.tol, | |
2495 1 8.0 8.0 0.0 selection=self.selection, | |
2496 1 8.0 8.0 0.0 random_state=random_state, | |
2497 1 8.0 8.0 0.0 diag_fisher=self.diag_fisher, | |
2498 1 8.0 8.0 0.0 copy_X=self.copy_X, | |
2499 ) | |
2500 | |
2501 ####################################################################### | |
2502 # 5a. handle intercept | |
2503 ####################################################################### | |
2504 1 12.0 12.0 0.0 if self.fit_intercept: | |
2505 1 12.0 12.0 0.0 self.intercept_ = coef[0] | |
2506 1 13.0 13.0 0.0 self.coef_ = coef[1:] | |
2507 else: | |
2508 # set intercept to zero as the other linear models do | |
2509 self.intercept_ = 0.0 | |
2510 self.coef_ = coef | |
2511 | |
2512 ####################################################################### | |
2513 # 5b. undo standardization | |
2514 ####################################################################### | |
2515 1 9.0 9.0 0.0 if self.standardize: | |
2516 # TODO: Make sure this doesn't copy X | |
2517 2 126923.0 63461.5 1.5 X, self.intercept_, self.coef_ = _unstandardize( | |
2518 1 9.0 9.0 0.0 X, col_means, col_stds, self.intercept_, self.coef_ | |
2519 ) | |
2520 | |
2521 1 13.0 13.0 0.0 if self.fit_dispersion in ["chisqr", "deviance"]: | |
2522 # attention because of rescaling of weights | |
2523 self.dispersion_ = self.estimate_phi(X, y, weights) * weights_sum | |
2524 | |
2525 1 7.0 7.0 0.0 return self | |
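The listing above ends with step 5b mapping the coefficients fit on standardized data back to the raw scale. A minimal sketch of the algebra this step has to apply, assuming `_unstandardize` performs the usual centering/scaling inversion (the helper's actual signature also returns X and is not shown here):

import numpy as np

# If the model was fit on X_std = (X - col_means) / col_stds with
# parameters (intercept, coef), the equivalent model on raw X is:
def unstandardize(col_means, col_stds, intercept, coef):
    coef = coef / col_stds                    # rescale slopes back to raw units
    intercept = intercept - col_means @ coef  # absorb the centering shift
    return intercept, coef

col_means, col_stds = np.array([1.0, 5.0]), np.array([2.0, 0.5])
print(unstandardize(col_means, col_stds, 0.3, np.array([0.4, -0.2])))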
Dim of data: (1000000, 107) | |
Sparseness: 0.683 | |
Size of data frame: 0.80 GB | |
********************************* | |
alpha=1.0, l1_ratio=0.5, sparse=True | |
Creating sparse matrix... | |
Using new implementation... | |
0.09618600747421668 0 0 | |
0.034404141330196404 1 1 | |
0.011721161752972374 2 2 | |
0.0034913993816849454 3 3 | |
0.0007158709705561006 4 4 | |
5.3700190453114465e-05 5 5 | |
44.3958 seconds | |
gini=0.1111423209994666 | |
mean outcome: 0.04787563746463126 | |
mean prediction: 0.046850885751455344 | |
Wrote profile results to glm_qc.py.lprof | |
Timer unit: 1e-06 s | |
Total time: 42.2249 s | |
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py | |
Function: _safe_sandwich_dot at line 105 | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
105 @profile | |
106 def _safe_sandwich_dot(X, d, intercept=False): | |
107 """Compute sandwich product X.T @ diag(d) @ X. | |
108 | |
109 With ``intercept=True``, X is treated as if a column of ones were appended as the | |
110 first column of X. | |
111 X can be sparse, d must be an ndarray. Always returns a ndarray.""" | |
112 8 44.0 5.5 0.0 if sparse.issparse(X): | |
113 # TODO: There's a bit of room to accelerate this by avoiding the | |
114 # allocation of a new sparse matrix every time we pass through this | |
115 # step. The X.multiply creates a new matrix. Avoiding the allocation | |
116 # and updating the rows in place accelerates this line by ~20%. | |
117 temp = X.transpose() @ X.multiply(d[:, np.newaxis]) | |
118 # for older versions of numpy and scipy, temp may be a np.matrix | |
119 temp = _safe_toarray(temp) | |
120 8 15.0 1.9 0.0 elif type(X) is ColScaledSpMat: | |
121 8 41538672.0 5192334.0 98.4 term1 = X.mat.transpose() @ X.mat.multiply(d[:, np.newaxis]) | |
122 8 341964.0 42745.5 0.8 term2 = X.mat.transpose().dot(d)[:, np.newaxis] * X.shift | |
123 8 34.0 4.2 0.0 term3 = term2.T | |
124 8 2622.0 327.8 0.0 term4 = (X.shift.T * X.shift) * d.sum() | |
125 8 3603.0 450.4 0.0 temp = term1 + term2 + term3 + term4 | |
126 8 86.0 10.8 0.0 temp = _safe_toarray(temp) | |
127 else: | |
128 temp = (X.T * d) @ X | |
129 8 7.0 0.9 0.0 if intercept: | |
130 8 11.0 1.4 0.0 dim = X.shape[1] + 1 | |
131 8 10.0 1.2 0.0 if type(X) is ColScaledSpMat: | |
132 8 6.0 0.8 0.0 order = "F" | |
133 elif sparse.issparse(X): | |
134 order = "F" if sparse.isspmatrix_csc(X) else "C" | |
135 else: | |
136 order = "F" if X.flags["F_CONTIGUOUS"] else "C" | |
137 8 42.0 5.2 0.0 res = np.empty((dim, dim), dtype=max(X.dtype, d.dtype), order=order) | |
138 8 2157.0 269.6 0.0 res[0, 0] = d.sum() | |
139 8 335305.0 41913.1 0.8 res[1:, 0] = d @ X | |
140 8 83.0 10.4 0.0 res[0, 1:] = res[1:, 0] | |
141 8 233.0 29.1 0.0 res[1:, 1:] = temp | |
142 else: | |
143 res = temp | |
144 8 7.0 0.9 0.0 return res | |
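Nearly all of this function's time (98.4%) sits in the `X.mat.transpose() @ X.mat.multiply(...)` line, i.e. the sandwich product itself. A minimal standalone sketch of the operation being profiled, for plain dense and sparse matrices; this is just the mathematical operation, not the ColScaledSpMat fast path or the intercept handling:

import numpy as np
import scipy.sparse as sparse

def sandwich_dot(X, d):
    # X.T @ diag(d) @ X without materializing the n x n diagonal matrix.
    if sparse.issparse(X):
        # X.multiply allocates a fresh sparse matrix on every call, which
        # is exactly the overhead the TODO in the listing points at.
        return np.asarray((X.T @ X.multiply(d[:, np.newaxis])).todense())
    return (X.T * d) @ X  # dense case: scale the rows of X, then one GEMM

rng = np.random.default_rng(0)
X = sparse.random(1000, 20, density=0.3, format="csc", random_state=0)
d = rng.uniform(0.1, 1.0, size=1000)
H = sandwich_dot(X, d)  # (20, 20), symmetric positive semi-definite
assert np.allclose(H, H.T)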
Total time: 37.7536 s | |
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py | |
Function: _eta_mu_score_fisher at line 730 | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
730 @profile | |
731 def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, diag_fisher=False): | |
732 """Compute linear predictor, mean, score function and fisher matrix. | |
733 | |
734 It calculates the linear predictor, the mean, score function | |
735 (derivative of log-likelihood) and Fisher information matrix | |
736 all in one go as function of `coef` (:math:`w`) and the data. | |
737 | |
738 Parameters | |
739 ---------- | |
740 diag_fisher : boolean, optional (default=False) | |
741 If ``True``, returns only an array d such that | |
742 fisher = X.T @ np.diag(d) @ X. | |
743 | |
744 Returns | |
745 ------- | |
746 (eta, mu, score, fisher) : tuple with 4 elements | |
747 The 4 elements are: | |
748 | |
749 * eta: ndarray, shape (X.shape[0],) | |
750 * mu: ndarray, shape (X.shape[0],) | |
751 * score: ndarray, shape (X.shape[0],) | |
752 * fisher: | |
753 | |
754 * If diag_fisher is ``False``, the full fisher matrix, | |
755 an array of shape (X.shape[1], X.shape[1]) | |
756 * If diag_fisher is ``True``, an array of shape (X.shape[0],) | |
757 """ | |
758 7 15.0 2.1 0.0 intercept = coef.size == X.shape[1] + 1 | |
759 # eta = linear predictor | |
760 7 292686.0 41812.3 0.8 eta = _safe_lin_pred(X, coef) | |
761 7 71816.0 10259.4 0.2 mu = link.inverse(eta) | |
762 | |
763 # FOR TWEEDIE: sigma_inv = weights / (mu ** p) during optimization bc phi = 1 | |
764 7 61441.0 8777.3 0.2 sigma_inv = 1.0 / self.variance(mu, phi=phi, weights=weights) | |
765 7 70751.0 10107.3 0.2 d1 = link.inverse_derivative(eta) # = h'(eta) | |
766 # Alternatively: | |
767 # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g | |
768 # d1 = 1./link.derivative(mu) | |
769 7 11318.0 1616.9 0.0 d1_sigma_inv = d1 * sigma_inv | |
770 7 17883.0 2554.7 0.0 temp = d1_sigma_inv * (y - mu) | |
771 7 13.0 1.9 0.0 if intercept: | |
772 7 297231.0 42461.6 0.8 score = np.concatenate(([temp.sum()], temp @ X)) | |
773 else: | |
774 score = temp @ X | |
775 | |
776 7 12631.0 1804.4 0.0 d2_sigma_inv = d1 * d1_sigma_inv | |
777 7 13.0 1.9 0.0 if diag_fisher: | |
778 fisher_matrix = d2_sigma_inv | |
779 else: | |
780 7 36917769.0 5273967.0 97.8 fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, intercept=intercept) | |
781 7 9.0 1.3 0.0 return eta, mu, score, fisher_matrix | |
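For the Poisson family with log link used in this benchmark, the quantities computed here simplify considerably: h(eta) = exp(eta) gives h'(eta) = mu and Var(mu) = mu. A hedged sketch of the same four outputs for that special case, with phi = 1 and no intercept handling (not the fork's implementation, which stays family-generic):

import numpy as np

def poisson_eta_mu_score_fisher(coef, X, y, weights):
    # For Poisson/log-link: sigma_inv * d1 == weights and
    # d1**2 * sigma_inv == weights * mu.
    eta = X @ coef                       # linear predictor
    mu = np.exp(eta)                     # inverse of the log link
    score = (weights * (y - mu)) @ X     # gradient of the log-likelihood
    fisher = (X.T * (weights * mu)) @ X  # expected Hessian (full, not diagonal)
    return eta, mu, score, fisher

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 3))
coef = np.array([0.1, -0.2, 0.05])
y = rng.poisson(np.exp(X @ coef)).astype(float)
w = np.full(50, 1 / 50)  # weights rescaled to sum to 1, as in fit()
eta, mu, score, fisher = poisson_eta_mu_score_fisher(coef, X, y, w)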
Total time: 0.019522 s | |
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py | |
Function: _cd_cycle at line 1131 | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
1131 @profile | |
1132 def _cd_cycle( | |
1133 d, | |
1134 X, | |
1135 coef, | |
1136 score, | |
1137 fisher, | |
1138 P1, | |
1139 P2, | |
1140 n_cycles, | |
1141 inner_tol, | |
1142 max_inner_iter=1000, | |
1143 selection="cyclic", | |
1144 random_state=None, | |
1145 diag_fisher=False, | |
1146 ): | |
1147 """Compute inner loop of coordinate descent, i.e. cycles through features. | |
1148 | |
1149 Minimization of 1-d subproblems:: | |
1150 | |
1151 min_z q(d+z*e_j) - q(d) | |
1152 = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1 | |
1153 | |
1154 A = f'(w) + d*H(w) + (w+d)*P2 | |
1155 B = H+P2 | |
1156 Note: f'=-score and H=fisher are updated at the end of outer iteration. | |
1157 """ | |
1158 # TODO: use sparsity (coefficient already 0 due to L1 penalty) | |
1159 # => active set of features for featurelist, see paper | |
1160 # of Improved GLMNET or Gap Safe Screening Rules | |
1161 # https://arxiv.org/abs/1611.05780 | |
1162 7 15.0 2.1 0.1 n_samples, n_features = X.shape | |
1163 7 17.0 2.4 0.1 intercept = coef.size == X.shape[1] + 1 | |
1164 7 10.0 1.4 0.1 idx = 1 if intercept else 0 # offset if coef[0] is intercept | |
1165 # ES: This is weird. Should this be B = fisher.copy()? Why would you do this? | |
1166 7 7.0 1.0 0.0 B = fisher | |
1167 7 10.0 1.4 0.1 if P2.ndim == 1: | |
1168 coef_P2 = coef[idx:] * P2 | |
1169 if not diag_fisher: | |
1170 idiag = np.arange(start=idx, stop=B.shape[0]) | |
1171 # B[np.diag_indices_from(B)] += P2 | |
1172 B[(idiag, idiag)] += P2 | |
1173 else: | |
1174 7 1828.0 261.1 9.4 coef_P2 = coef[idx:] @ P2 | |
1175 7 12.0 1.7 0.1 if not diag_fisher: | |
1176 7 23.0 3.3 0.1 if sparse.issparse(P2): | |
1177 7 602.0 86.0 3.1 B[idx:, idx:] += P2.toarray() | |
1178 else: | |
1179 B[idx:, idx:] += P2 | |
1180 7 28.0 4.0 0.1 A = -score | |
1181 7 37.0 5.3 0.2 A[idx:] += coef_P2 | |
1182 # A += d @ (H+P2) but so far d=0 | |
1183 # inner loop | |
1184 7 21.0 3.0 0.1 for inner_iter in range(1, max_inner_iter + 1): | |
1185 7 9.0 1.3 0.0 inner_iter += 1 | |
1186 7 7.0 1.0 0.0 n_cycles += 1 | |
1187 # cycle through features, update intercept separately at the end | |
1188 7 8.0 1.1 0.0 if selection == "random": | |
1189 7 152.0 21.7 0.8 featurelist = random_state.permutation(n_features) | |
1190 else: | |
1191 featurelist = np.arange(n_features) | |
1192 756 833.0 1.1 4.3 for j in featurelist: | |
1193 # minimize_z: a z + 1/2 b z^2 + c |d+z| | |
1194 # a = A_j | |
1195 # b = B_jj > 0 | |
1196 # c = |P1_j| = P1_j > 0, see 1.3 | |
1197 # d = w_j + d_j | |
1198 # cf. https://arxiv.org/abs/0708.1485 Eqs. (3) - (4) | |
1199 # with beta = z+d, beta_hat = d-a/b and gamma = c/b | |
1200 # z = 1/b * S(bd-a,c) - d | |
1201 # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding | |
1202 749 962.0 1.3 4.9 jdx = j + idx # index for arrays containing entries for intercept | |
1203 749 851.0 1.1 4.4 a = A[jdx] | |
1204 749 688.0 0.9 3.5 if diag_fisher: | |
1205 # Note: fisher is ndarray of shape (n_samples,) => no idx | |
1206 # Calculate Bj = B[j, :] = B[:, j] as it is needed later anyway | |
1207 Bj = np.zeros_like(A) | |
1208 if intercept: | |
1209 Bj[0] = fisher.sum() | |
1210 if sparse.issparse(X): | |
1211 Bj[idx:] = _safe_toarray( | |
1212 X[:, j].transpose() @ X.multiply(fisher[:, np.newaxis]) | |
1213 ).ravel() | |
1214 else: | |
1215 Bj[idx:] = (fisher * X[:, j]) @ X | |
1216 | |
1217 if P2.ndim == 1: | |
1218 Bj[idx:] += P2[j] | |
1219 else: | |
1220 if sparse.issparse(P2): | |
1221 # slice columns as P2 is csc | |
1222 Bj[idx:] += P2[:, j].toarray().ravel() | |
1223 else: | |
1224 Bj[idx:] += P2[:, j] | |
1225 b = Bj[jdx] | |
1226 else: | |
1227 749 878.0 1.2 4.5 b = B[jdx, jdx] | |
1228 | |
1229 # those ten lines are what it is all about | |
1230 749 913.0 1.2 4.7 if b <= 0: | |
1231 z = 0 | |
1232 749 1028.0 1.4 5.3 elif P1[j] == 0: | |
1233 z = -a / b | |
1234 749 1434.0 1.9 7.3 elif a + P1[j] < b * (coef[jdx] + d[jdx]): | |
1235 z = -(a + P1[j]) / b | |
1236 749 1370.0 1.8 7.0 elif a - P1[j] > b * (coef[jdx] + d[jdx]): | |
1237 z = -(a - P1[j]) / b | |
1238 else: | |
1239 749 1096.0 1.5 5.6 z = -(coef[jdx] + d[jdx]) | |
1240 | |
1241 # update direction d | |
1242 749 1004.0 1.3 5.1 d[jdx] += z | |
1243 # update A because d_j is now d_j+z | |
1244 # A = f'(w) + d*H(w) + (w+d)*P2 | |
1245 # => A += (H+P2)*e_j z = B_j * z | |
1246 # Note: B is symmetric B = B.transpose | |
1247 749 681.0 0.9 3.5 if diag_fisher: | |
1248 # Bj = B[:, j] calculated above, still valid | |
1249 A += Bj * z | |
1250 else: | |
1251 # B is symmetric, C- or F-contiguous, but never sparse | |
1252 749 878.0 1.2 4.5 if B.flags["F_CONTIGUOUS"]: | |
1253 # slice columns like for sparse csc | |
1254 749 3088.0 4.1 15.8 A += B[:, jdx] * z | |
1255 else: # B.flags['C_CONTIGUOUS'] might be true | |
1256 # slice rows | |
1257 A += B[jdx, :] * z | |
1258 # end of cycle over features | |
1259 # update intercept | |
1260 7 7.0 1.0 0.0 if intercept: | |
1261 7 7.0 1.0 0.0 if diag_fisher: | |
1262 Bj = np.zeros_like(A) | |
1263 Bj[0] = fisher.sum() | |
1264 Bj[1:] = fisher @ X | |
1265 b = Bj[0] | |
1266 else: | |
1267 7 13.0 1.9 0.1 b = B[0, 0] | |
1268 7 15.0 2.1 0.1 z = 0 if b <= 0 else -A[0] / b | |
1269 7 11.0 1.6 0.1 d[0] += z | |
1270 7 6.0 0.9 0.0 if diag_fisher: | |
1271 A += Bj * z | |
1272 else: | |
1273 7 8.0 1.1 0.0 if B.flags["F_CONTIGUOUS"]: | |
1274 7 30.0 4.3 0.2 A += B[:, 0] * z | |
1275 else: | |
1276 A += B[0, :] * z | |
1277 # end of complete cycle | |
1278 # stopping criterion for inner loop | |
1279 # sum_i(|minimum of norm of subgrad of q(d)_i|) | |
1280 # subgrad q(d) = A + subgrad ||P1*(w+d)||_1 | |
1281 7 442.0 63.1 2.3 mn_subgrad = _min_norm_sugrad(coef=coef + d, grad=A, P2=None, P1=P1) | |
1282 7 454.0 64.9 2.3 mn_subgrad = linalg.norm(mn_subgrad, ord=1) | |
1283 7 13.0 1.9 0.1 if mn_subgrad <= inner_tol: | |
1284 7 8.0 1.1 0.0 if inner_iter == 1: | |
1285 inner_tol = inner_tol / 4.0 | |
1286 7 9.0 1.3 0.0 break | |
1287 # end of inner loop | |
1288 7 9.0 1.3 0.0 return d, coef_P2, n_cycles, inner_tol | |
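The block flagged above as "those ten lines are what it is all about" is the closed-form minimizer of the 1-d subproblem min_z a z + 1/2 b z^2 + c |w + z|, i.e. soft thresholding. A self-contained sketch of that update and its equivalent thresholding form; the names a, b, c, w follow the comments in the listing (a = A[jdx], b = B[jdx, jdx], c = P1[j], w = coef[jdx] + d[jdx]):

import numpy as np

def cd_update(a, b, c, w):
    # Closed-form minimizer of a*z + 0.5*b*z**2 + c*|w + z| over z.
    if b <= 0:
        return 0.0
    if c == 0:
        return -a / b        # unpenalized coordinate: plain Newton step
    if a + c < b * w:
        return -(a + c) / b  # optimum stays on the positive side of the kink
    if a - c > b * w:
        return -(a - c) / b  # optimum stays on the negative side
    return -w                # otherwise the L1 kink wins: snap w + z to zero

def cd_update_soft(a, b, c, w):
    # Equivalent soft-thresholding form z = S(b*w - a, c)/b - w with
    # S(t, c) = sign(t) * max(|t| - c, 0), cf. the comment block above.
    t = b * w - a
    return np.sign(t) * max(abs(t) - c, 0.0) / b - w

assert np.isclose(cd_update(0.3, 2.0, 0.5, 0.1), cd_update_soft(0.3, 2.0, 0.5, 0.1))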
Total time: 38.1615 s | |
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py | |
Function: _cd_solver at line 1291 | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
1291 @profile | |
1292 def _cd_solver( | |
1293 coef, | |
1294 X, | |
1295 y, | |
1296 weights, | |
1297 P1, | |
1298 P2, | |
1299 fit_intercept, | |
1300 family, | |
1301 link, | |
1302 max_iter=100, | |
1303 max_inner_iter=1000, | |
1304 tol=1e-4, | |
1305 selection="cyclic", | |
1306 random_state=None, | |
1307 diag_fisher=False, | |
1308 copy_X=True, | |
1309 ): | |
1310 """Solve GLM with L1 and L2 penalty by coordinate descent algorithm. | |
1311 | |
1312 The objective being minimized in the coefficients w=coef is:: | |
1313 | |
1314 F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1 | |
1315 | |
1316 Following "An Improved GLMNET for L1-regularized Logistic Regression" (see References): | |
1317 | |
1318 1. Find optimal descent direction d by minimizing | |
1319 min_d F(w+d) = min_d F(w+d) - F(w) | |
1320 2. Quadratic approximation of F(w+d)-F(w) = q(d): | |
1321 using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives: | |
1322 q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d | |
1323 + ||P1*(w+d)||_1 - ||P1*w||_1 | |
1324 Then minimize q(d): min_d q(d) | |
1325 3. Coordinate descent by updating coordinate j (d -> d+z*e_j): | |
1326 min_z q(d+z*e_j) | |
1327 = min_z q(d+z*e_j) - q(d) | |
1328 = min_z A_j z + 1/2 B_jj z^2 | |
1329 + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 | |
1330 A = f'(w) + d*H(w) + (w+d)*P2 | |
1331 B = H + P2 | |
1332 | |
1333 Repeat steps 1-3 until convergence. | |
1334 Note: Use Fisher matrix instead of Hessian for H. | |
1335 Note: f' = -score, H = Fisher matrix | |
1336 | |
1337 Parameters | |
1338 ---------- | |
1339 coef : ndarray, shape (c,) | |
1340 If fit_intercept=False, shape c=X.shape[1]. | |
1341 If fit_intercept=True, then c=X.shape[1] + 1. | |
1342 | |
1343 X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) | |
1344 Training data (with intercept included if present). If not sparse, | |
1345 pass directly as Fortran-contiguous data to avoid | |
1346 unnecessary memory duplication. | |
1347 | |
1348 y : ndarray, shape (n_samples,) | |
1349 Target values. | |
1350 | |
1351 weights: ndarray, shape (n_samples,) | |
1352 Sample weights with which the deviance is weighted. The weights must | |
1353 be normalized and sum to 1. | |
1354 | |
1355 P1 : {ndarray}, shape (n_features,) | |
1356 The L1-penalty vector (=diagonal matrix) | |
1357 | |
1358 P2 : {ndarray, csc sparse matrix}, shape (n_features, n_features) | |
1359 The L2-penalty matrix or vector (=diagonal matrix). If a matrix is | |
1360 passed, it must be symmetric. If X is sparse, P2 must also be sparse. | |
1361 | |
1362 fit_intercept : boolean, optional (default=True) | |
1363 Specifies if a constant (a.k.a. bias or intercept) should be | |
1364 added to the linear predictor (X*coef+intercept). | |
1365 | |
1366 family : ExponentialDispersionModel | |
1367 | |
1368 link : Link | |
1369 | |
1370 max_iter : int, optional (default=100) | |
1371 Maximum number of outer (Newton) iterations. | |
1372 | |
1373 max_inner_iter : int, optional (default=1000) | |
1374 Maximum number of iterations in each inner loop, i.e. max number of | |
1375 cycles over all features per inner loop. | |
1376 | |
1377 tol : float, optional (default=1e-4) | |
1378 Convergence criterion is | |
1379 sum_i(|minimum of norm of subgrad of objective_i|)<=tol. | |
1380 | |
1381 selection : str, optional (default='cyclic') | |
1382 If 'random', randomly choose features in the inner loop. | |
1383 | |
1384 random_state : {int, RandomState instance, None}, optional (default=None) | |
1385 | |
1386 diag_fisher : boolean, optional (default=False) | |
1387 ``False`` calculates full fisher matrix, ``True`` only diagonal matrix | |
1388 s.t. fisher = X.T @ diag @ X. This saves storage but needs more | |
1389 matrix-vector multiplications. | |
1390 | |
1391 copy_X : boolean, optional (default=True) | |
1392 If ``True``, X will be copied; else, it may be overwritten. | |
1393 | |
1394 Returns | |
1395 ------- | |
1396 coef : ndarray, shape (c,) | |
1397 If fit_intercept=False, shape c=X.shape[1]. | |
1398 If fit_intercept=True, then c=X.shape[1] + 1. | |
1399 | |
1400 n_iter : number of outer iterations = newton iterations | |
1401 | |
1402 n_cycles : number of cycles over features | |
1403 | |
1404 References | |
1405 ---------- | |
1406 Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin | |
1407 An Improved GLMNET for L1-regularized Logistic Regression, | |
1408 Journal of Machine Learning Research 13 (2012) 1999-2030 | |
1409 https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf | |
1410 """ | |
1411 1 3.0 3.0 0.0 if type(X) is not ColScaledSpMat: | |
1412 # TODO: because copy_X is being passed here and is also being passed in | |
1413 # check_X_y in fit(...), X is being copied twice. Check if this is true. | |
1414 X = check_array( | |
1415 X, "csc", dtype=[np.float64, np.float32], order="F", copy=copy_X | |
1416 ) | |
1417 1 3.0 3.0 0.0 if P2.ndim == 2: | |
1418 2 457.0 228.5 0.0 P2 = check_array( | |
1419 1 4.0 4.0 0.0 P2, "csc", dtype=[np.float64, np.float32], order="F", copy=copy_X | |
1420 ) | |
1421 1 5.0 5.0 0.0 if sparse.issparse(X): | |
1422 if not sparse.isspmatrix_csc(P2): | |
1423 raise ValueError( | |
1424 "If X is sparse, P2 must also be sparse csc" | |
1425 "format. Got P2 not sparse." | |
1426 ) | |
1427 1 12.0 12.0 0.0 random_state = check_random_state(random_state) | |
1428 # Note: we already set P2 = l2*P2, P1 = l1*P1 | |
1429 # Note: we already symmetrized P2 = 1/2 (P2 + P2') | |
1430 1 3.0 3.0 0.0 n_iter = 0 # number of outer iterations | |
1431 1 2.0 2.0 0.0 n_cycles = 0 # number of (complete) cycles over features | |
1432 1 1.0 1.0 0.0 converged = False | |
1433 1 2.0 2.0 0.0 n_samples, n_features = X.shape | |
1434 1 2.0 2.0 0.0 idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept | |
1435 # line search parameters | |
1436 1 2.0 2.0 0.0 (beta, sigma) = (0.5, 0.01) | |
1437 # some precalculations | |
1438 # Note: For diag_fisher=False, fisher = X.T @ fisher @ X and fisher is a | |
1439 # 1d array representing a diagonal matrix. | |
1440 2 5409710.0 2704855.0 14.2 eta, mu, score, fisher = family._eta_mu_score_fisher( | |
1441 1 2.0 2.0 0.0 coef=coef, phi=1, X=X, y=y, weights=weights, link=link, diag_fisher=diag_fisher | |
1442 ) | |
1443 # set up space for search direction d for inner loop | |
1444 1 31.0 31.0 0.0 d = np.zeros_like(coef) | |
1445 | |
1446 # minimum subgradient norm | |
1447 1 3.0 3.0 0.0 def calc_mn_subgrad_norm(): | |
1448 return linalg.norm( | |
1449 _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1), ord=1 | |
1450 ) | |
1451 | |
1452 # the ratio of inner _cd_cycle tolerance to the minimum subgradient norm | |
1453 # This wasn't explored in the newGLMNET paper linked above. | |
1454 # That paper essentially uses inner_tol_ratio = 1.0, but using a slightly | |
1455 # lower value is much faster. | |
1456 # By comparison, the original GLMNET paper uses inner_tol = tol. | |
1457 # So, inner_tol_ratio < 1 is sort of a compromise between the two papers. | |
1458 # The value should probably be between 0.01 and 0.5. 0.1 works well for many problems | |
1459 1 2.0 2.0 0.0 inner_tol_ratio = 0.1 | |
1460 | |
1461 1 2.0 2.0 0.0 def calc_inner_tol(mn_subgrad_norm): | |
1462 # Another potential rule limits the inner tol to be no smaller than tol | |
1463 # return max(mn_subgrad_norm * inner_tol_ratio, tol) | |
1464 return mn_subgrad_norm * inner_tol_ratio | |
1465 | |
1466 # initial stopping tolerance of inner loop | |
1467 # use L1-norm of minimum of norm of subgradient of F | |
1468 1 417.0 417.0 0.0 inner_tol = calc_inner_tol(calc_mn_subgrad_norm()) | |
1469 | |
1470 1 2.0 2.0 0.0 Fw = None | |
1471 | |
1472 # outer loop | |
1473 6 12.0 2.0 0.0 while n_iter < max_iter: | |
1474 6 2301.0 383.5 0.0 print(inner_tol, n_iter, n_cycles) | |
1475 6 19.0 3.2 0.0 n_iter += 1 | |
1476 # initialize search direction d (to be optimized) with zero | |
1477 6 35.0 5.8 0.0 d.fill(0) | |
1478 # inner loop = _cd_cycle | |
1479 12 27764.0 2313.7 0.1 d, coef_P2, n_cycles, inner_tol = _cd_cycle( | |
1480 6 13.0 2.2 0.0 d, | |
1481 6 13.0 2.2 0.0 X, | |
1482 6 14.0 2.3 0.0 coef, | |
1483 6 13.0 2.2 0.0 score, | |
1484 6 13.0 2.2 0.0 fisher, | |
1485 6 14.0 2.3 0.0 P1, | |
1486 6 13.0 2.2 0.0 P2, | |
1487 6 13.0 2.2 0.0 n_cycles, | |
1488 6 13.0 2.2 0.0 inner_tol, | |
1489 6 15.0 2.5 0.0 max_inner_iter=max_inner_iter, | |
1490 6 15.0 2.5 0.0 selection=selection, | |
1491 6 15.0 2.5 0.0 random_state=random_state, | |
1492 6 13.0 2.2 0.0 diag_fisher=diag_fisher, | |
1493 ) | |
1494 | |
1495 # line search by sequence beta^k, k=0, 1, .. | |
1496 # F(w + lambda d) - F(w) <= lambda * bound | |
1497 # bound = sigma * (f'(w)*d + w*P2*d | |
1498 # +||P1 (w+d)||_1 - ||P1 w||_1) | |
1499 6 247.0 41.2 0.0 P1w_1 = linalg.norm(P1 * coef[idx:], ord=1) | |
1500 6 202.0 33.7 0.0 P1wd_1 = linalg.norm(P1 * (coef + d)[idx:], ord=1) | |
1501 # Note: coef_P2 already calculated and still valid | |
1502 6 86.0 14.3 0.0 bound = sigma * (-(score @ d) + coef_P2 @ d[idx:] + P1wd_1 - P1w_1) | |
1503 | |
1504 # In the first iteration, we must compute Fw explicitly. | |
1505 # In later iterations, we just use Fwd from the previous iteration | |
1506 # as set after the line search loop below. | |
1507 6 12.0 2.0 0.0 if Fw is None: | |
1508 1 2.0 2.0 0.0 Fw = ( | |
1509 3 8018.0 2672.7 0.0 0.5 * family.deviance(y, mu, weights) | |
1510 1 17.0 17.0 0.0 + 0.5 * (coef_P2 @ coef[idx:]) | |
1511 1 2.0 2.0 0.0 + P1w_1 | |
1512 ) | |
1513 | |
1514 6 25.0 4.2 0.0 la = 1.0 / beta | |
1515 | |
1516 6 247723.0 41287.2 0.6 X_dot_d = _safe_lin_pred(X, d) | |
1517 | |
1518 # Try progressively shorter line search steps. | |
1519 6 49.0 8.2 0.0 for k in range(20): | |
1520 6 19.0 3.2 0.0 la *= beta # starts with la=1 | |
1521 6 76.0 12.7 0.0 coef_wd = coef + la * d | |
1522 | |
1523 # The simple version of the next line is: | |
1524 # mu_wd = link.inverse(_safe_lin_pred(X, coef_wd)) | |
1525 # but because coef_wd can be factored as | |
1526 # coef_wd = coef + la * d | |
1527 # we can rewrite to only perform one dot product with the data | |
1528 # matrix per loop which is substantially faster | |
1529 6 69558.0 11593.0 0.2 mu_wd = link.inverse(eta + la * X_dot_d) | |
1530 | |
1531 # TODO - optimize: for Tweedie that isn't one of the special cases | |
1532 # (gaussian, poisson, gamma), family.deviance is quite slow! Can we | |
1533 # fix that somehow? | |
1534 12 46967.0 3913.9 0.1 Fwd = 0.5 * family.deviance(y, mu_wd, weights) + linalg.norm( | |
1535 6 61.0 10.2 0.0 P1 * coef_wd[idx:], ord=1 | |
1536 ) | |
1537 6 20.0 3.3 0.0 if P2.ndim == 1: | |
1538 Fwd += 0.5 * ((coef_wd[idx:] * P2) @ coef_wd[idx:]) | |
1539 else: | |
1540 6 514.0 85.7 0.0 Fwd += 0.5 * (coef_wd[idx:] @ (P2 @ coef_wd[idx:])) | |
1541 6 23.0 3.8 0.0 if Fwd - Fw <= sigma * la * bound: | |
1542 6 12.0 2.0 0.0 break | |
1543 | |
1544 # Fw in the next iteration will be equal to Fwd this iteration. | |
1545 6 13.0 2.2 0.0 Fw = Fwd | |
1546 | |
1547 # update coefficients | |
1548 6 77.0 12.8 0.0 coef += la * d | |
1549 | |
1550 # calculate eta, mu, score, Fisher matrix for next iteration | |
1551 12 32344270.0 2695355.8 84.8 eta, mu, score, fisher = family._eta_mu_score_fisher( | |
1552 6 12.0 2.0 0.0 coef=coef, | |
1553 6 11.0 1.8 0.0 phi=1, | |
1554 6 12.0 2.0 0.0 X=X, | |
1555 6 12.0 2.0 0.0 y=y, | |
1556 6 12.0 2.0 0.0 weights=weights, | |
1557 6 12.0 2.0 0.0 link=link, | |
1558 6 12.0 2.0 0.0 diag_fisher=diag_fisher, | |
1559 ) | |
1560 | |
1561 # stopping criterion for outer loop | |
1562 # sum_i(|minimum-norm of subgrad of F(w)_i|) | |
1563 # fp_wP2 = f'(w) + w*P2 | |
1564 # Note: eta, mu and score are already updated | |
1565 # this also updates the inner tolerance for the next loop! | |
1566 6 2366.0 394.3 0.0 mn_subgrad_norm = calc_mn_subgrad_norm() | |
1567 6 24.0 4.0 0.0 if mn_subgrad_norm <= tol: | |
1568 1 2.0 2.0 0.0 converged = True | |
1569 1 2.0 2.0 0.0 break | |
1570 | |
1571 5 18.0 3.6 0.0 inner_tol = calc_inner_tol(mn_subgrad_norm) | |
1572 # end of outer loop | |
1573 | |
1574 1 2.0 2.0 0.0 if not converged: | |
1575 warnings.warn( | |
1576 "Coordinate descent failed to converge. Increase" | |
1577 " the maximum number of iterations max_iter" | |
1578 " (currently {})".format(max_iter), | |
1579 ConvergenceWarning, | |
1580 ) | |
1581 | |
1582 1 2.0 2.0 0.0 return coef, n_iter, n_cycles | |
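The outer loop above is a proximal Newton method: each iteration builds a quadratic model, solves it by coordinate descent, and then backtracks along the resulting direction. A minimal sketch of that Armijo-style line search on a toy penalized objective; here sigma is applied once in the acceptance test, whereas the listing folds it into `bound` as well:

import numpy as np

def backtracking_step(F, w, d, bound, beta=0.5, sigma=0.01, max_halvings=20):
    # Accept the largest step la in {1, beta, beta**2, ...} satisfying
    # F(w + la*d) - F(w) <= sigma * la * bound, where
    # bound = f'(w) @ d + w @ P2 @ d + ||P1*(w+d)||_1 - ||P1*w||_1
    # (negative for a descent direction d).
    Fw = F(w)
    la = 1.0
    for _ in range(max_halvings):
        if F(w + la * d) - Fw <= sigma * la * bound:
            break
        la *= beta
    return la

# Toy penalized objective F(w) = 0.5*||w - 2||^2 + ||w||_1, minimized at w = 1.
F = lambda w: 0.5 * np.sum((w - 2.0) ** 2) + np.sum(np.abs(w))
w, d = np.zeros(3), np.ones(3)                  # step from 0 toward the optimum
bound = (w - 2.0) @ d + np.sum(np.abs(w + d)) - np.sum(np.abs(w))
print(backtracking_step(F, w, d, bound))        # accepts la = 1.0 here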
Total time: 44.3836 s | |
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py | |
Function: fit at line 1884 | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
1884 @profile | |
1885 def fit(self, X, y, sample_weight=None): | |
1886 """Fit a Generalized Linear Model. | |
1887 | |
1888 Parameters | |
1889 ---------- | |
1890 X : {array-like, sparse matrix}, shape (n_samples, n_features) | |
1891 Training data. | |
1892 | |
1893 y : array-like, shape (n_samples,) | |
1894 Target values. | |
1895 | |
1896 sample_weight : {None, array-like}, shape (n_samples,),\ | |
1897 optional (default=None) | |
1898 Individual weights w_i for each sample. Note that for an | |
1899 Exponential Dispersion Model (EDM), one has | |
1900 Var[Y_i]=phi/w_i * v(mu). | |
1901 If Y_i ~ EDM(mu, phi/w_i), then | |
1902 sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a | |
1903 weighted average with weights=sample_weight. | |
1904 | |
1905 Returns | |
1906 ------- | |
1907 self : returns an instance of self. | |
1908 """ | |
1909 # 99.8% of time is in the _cd_solver function | |
1910 ####################################################################### | |
1911 # 1. input validation # | |
1912 ####################################################################### | |
1913 # 1.1 validate arguments of __init__ ################################## | |
1914 # Guarantee that self._family_instance is an instance of class | |
1915 # ExponentialDispersionModel | |
1916 1 57.0 57.0 0.0 if isinstance(self.family, ExponentialDispersionModel): | |
1917 self._family_instance = self.family | |
1918 else: | |
1919 1 5.0 5.0 0.0 if self.family == "normal": | |
1920 self._family_instance = NormalDistribution() | |
1921 1 5.0 5.0 0.0 elif self.family == "poisson": | |
1922 1 25.0 25.0 0.0 self._family_instance = PoissonDistribution() | |
1923 elif self.family == "gamma": | |
1924 self._family_instance = GammaDistribution() | |
1925 elif self.family == "inverse.gaussian": | |
1926 self._family_instance = InverseGaussianDistribution() | |
1927 elif self.family == "binomial": | |
1928 self._family_instance = BinomialDistribution() | |
1929 else: | |
1930 raise ValueError( | |
1931 "The family must be an instance of class" | |
1932 " ExponentialDispersionModel or an element of" | |
1933 " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " | |
1934 "'binomial']; got (family={})".format(self.family) | |
1935 ) | |
1936 | |
1937 # Guarantee that self._link_instance is set to an instance of | |
1938 # class Link | |
1939 1 15.0 15.0 0.0 if isinstance(self.link, Link): | |
1940 self._link_instance = self.link | |
1941 else: | |
1942 1 5.0 5.0 0.0 if self.link == "auto": | |
1943 1 7.0 7.0 0.0 if isinstance(self._family_instance, TweedieDistribution): | |
1944 1 6.0 6.0 0.0 if self._family_instance.power <= 0: | |
1945 self._link_instance = IdentityLink() | |
1946 1 5.0 5.0 0.0 if self._family_instance.power >= 1: | |
1947 1 8.0 8.0 0.0 self._link_instance = LogLink() | |
1948 elif isinstance(self._family_instance, GeneralizedHyperbolicSecant): | |
1949 self._link_instance = IdentityLink() | |
1950 elif isinstance(self._family_instance, BinomialDistribution): | |
1951 self._link_instance = LogitLink() | |
1952 else: | |
1953 raise ValueError( | |
1954 "No default link known for the " | |
1955 "specified distribution family. Please " | |
1956 "set link manually, i.e. not to 'auto'; " | |
1957 "got (link='auto', family={}".format(self.family) | |
1958 ) | |
1959 elif self.link == "identity": | |
1960 self._link_instance = IdentityLink() | |
1961 elif self.link == "log": | |
1962 self._link_instance = LogLink() | |
1963 elif self.link == "logit": | |
1964 self._link_instance = LogitLink() | |
1965 else: | |
1966 raise ValueError( | |
1967 "The link must be an instance of class Link or " | |
1968 "an element of ['auto', 'identity', 'log', 'logit']; " | |
1969 "got (link={})".format(self.link) | |
1970 ) | |
1971 | |
1972 1 14.0 14.0 0.0 if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: | |
1973 raise ValueError( | |
1974 "Penalty term must be a non-negative number;" | |
1975 " got (alpha={})".format(self.alpha) | |
1976 ) | |
1977 3 15.0 5.0 0.0 if ( | |
1978 1 5.0 5.0 0.0 not isinstance(self.l1_ratio, numbers.Number) | |
1979 1 5.0 5.0 0.0 or self.l1_ratio < 0 | |
1980 1 5.0 5.0 0.0 or self.l1_ratio > 1 | |
1981 ): | |
1982 raise ValueError( | |
1983 "l1_ratio must be a number in interval [0, 1];" | |
1984 " got (l1_ratio={})".format(self.l1_ratio) | |
1985 ) | |
1986 1 5.0 5.0 0.0 if not isinstance(self.fit_intercept, bool): | |
1987 raise ValueError( | |
1988 "The argument fit_intercept must be bool;" | |
1989 " got {}".format(self.fit_intercept) | |
1990 ) | |
1991 1 5.0 5.0 0.0 if self.solver not in ["auto", "irls", "lbfgs", "newton-cg", "cd"]: | |
1992 raise ValueError( | |
1993 "GeneralizedLinearRegressor supports only solvers" | |
1994 " 'auto', 'irls', 'lbfgs', 'newton-cg' and 'cd';" | |
1995 " got {}".format(self.solver) | |
1996 ) | |
1997 1 4.0 4.0 0.0 solver = self.solver | |
1998 1 5.0 5.0 0.0 if self.solver == "auto": | |
1999 1 5.0 5.0 0.0 if self.l1_ratio == 0: | |
2000 solver = "irls" | |
2001 else: | |
2002 1 5.0 5.0 0.0 solver = "cd" | |
2003 1 5.0 5.0 0.0 if self.alpha > 0 and self.l1_ratio > 0 and solver not in ["cd"]: | |
2004 raise ValueError( | |
2005 "The chosen solver (solver={}) can't deal " | |
2006 "with L1 penalties, which are included with " | |
2007 "(alpha={}) and (l1_ratio={}).".format( | |
2008 solver, self.alpha, self.l1_ratio | |
2009 ) | |
2010 ) | |
2011 1 6.0 6.0 0.0 if not isinstance(self.max_iter, int) or self.max_iter <= 0: | |
2012 raise ValueError( | |
2013 "Maximum number of iteration must be a positive " | |
2014 "integer;" | |
2015 " got (max_iter={!r})".format(self.max_iter) | |
2016 ) | |
2017 1 6.0 6.0 0.0 if not isinstance(self.tol, numbers.Number) or self.tol <= 0: | |
2018 raise ValueError( | |
2019 "Tolerance for stopping criteria must be " | |
2020 "positive; got (tol={!r})".format(self.tol) | |
2021 ) | |
2022 1 5.0 5.0 0.0 if not isinstance(self.warm_start, bool): | |
2023 raise ValueError( | |
2024 "The argument warm_start must be bool;" | |
2025 " got {}".format(self.warm_start) | |
2026 ) | |
2027 1 4.0 4.0 0.0 if self.selection not in ["cyclic", "random"]: | |
2028 raise ValueError( | |
2029 "The argument selection must be 'cyclic' or " | |
2030 "'random'; got (selection={})".format(self.selection) | |
2031 ) | |
2032 1 379.0 379.0 0.0 random_state = check_random_state(self.random_state) | |
2033 1 6.0 6.0 0.0 if not isinstance(self.diag_fisher, bool): | |
2034 raise ValueError( | |
2035 "The argument diag_fisher must be bool;" | |
2036 " got {}".format(self.diag_fisher) | |
2037 ) | |
2038 1 5.0 5.0 0.0 if not isinstance(self.copy_X, bool): | |
2039 raise ValueError( | |
2040 "The argument copy_X must be bool;" " got {}".format(self.copy_X) | |
2041 ) | |
2042 1 5.0 5.0 0.0 if not isinstance(self.check_input, bool): | |
2043 raise ValueError( | |
2044 "The argument check_input must be bool; got " | |
2045 "(check_input={})".format(self.check_input) | |
2046 ) | |
2047 | |
2048 1 5.0 5.0 0.0 family = self._family_instance | |
2049 1 5.0 5.0 0.0 link = self._link_instance | |
2050 | |
2051 # 1.2 validate arguments of fit ####################################### | |
2052 1 6.0 6.0 0.0 _dtype = [np.float64, np.float32] | |
2053 1 5.0 5.0 0.0 if solver == "cd": | |
2054 1 4.0 4.0 0.0 _stype = ["csc"] | |
2055 else: | |
2056 _stype = ["csc", "csr"] | |
2057 2 122584.0 61292.0 0.3 X, y = check_X_y( | |
2058 1 4.0 4.0 0.0 X, | |
2059 1 5.0 5.0 0.0 y, | |
2060 1 5.0 5.0 0.0 accept_sparse=_stype, | |
2061 1 4.0 4.0 0.0 dtype=_dtype, | |
2062 1 5.0 5.0 0.0 y_numeric=True, | |
2063 1 4.0 4.0 0.0 multi_output=False, | |
2064 1 5.0 5.0 0.0 copy=self.copy_X, | |
2065 ) | |
2066 # Without converting y to float, deviance might raise | |
2067 # ValueError: Integers to negative integer powers are not allowed. | |
2068 # Also, y must not be sparse. | |
2069 1 10.0 10.0 0.0 y = np.asarray(y, dtype=np.float64) | |
2070 | |
2071 1 1781.0 1781.0 0.0 weights = _check_weights(sample_weight, y.shape[0]) | |
2072 | |
2073 1 9.0 9.0 0.0 n_samples, n_features = X.shape | |
2074 | |
2075 # 1.3 arguments to take special care ################################## | |
2076 # P1, P2, start_params | |
2077 1 7.0 7.0 0.0 if isinstance(self.P1, str) and self.P1 == "identity": | |
2078 1 26.0 26.0 0.0 P1 = np.ones(n_features) | |
2079 else: | |
2080 P1 = np.atleast_1d(self.P1) | |
2081 try: | |
2082 P1 = P1.astype(np.float64, casting="safe", copy=False) | |
2083 except TypeError: | |
2084 raise TypeError( | |
2085 "The given P1 cannot be converted to a numeric" | |
2086 "array; got (P1.dtype={}).".format(P1.dtype) | |
2087 ) | |
2088 if (P1.ndim != 1) or (P1.shape[0] != n_features): | |
2089 raise ValueError( | |
2090 "P1 must be either 'identity' or a 1d array " | |
2091 "with the length of X.shape[1]; " | |
2092 "got (P1.shape[0]={}), " | |
2093 "needed (X.shape[1]={}).".format(P1.shape[0], n_features) | |
2094 ) | |
2095 # If X is sparse, make P2 sparse, too. | |
2096 1 5.0 5.0 0.0 if isinstance(self.P2, str) and self.P2 == "identity": | |
2097 1 7.0 7.0 0.0 if sparse.issparse(X): | |
2098 1 281.0 281.0 0.0 P2 = ( | |
2099 2 173.0 86.5 0.0 sparse.dia_matrix( | |
2100 1 10.0 10.0 0.0 (np.ones(n_features), 0), shape=(n_features, n_features) | |
2101 ) | |
2102 ).tocsc() | |
2103 else: | |
2104 P2 = np.ones(n_features) | |
2105 else: | |
2106 P2 = check_array( | |
2107 self.P2, copy=True, accept_sparse=_stype, dtype=_dtype, ensure_2d=False | |
2108 ) | |
2109 if P2.ndim == 1: | |
2110 P2 = np.asarray(P2) | |
2111 if P2.shape[0] != n_features: | |
2112 raise ValueError( | |
2113 "P2 should be a 1d array of shape " | |
2114 "(n_features,) with " | |
2115 "n_features=X.shape[1]; " | |
2116 "got (P2.shape=({},)), needed ({},)".format( | |
2117 P2.shape[0], X.shape[1] | |
2118 ) | |
2119 ) | |
2120 if sparse.issparse(X): | |
2121 P2 = ( | |
2122 sparse.dia_matrix((P2, 0), shape=(n_features, n_features)) | |
2123 ).tocsc() | |
2124 elif ( | |
2125 P2.ndim == 2 | |
2126 and P2.shape[0] == P2.shape[1] | |
2127 and P2.shape[0] == X.shape[1] | |
2128 ): | |
2129 if sparse.issparse(X): | |
2130 P2 = sparse.csc_matrix(P2) | |
2131 else: | |
2132 raise ValueError( | |
2133 "P2 must be either None or an array of shape " | |
2134 "(n_features, n_features) with " | |
2135 "n_features=X.shape[1]; " | |
2136 "got (P2.shape=({0}, {1})), needed ({2}, {2})".format( | |
2137 P2.shape[0], P2.shape[1], X.shape[1] | |
2138 ) | |
2139 ) | |
2140 | |
2141 1 6.0 6.0 0.0 start_params = self.start_params | |
2142 1 6.0 6.0 0.0 if isinstance(start_params, str): | |
2143 1 5.0 5.0 0.0 if start_params not in ["guess", "zero"]: | |
2144 raise ValueError( | |
2145 "The argument start_params must be 'guess', " | |
2146 "'zero' or an array of correct length; " | |
2147 "got(start_params={})".format(start_params) | |
2148 ) | |
2149 else: | |
2150 start_params = check_array( | |
2151 start_params, | |
2152 accept_sparse=False, | |
2153 force_all_finite=True, | |
2154 ensure_2d=False, | |
2155 dtype=_dtype, | |
2156 copy=True, | |
2157 ) | |
2158 if (start_params.shape[0] != X.shape[1] + self.fit_intercept) or ( | |
2159 start_params.ndim != 1 | |
2160 ): | |
2161 raise ValueError( | |
2162 "Start values for parameters must have the" | |
2163 "right length and dimension; required (length" | |
2164 "={}, ndim=1); got (length={}, ndim={}).".format( | |
2165 X.shape[1] + self.fit_intercept, | |
2166 start_params.shape[0], | |
2167 start_params.ndim, | |
2168 ) | |
2169 ) | |
2170 | |
2171 1 5.0 5.0 0.0 l1 = self.alpha * self.l1_ratio | |
2172 1 6.0 6.0 0.0 l2 = self.alpha * (1 - self.l1_ratio) | |
2173 # P1 and P2 are now for sure copies | |
2174 1 8.0 8.0 0.0 P1 = l1 * P1 | |
2175 1 146.0 146.0 0.0 P2 = l2 * P2 | |
2176 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') | |
2177 # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric | |
2178 1 5.0 5.0 0.0 if P2.ndim == 2: | |
2179 1 6.0 6.0 0.0 if sparse.issparse(P2): | |
2180 1 6.0 6.0 0.0 if sparse.isspmatrix_csc(P2): | |
2181 1 811.0 811.0 0.0 P2 = 0.5 * (P2 + P2.transpose()).tocsc() | |
2182 else: | |
2183 P2 = 0.5 * (P2 + P2.transpose()).tocsr() | |
2184 else: | |
2185 P2 = 0.5 * (P2 + P2.T) | |
2186 | |
2187 # For coordinate descent, if X is sparse, P2 must also be csc | |
2188 1 7.0 7.0 0.0 if solver == "cd" and sparse.issparse(X): | |
2189 1 80.0 80.0 0.0 P2 = sparse.csc_matrix(P2) | |
2190 | |
2191 # 1.4 additional validations ########################################## | |
2192 1 5.0 5.0 0.0 if self.check_input: | |
2193 1 1319.0 1319.0 0.0 if not np.all(family.in_y_range(y)): | |
2194 raise ValueError( | |
2195 "Some value(s) of y are out of the valid " | |
2196 "range for family {}".format(family.__class__.__name__) | |
2197 ) | |
2198 # check if P1 has only non-negative values, negative values might | |
2199 # indicate group lasso in the future. | |
2200 1 6.0 6.0 0.0 if not isinstance(self.P1, str): # if self.P1 != 'identity': | |
2201 if not np.all(P1 >= 0): | |
2202 raise ValueError("P1 must not have negative values.") | |
2203 # check if P2 is positive semidefinite | |
2204 # np.linalg.cholesky(P2) 'only' asserts positive definite | |
2205 1 5.0 5.0 0.0 if not isinstance(self.P2, str): # self.P2 != 'identity' | |
2206 # due to numerical precision, we allow eigenvalues to be a | |
2207 # tiny bit negative | |
2208 epsneg = -10 * np.finfo(P2.dtype).epsneg | |
2209 if P2.ndim == 1 or P2.shape[0] == 1: | |
2210 p2 = P2 | |
2211 if sparse.issparse(P2): | |
2212 p2 = P2.toarray() | |
2213 if not np.all(p2 >= 0): | |
2214 raise ValueError( | |
2215 "1d array P2 must not have negative " "values." | |
2216 ) | |
2217 elif sparse.issparse(P2): | |
2218 # for sparse matrices, not all eigenvals can be computed | |
2219 # efficiently, use only half of n_features | |
2220 # k = how many eigenvals to compute | |
2221 k = np.min([10, n_features // 10 + 1]) | |
2222 sigma = -1000 * epsneg # start searching near this value | |
2223 which = "SA" # find smallest algebraic eigenvalues first | |
2224 eigenvalues = splinalg.eigsh( | |
2225 P2, k=k, sigma=sigma, which=which, return_eigenvectors=False | |
2226 ) | |
2227 if not np.all(eigenvalues >= epsneg): | |
2228 raise ValueError("P2 must be positive semi-definite.") | |
2229 else: | |
2230 if not np.all(linalg.eigvalsh(P2) >= epsneg): | |
2231 raise ValueError("P2 must be positive semi-definite.") | |
2232 # TODO: if alpha=0 check that X is not rank deficient | |
2233 # TODO: what else to check? | |
2234 | |
2235 ####################################################################### | |
2236 # 2a. rescaling of weights (sample_weight) # | |
2237 ####################################################################### | |
2238 # IMPORTANT NOTE: Since we want to minimize | |
2239 # 1/(2*sum(sample_weight)) * deviance + L1 + L2, | |
2240 # deviance = sum(sample_weight * unit_deviance), | |
2241 # we rescale weights such that sum(weights) = 1 and this becomes | |
2242 # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) | |
2243 1 428.0 428.0 0.0 weights_sum = np.sum(weights) | |
2244 1 1972.0 1972.0 0.0 weights = weights / weights_sum | |
2245 | |
2246 ####################################################################### | |
2247 # 2b. potentially rescale predictors | |
2248 ####################################################################### | |
2249 1 7.0 7.0 0.0 if self.standardize: | |
2250 1 479789.0 479789.0 1.1 X, P1, P2, col_means, col_stds = _standardize(X, P1, P2, weights) | |
2251 | |
2252 ####################################################################### | |
2253 # 3. initialization of coef = (intercept_, coef_) # | |
2254 ####################################################################### | |
2255 # Note: Since phi=self.dispersion_ does not enter the estimation | |
2256 # of mu_i=E[y_i], set it to 1. | |
2257 | |
2258 # set start values for coef | |
2259 1 26.0 26.0 0.0 if self.warm_start and hasattr(self, "coef_"): | |
2260 coef = self.coef_ | |
2261 intercept = self.intercept_ | |
2262 if self.fit_intercept: | |
2263 coef = np.concatenate((np.array([intercept]), coef)) | |
2264 if self.standardize: | |
2265 _standardize_warm_start(coef, col_means, col_stds) | |
2266 1 7.0 7.0 0.0 elif isinstance(start_params, str): | |
2267 1 5.0 5.0 0.0 if start_params == "guess": | |
2268 # Set mu=starting_mu of the family and do one Newton step | |
2269 # If solver=cd use cd, else irls | |
2270 1 4731.0 4731.0 0.0 mu = family.starting_mu(y, weights=weights) | |
2271 1 12563.0 12563.0 0.0 eta = link.link(mu) # linear predictor | |
2272 1 8.0 8.0 0.0 if solver in ["cd", "lbfgs", "newton-cg"]: | |
2273 # see function _cd_solver | |
2274 1 10930.0 10930.0 0.0 sigma_inv = 1 / family.variance(mu, phi=1, weights=weights) | |
2275 1 10093.0 10093.0 0.0 d1 = link.inverse_derivative(eta) | |
2276 1 5497.0 5497.0 0.0 temp = sigma_inv * d1 * (y - mu) | |
2277 1 8.0 8.0 0.0 if self.fit_intercept: | |
2278 1 42518.0 42518.0 0.1 score = np.concatenate(([temp.sum()], temp @ X)) | |
2279 else: | |
2280 score = temp @ X # same as X.T @ temp | |
2281 | |
2282 1 1980.0 1980.0 0.0 d2_sigma_inv = d1 * d1 * sigma_inv | |
2283 1 7.0 7.0 0.0 diag_fisher = self.diag_fisher | |
2284 1 5.0 5.0 0.0 if diag_fisher: | |
2285 fisher = d2_sigma_inv | |
2286 else: | |
2287 2 5307565.0 2653782.5 12.0 fisher = _safe_sandwich_dot( | |
2288 1 6.0 6.0 0.0 X, d2_sigma_inv, intercept=self.fit_intercept | |
2289 ) | |
2290 # set up space for search direction d for inner loop | |
2291 1 8.0 8.0 0.0 if self.fit_intercept: | |
2292 1 12.0 12.0 0.0 coef = np.zeros(n_features + 1) | |
2293 else: | |
2294 coef = np.zeros(n_features) | |
2295 1 43.0 43.0 0.0 d = np.zeros_like(coef) | |
2296 # initial stopping tolerance of inner loop | |
2297 # use L1-norm of minimum of norm of subgradient of F | |
2298 # use less restrictive tolerance for initial guess | |
2299 1 340.0 340.0 0.0 inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) | |
2300 1 84.0 84.0 0.0 inner_tol = 4 * linalg.norm(inner_tol, ord=1) | |
2301 # just one outer loop = Newton step | |
2302 1 6.0 6.0 0.0 n_cycles = 0 | |
2303 2 4408.0 2204.0 0.0 d, coef_P2, n_cycles, inner_tol = _cd_cycle( | |
2304 1 5.0 5.0 0.0 d, | |
2305 1 6.0 6.0 0.0 X, | |
2306 1 5.0 5.0 0.0 coef, | |
2307 1 5.0 5.0 0.0 score, | |
2308 1 5.0 5.0 0.0 fisher, | |
2309 1 6.0 6.0 0.0 P1, | |
2310 1 6.0 6.0 0.0 P2, | |
2311 1 6.0 6.0 0.0 n_cycles, | |
2312 1 5.0 5.0 0.0 inner_tol, | |
2313 1 5.0 5.0 0.0 max_inner_iter=1000, | |
2314 1 6.0 6.0 0.0 selection=self.selection, | |
2315 1 6.0 6.0 0.0 random_state=random_state, | |
2316 1 6.0 6.0 0.0 diag_fisher=self.diag_fisher, | |
2317 ) | |
2318 1 10.0 10.0 0.0 coef += d # for simplicity no line search here | |
2319 else: | |
2320 # See _irls_solver | |
2321 # h'(eta) | |
2322 hp = link.inverse_derivative(eta) | |
2323 # working weights W, in principle a diagonal matrix | |
2324 # therefore here just as 1d array | |
2325 W = hp ** 2 / family.variance(mu, phi=1, weights=weights) | |
2326 # working observations | |
2327 z = eta + (y - mu) / hp | |
2328 # solve A*coef = b | |
2329 # A = X' W X + l2 P2, b = X' W z | |
2330 coef = _irls_step(X, W, P2, z, fit_intercept=self.fit_intercept) | |
2331 else: # start_params == 'zero' | |
2332 if self.fit_intercept: | |
2333 coef = np.zeros(n_features + 1) | |
2334 coef[0] = link.link(np.average(y, weights=weights)) | |
2335 else: | |
2336 coef = np.zeros(n_features) | |
2337 else: # assign given array as start values | |
2338 coef = start_params | |
2339 if self.standardize: | |
2340 _standardize_warm_start(coef, col_means, col_stds) | |
2341 | |
2342 ####################################################################### | |
2343 # 4. fit # | |
2344 ####################################################################### | |
2345 # algorithms for optimization | |
2346 # TODO: Parallelize it? | |
2347 | |
2348 # 4.1 IRLS ############################################################ | |
2349 # Note: we already set P2 = l2*P2, see above | |
2350 # Note: we already symmetrized P2 = 1/2 (P2 + P2') | |
2351 1 6.0 6.0 0.0 if solver == "irls": | |
2352 coef, self.n_iter_ = _irls_solver( | |
2353 coef=coef, | |
2354 X=X, | |
2355 y=y, | |
2356 weights=weights, | |
2357 P2=P2, | |
2358 fit_intercept=self.fit_intercept, | |
2359 family=family, | |
2360 link=link, | |
2361 max_iter=self.max_iter, | |
2362 tol=self.tol, | |
2363 ) | |
2364 | |
2365 # 4.2 L-BFGS ########################################################## | |
2366 1 5.0 5.0 0.0 elif solver == "lbfgs": | |
2367 | |
2368 def func(coef, X, y, weights, P2, family, link): | |
2369 mu, devp = family._mu_deviance_derivative(coef, X, y, weights, link) | |
2370 dev = family.deviance(y, mu, weights) | |
2371 intercept = coef.size == X.shape[1] + 1 | |
2372 idx = 1 if intercept else 0 # offset if coef[0] is intercept | |
2373 if P2.ndim == 1: | |
2374 L2 = P2 * coef[idx:] | |
2375 else: | |
2376 L2 = P2 @ coef[idx:] | |
2377 obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2) | |
2378 objp = 0.5 * devp | |
2379 objp[idx:] += L2 | |
2380 return obj, objp | |
2381 | |
2382 args = (X, y, weights, P2, family, link) | |
2383 coef, loss, info = fmin_l_bfgs_b( | |
2384 func, | |
2385 coef, | |
2386 fprime=None, | |
2387 args=args, | |
2388 iprint=(self.verbose > 0) - 1, | |
2389 pgtol=self.tol, | |
2390 maxiter=self.max_iter, | |
2391 factr=1e3, | |
2392 ) | |
2393 if info["warnflag"] == 1: | |
2394 warnings.warn( | |
2395 "lbfgs failed to converge." " Increase the number of iterations.", | |
2396 ConvergenceWarning, | |
2397 ) | |
2398 elif info["warnflag"] == 2: | |
2399 warnings.warn("lbfgs failed for the reason: {}".format(info["task"])) | |
2400 self.n_iter_ = info["nit"] | |
2401 | |
2402 # 4.3 Newton-CG ####################################################### | |
2403 # We again use the Fisher matrix instead of the Hessian; more | |
2404 # precisely, the expected Hessian of the deviance. | |
2405 1 6.0 6.0 0.0 elif solver == "newton-cg": | |
2406 | |
2407 def func(coef, X, y, weights, P2, family, link): | |
2408 intercept = coef.size == X.shape[1] + 1 | |
2409 idx = 1 if intercept else 0 # offset if coef[0] is intercept | |
2410 if P2.ndim == 1: | |
2411 L2 = coef[idx:] @ (P2 * coef[idx:]) | |
2412 else: | |
2413 L2 = coef[idx:] @ (P2 @ coef[idx:]) | |
2414 mu = link.inverse(_safe_lin_pred(X, coef)) | |
2415 return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2 | |
2416 | |
2417 def grad(coef, X, y, weights, P2, family, link): | |
2418 mu, devp = family._mu_deviance_derivative(coef, X, y, weights, link) | |
2419 intercept = coef.size == X.shape[1] + 1 | |
2420 idx = 1 if intercept else 0 # offset if coef[0] is intercept | |
2421 if P2.ndim == 1: | |
2422 L2 = P2 * coef[idx:] | |
2423 else: | |
2424 L2 = P2 @ coef[idx:] | |
2425 objp = 0.5 * devp | |
2426 objp[idx:] += L2 | |
2427 return objp | |
2428 | |
2429 def grad_hess(coef, X, y, weights, P2, family, link): | |
2430 intercept = coef.size == X.shape[1] + 1 | |
2431 idx = 1 if intercept else 0 # offset if coef[0] is intercept | |
2432 if P2.ndim == 1: | |
2433 L2 = P2 * coef[idx:] | |
2434 else: | |
2435 L2 = P2 @ coef[idx:] | |
2436 eta = _safe_lin_pred(X, coef) | |
2437 mu = link.inverse(eta) | |
2438 d1 = link.inverse_derivative(eta) | |
2439 temp = d1 * family.deviance_derivative(y, mu, weights) | |
2440 if intercept: | |
2441 grad = np.concatenate(([0.5 * temp.sum()], 0.5 * temp @ X + L2)) | |
2442 else: | |
2443 grad = 0.5 * temp @ X + L2 # same as 0.5* X.T @ temp + L2 | |
2444 | |
2445 # expected hessian = fisher = X.T @ diag_matrix @ X | |
2446 # calculate only diag_matrix | |
2447 diag = d1 ** 2 / family.variance(mu, phi=1, weights=weights) | |
2448 if intercept: | |
2449 h0i = np.concatenate(([diag.sum()], diag @ X)) | |
2450 | |
2451 def Hs(coef): | |
2452 # return (0.5 * fisher + P2) @ coef | |
2453 # ret = 0.5 * (X.T @ (diag * (X @ coef))) | |
2454 ret = 0.5 * ((diag * (X @ coef[idx:])) @ X) | |
2455 if P2.ndim == 1: | |
2456 ret += P2 * coef[idx:] | |
2457 else: | |
2458 ret += P2 @ coef[idx:] | |
2459 if intercept: | |
2460 ret = np.concatenate( | |
2461 ([0.5 * (h0i @ coef)], ret + 0.5 * coef[0] * h0i[1:]) | |
2462 ) | |
2463 return ret | |
2464 | |
2465 return grad, Hs | |
2466 | |
2467 args = (X, y, weights, P2, family, link) | |
2468 coef, self.n_iter_ = newton_cg( | |
2469 grad_hess, | |
2470 func, | |
2471 grad, | |
2472 coef, | |
2473 args=args, | |
2474 maxiter=self.max_iter, | |
2475 tol=self.tol, | |
2476 ) | |
2477 | |
2478 # 4.4 coordinate descent ############################################## | |
2479 # Note: we already set P1 = l1*P1, see above | |
2480 # Note: we already set P2 = l2*P2, see above | |
2481 # Note: we already symmetrized P2 = 1/2 (P2 + P2') | |
2482 1 6.0 6.0 0.0 elif solver == "cd": | |
2483 2 38164237.0 19082118.5 86.0 coef, self.n_iter_, self._n_cycles = _cd_solver( | |
2484 1 6.0 6.0 0.0 coef=coef, | |
2485 1 5.0 5.0 0.0 X=X, | |
2486 1 5.0 5.0 0.0 y=y, | |
2487 1 5.0 5.0 0.0 weights=weights, | |
2488 1 6.0 6.0 0.0 P1=P1, | |
2489 1 6.0 6.0 0.0 P2=P2, | |
2490 1 7.0 7.0 0.0 fit_intercept=self.fit_intercept, | |
2491 1 6.0 6.0 0.0 family=family, | |
2492 1 5.0 5.0 0.0 link=link, | |
2493 1 5.0 5.0 0.0 max_iter=self.max_iter, | |
2494 1 6.0 6.0 0.0 tol=self.tol, | |
2495 1 6.0 6.0 0.0 selection=self.selection, | |
2496 1 6.0 6.0 0.0 random_state=random_state, | |
2497 1 6.0 6.0 0.0 diag_fisher=self.diag_fisher, | |
2498 1 5.0 5.0 0.0 copy_X=self.copy_X, | |
2499 ) | |
2500 | |
2501 ####################################################################### | |
2502 # 5a. handle intercept | |
2503 ####################################################################### | |
2504 1 8.0 8.0 0.0 if self.fit_intercept: | |
2505 1 8.0 8.0 0.0 self.intercept_ = coef[0] | |
2506 1 8.0 8.0 0.0 self.coef_ = coef[1:] | |
2507 else: | |
2508 # set intercept to zero as the other linear models do | |
2509 self.intercept_ = 0.0 | |
2510 self.coef_ = coef | |
2511 | |
2512 ####################################################################### | |
2513 # 5b. undo standardization | |
2514 ####################################################################### | |
2515 1 7.0 7.0 0.0 if self.standardize: | |
2516 # TODO: Make sure this doesn't copy X | |
2517 2 208084.0 104042.0 0.5 X, self.intercept_, self.coef_ = _unstandardize( | |
2518 1 6.0 6.0 0.0 X, col_means, col_stds, self.intercept_, self.coef_ | |
2519 ) | |
2520 | |
2521 1 10.0 10.0 0.0 if self.fit_dispersion in ["chisqr", "deviance"]: | |
2522 # attention because of rescaling of weights | |
2523 self.dispersion_ = self.estimate_phi(X, y, weights) * weights_sum | |
2524 | |
2525 1 5.0 5.0 0.0 return self | |
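To summarize the penalty bookkeeping that `fit` performs before dispatching to a solver: the elastic-net penalty alpha is split into P1 and P2, sample weights are rescaled to sum to one, and P2 is symmetrized. A hedged sketch with made-up shapes (the estimator does this internally; nothing here is part of its public API):

import numpy as np
import scipy.sparse as sparse

alpha, l1_ratio, n_features = 1.0, 0.5, 4
P1 = alpha * l1_ratio * np.ones(n_features)                 # L1 penalty vector
P2 = alpha * (1 - l1_ratio) * sparse.eye(n_features, format="csc")
P2 = (0.5 * (P2 + P2.T)).tocsc()                            # symmetrize (a no-op here)

sample_weight = np.array([1.0, 2.0, 1.0])
weights = sample_weight / sample_weight.sum()               # sum(weights) == 1, so the
# objective becomes 1/2 * sum(weights * unit_deviance(y, mu))
# + ||P1 * w||_1 + 1/2 * w @ P2 @ w, as in the _cd_solver docstring.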