Compare GLM implementations
# Setup
#
# You need to clone
# - https://github.axa.com/martin-rohbeck-external/k-claims-modelling
# - https://github.axa.com/jan-tilly/glmnet_python
# - https://github.axa.com/jan-tilly/glm_benchmarks
# and then run (adjust paths to repos accordingly)
# conda run -n ${PROJECT_FLAVOR} mamba install -y fortran-compiler
# conda run -n ${PROJECT_FLAVOR} pip install --no-use-pep517 --disable-pip-version-check -e ~/code/k-claims-modelling
# conda run -n ${PROJECT_FLAVOR} --cwd ~/code/glmnet_python python setup.py install
# conda run -n ${PROJECT_FLAVOR} pip install --no-use-pep517 --disable-pip-version-check -e ~/code/glm_benchmarks
import os
from time import perf_counter

import glm_benchmarks.sklearn_fork as sklearn_qc
import numpy as np
import pandas as pd
import scipy.sparse
from glmnet_python import glmnet, glmnetPredict
from npp.config import get_logger

import k_claims_modelling.external.sklearn.linear_model.glm as sklearn
from k_claims_modelling.evaluation.gini import compute_gini
from k_claims_modelling.modelling.regressors import exposure_correction

logger = get_logger()
data_dir = "/home/shared_folders/pricing/claims_modelling/test_data_for_profiling_glms"
# %%
# load data
df = pd.read_parquet(os.path.join(data_dir, "outcomes.parquet"))
X = pd.read_parquet(os.path.join(data_dir, "X.parquet"))
# subsample
idx = df.sample(n=1_000_000).index
df = df.loc[idx].reset_index(drop=True)
X = X.loc[idx].reset_index(drop=True)
print(f"Dim of data: {X.shape}")
print(f"Sparseness: {(np.abs(X.values) < 1e-10).mean():.3f}")
print(f"Size of data frame: {X.memory_usage().sum() / float(1<<30):.2f} GB")
# %%
y = df["sanzkh02"].to_numpy()
exposure = df["je"].to_numpy()
train_set = (df["sample"] == "train").to_numpy()
test_set = (df["sample"] == "test").to_numpy()
future_set = (df["sample"] == "future").to_numpy()
offset = df["offset_kh_sach_frequenz"].to_numpy()
y_adj, sample_weight = exposure_correction(
    power=1, y=y, exposure=exposure, offset=offset
)
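# exposure_correction comes from the internal k-claims-modelling package. It
# presumably folds exposure and offset into an adjusted outcome plus sample
# weights, so that a weighted Poisson fit on y_adj matches a fit of y with
# log(exposure) + offset as an offset term. A hypothetical sketch of that
# identity for power=1 (illustration only, not the actual implementation):
#     sample_weight = exposure * np.exp(offset)
#     y_adj = y / sample_weight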
# %%
model_args = dict(
    family="poisson",
    max_iter=100,
    random_state=12345,
    copy_X=True,
    selection="random",
    tol=1e-5,
)
# %%
for alpha, l1_ratio, sparse in (
    (0.01, 0.00, False),
    (1.00, 0.50, False),
    (0.01, 0.00, True),
    (1.00, 0.50, True),
):
    print("")
    print("*********************************")
    print(f"{alpha=}, {l1_ratio=}, {sparse=}")
    print("")
    if sparse:
        print("Creating sparse matrix...")
        Xtrain = scipy.sparse.csc_matrix(X[train_set])
    else:
        Xtrain = X[train_set].to_numpy()
    # new implementation
    print("")
    print("Using new implementation...")
    tic = perf_counter()
    poisson_regressor_qc = sklearn_qc.GeneralizedLinearRegressor(
        standardize=True, alpha=alpha, l1_ratio=l1_ratio, **model_args
    ).fit(X=Xtrain, y=y_adj[train_set], sample_weight=sample_weight[train_set])
    toc = perf_counter()
    print(f"{toc-tic:.4f} seconds")
    y_pred_qc = poisson_regressor_qc.predict(X[test_set]) * sample_weight[test_set]
    gini = compute_gini(
        y_true=y[test_set], y_score=y_pred_qc, exposure=exposure[test_set]
    )
    print(f"{gini=}")
    print(f"mean outcome: {y[test_set].sum() / exposure[test_set].sum()}")
    print(f"mean prediction: {y_pred_qc.sum() / exposure[test_set].sum()}")
    # glmnet
    print("")
    print("Using glmnet...")
    tic = perf_counter()
    poisson_regressor_glmnet = glmnet(
        x=Xtrain,
        y=y_adj[train_set].copy(),
        weights=sample_weight[train_set],
        standardize=True,
        lambdau=np.array([alpha]),
        alpha=l1_ratio,
        family=model_args["family"],
        thresh=model_args["tol"],
    )
    toc = perf_counter()
    print(f"{toc-tic:.4f} seconds")
    y_pred_glmnet = (
        np.exp(glmnetPredict(poisson_regressor_glmnet, X[test_set]).flatten())
        * sample_weight[test_set]
    )
    gini = compute_gini(
        y_true=y[test_set], y_score=y_pred_glmnet, exposure=exposure[test_set]
    )
    print(f"{gini=}")
    print(f"mean outcome: {y[test_set].sum() / exposure[test_set].sum()}")
    print(f"mean prediction: {y_pred_glmnet.sum() / exposure[test_set].sum()}")
    # compare predictions
    print(f"all close? {np.allclose(y_pred_qc, y_pred_glmnet, atol=1e-04)}")
    # old implementation
    print("")
    print("Using old implementation... (no standardization)")
    tic = perf_counter()
    poisson_regressor = sklearn.GeneralizedLinearRegressor(
        alpha=alpha, l1_ratio=l1_ratio, **model_args
    ).fit(X=Xtrain, y=y_adj[train_set], sample_weight=sample_weight[train_set])
    toc = perf_counter()
    print(f"{toc-tic:.4f} seconds")
    y_pred = poisson_regressor.predict(X[test_set]) * sample_weight[test_set]
    gini = compute_gini(y_true=y[test_set], y_score=y_pred, exposure=exposure[test_set])
    print(f"{gini=}")
    print(f"mean outcome: {y[test_set].sum() / exposure[test_set].sum()}")
    print(f"mean prediction: {y_pred.sum() / exposure[test_set].sum()}")
    # compare predictions
    print(f"all close? {np.allclose(y_pred_qc, y_pred, atol=1e-04)}")
# %%
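# Hypothetical, self-contained variant of the comparison on synthetic Poisson
# data, for readers without access to the internal repos and data above. It
# uses the public scikit-learn PoissonRegressor (pure L2, i.e. the
# l1_ratio=0.0 case) against glmnet_python. The penalty scalings are assumed
# to line up with glmnet's lambda when the mixing weight is 0; treat residual
# disagreement as a parameterization or tolerance difference, not a bug.
from sklearn.linear_model import PoissonRegressor

rng = np.random.default_rng(12345)
n, p = 10_000, 10
X_syn = rng.standard_normal((n, p))
y_syn = rng.poisson(np.exp(X_syn @ rng.normal(scale=0.1, size=p))).astype(float)
pen = 0.01
sk = PoissonRegressor(alpha=pen, max_iter=1000, tol=1e-8).fit(X_syn, y_syn)
gn = glmnet(
    x=X_syn.copy(),
    y=y_syn.copy(),
    family="poisson",
    standardize=False,
    lambdau=np.array([pen]),
    alpha=0.0,
    thresh=1e-10,
)
pred_sk = sk.predict(X_syn)
pred_gn = np.exp(glmnetPredict(gn, X_syn).flatten())
print(f"max abs difference: {np.abs(pred_sk - pred_gn).max():.2e}")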
Dim of data: (10000000, 107)
Sparseness: 0.683
Size of data frame: 7.97 GB
*********************************
alpha=0.01, l1_ratio=0.0, sparse=False
Using new implementation...
41.0648 seconds
gini=0.26368907871297714
mean outcome: 0.04747663840483652
mean prediction: 0.047279319092816983
Using glmnet...
91.3531 seconds
gini=0.27551655391490293
mean outcome: 0.04747663840483652
mean prediction: 0.04728485394366472
all close? False
Using old implementation... (no standardization)
36.6776 seconds
gini=0.25560893366357507
mean outcome: 0.04747663840483652
mean prediction: 0.047348574848387524
all close? False
*********************************
alpha=1.0, l1_ratio=0.5, sparse=False
Using new implementation...
0.09614459466725817 0 0
0.034378559002931665 1 1
0.01170238383401215 2 2
0.0034775112012216183 3 3
0.000708324797992112 4 4
5.2231794878223296e-05 5 5
56.4671 seconds
gini=0.11972711376286291
mean outcome: 0.04747663840483652
mean prediction: 0.04729607839402956
Using glmnet...
7.4578 seconds
gini=0.12023107566353006
mean outcome: 0.04747663840483652
mean prediction: 0.04729181545853376
all close? True
Using old implementation... (no standardization)
0.9614459466725815 0 0
0.34378559002931663 1 1
0.11702383834012149 2 2
0.03477511201221618 3 3
0.00708324797992112 4 4
0.0005223179487822329 5 5
52.6287 seconds
gini=0.11972711376286291
mean outcome: 0.04747663840483652
mean prediction: 0.04729607839402956
all close? True
*********************************
alpha=0.01, l1_ratio=0.0, sparse=True
Creating sparse matrix...
Using new implementation...
338.1123 seconds
gini=0.263689078359197
mean outcome: 0.04747663840483652
mean prediction: 0.047279319092816886
Using glmnet...
57.4690 seconds
gini=0.2755165538902576
mean outcome: 0.04747663840483652
mean prediction: 0.0472848539436662
all close? False
Using old implementation... (no standardization)
291.9076 seconds
gini=0.2636890785530665
mean outcome: 0.04747663840483652
mean prediction: 0.04727931909281683
all close? True
*********************************
alpha=1.0, l1_ratio=0.5, sparse=True
Creating sparse matrix...
Using new implementation...
0.09614459466725817 0 0
0.034378559002931665 1 1
0.01170238383401215 2 2
0.0034775112012216183 3 3
0.000708324797992112 4 4
5.2231794878223296e-05 5 5
456.6564 seconds
gini=0.11972711376286291
mean outcome: 0.04747663840483652
mean prediction: 0.04729607839402956
Using glmnet...
3.9307 seconds
gini=0.12011246482376858
mean outcome: 0.04747663840483652
mean prediction: 0.04729181545858199
all close? True
Using old implementation... (no standardization)
1.344542742914321 0 0
0.6710968536517165 1 1
0.11795095198240992 2 2
0.03510388281768133 3 3
0.007180409077755724 4 4
0.0005352931468411203 5 5
389.8593 seconds
gini=0.12033554412788987
mean outcome: 0.04747663840483652
mean prediction: 0.04729629084703527
all close? True
Dim of data: (1000000, 107)
Sparseness: 0.683
Size of data frame: 0.80 GB
*********************************
alpha=0.01, l1_ratio=0.0, sparse=False
Using new implementation...
4.1740 seconds
gini=0.263818790086771
mean outcome: 0.04676540361757755
mean prediction: 0.04670313684179153
Using glmnet...
7.9436 seconds
gini=0.2724562578231543
mean outcome: 0.04676540361757755
mean prediction: 0.04674076928845567
all close? False
Using old implementation... (no standardization)
3.6724 seconds
gini=0.2532641359545414
mean outcome: 0.04676540361757755
mean prediction: 0.04619259199982138
all close? False
*********************************
alpha=1.0, l1_ratio=0.5, sparse=False
Using new implementation...
0.09619816120501168 0 0
0.03441165041414234 1 1
0.011726676309588263 2 2
0.003495483932480208 3 3
0.000718099939306369 4 4
5.41389992328904e-05 5 5
5.6504 seconds
gini=0.12005884445047671
mean outcome: 0.04676540361757755
mean prediction: 0.04666568650088688
Using glmnet...
0.6818 seconds
gini=0.12027297891459444
mean outcome: 0.04676540361757755
mean prediction: 0.046661043630974286
all close? True
Using old implementation... (no standardization)
0.9619816120501168 0 0
0.3441165041414234 1 1
0.11726676309588263 2 2
0.034954839324802076 3 3
0.0071809993930636905 4 4
0.000541389992328904 5 5
5.2115 seconds
gini=0.12005884445047671
mean outcome: 0.04676540361757755
mean prediction: 0.04666568650088688
all close? True
*********************************
alpha=0.01, l1_ratio=0.0, sparse=True
Creating sparse matrix...
Using new implementation...
32.5009 seconds
gini=0.2638187936317561
mean outcome: 0.04676540361757755
mean prediction: 0.046703136841791504
Using glmnet...
4.2908 seconds
gini=0.2724562578231521
mean outcome: 0.04676540361757755
mean prediction: 0.046740769288456385
all close? False
Using old implementation... (no standardization)
28.1692 seconds
gini=0.2638187961688558
mean outcome: 0.04676540361757755
mean prediction: 0.04670313684179154
all close? True
*********************************
alpha=1.0, l1_ratio=0.5, sparse=True
Creating sparse matrix...
Using new implementation...
0.09619816120501168 0 0
0.03441165041414234 1 1
0.011726676309588263 2 2
0.003495483932480208 3 3
0.000718099939306369 4 4
5.41389992328904e-05 5 5
42.4812 seconds
gini=0.12005884445047671
mean outcome: 0.04676540361757755
mean prediction: 0.04666568650088688
Using glmnet...
0.3295 seconds
gini=0.11954649640739994
mean outcome: 0.04676540361757755
mean prediction: 0.04666104363097935
all close? True
Using old implementation... (no standardization)
1.3457867135381405 0 0
0.6713103191660521 1 1
0.11819264109487121 2 2
0.03528348997508118 3 3
0.0072786327343033605 4 4
0.0005546862828521083 5 5
36.9609 seconds
gini=0.11950432517998748
mean outcome: 0.04676540361757755
mean prediction: 0.04666591512425464
all close? True
Dim of data: (1000000, 107)
Sparseness: 0.683
Size of data frame: 0.80 GB
*********************************
alpha=1.0, l1_ratio=0.5, sparse=False
Using new implementation...
0.09615708450804317 0 0
0.034386273780650345 1 1
0.011708045105922351 2 2
0.003481694975107442 3 3
0.000710592744205067 4 4
5.2670328567151794e-05 5 5
8.5725 seconds
gini=0.11747903153166883
mean outcome: 0.04873558735427359
mean prediction: 0.04714181671456767
Wrote profile results to glm_qc.py.lprof
Timer unit: 1e-06 s
Total time: 6.57312 s
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py
Function: _safe_sandwich_dot at line 105
Line # Hits Time Per Hit % Time Line Contents
==============================================================
105 @profile
106 def _safe_sandwich_dot(X, d, intercept=False):
107 """Compute sandwich product X.T @ diag(d) @ X.
108
109 With ``intercept=True``, X is treated as if a column of ones were
110 appended as the first column of X.
111 X can be sparse, d must be an ndarray. Always returns an ndarray."""
112 8 74.0 9.2 0.0 if sparse.issparse(X):
113 # TODO: There's a bit of room to accelerate this by avoiding the
114 # allocation of a new sparse matrix every time we pass through this
115 # step. The X.multiply creates a new matrix. Avoiding the allocation
116 # and updating the rows in place accelerates this line by ~20%.
117 temp = X.transpose() @ X.multiply(d[:, np.newaxis])
118 # for older versions of numpy and scipy, temp may be a np.matrix
119 temp = _safe_toarray(temp)
120 8 22.0 2.8 0.0 elif type(X) is ColScaledSpMat:
121 term1 = X.mat.transpose() @ X.mat.multiply(d[:, np.newaxis])
122 term2 = X.mat.transpose().dot(d)[:, np.newaxis] * X.shift
123 term3 = term2.T
124 term4 = (X.shift.T * X.shift) * d.sum()
125 temp = term1 + term2 + term3 + term4
126 temp = _safe_toarray(temp)
127 else:
128 8 6477524.0 809690.5 98.5 temp = (X.T * d) @ X
129 8 40.0 5.0 0.0 if intercept:
130 8 38.0 4.8 0.0 dim = X.shape[1] + 1
131 8 31.0 3.9 0.0 if type(X) is ColScaledSpMat:
132 order = "F"
133 8 79.0 9.9 0.0 elif sparse.issparse(X):
134 order = "F" if sparse.isspmatrix_csc(X) else "C"
135 else:
136 8 36.0 4.5 0.0 order = "F" if X.flags["F_CONTIGUOUS"] else "C"
137 8 159.0 19.9 0.0 res = np.empty((dim, dim), dtype=max(X.dtype, d.dtype), order=order)
138 8 4656.0 582.0 0.1 res[0, 0] = d.sum()
139 8 90118.0 11264.8 1.4 res[1:, 0] = d @ X
140 8 56.0 7.0 0.0 res[0, 1:] = res[1:, 0]
141 8 272.0 34.0 0.0 res[1:, 1:] = temp
142 else:
143 res = temp
144 8 12.0 1.5 0.0 return res
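The dense sandwich product on source line 128 dominates this function (98.5%
of its time). A quick numpy sanity check of the identity that line exploits,
for illustration only:

import numpy as np
X = np.arange(12.0).reshape(4, 3)
d = np.array([1.0, 0.5, 2.0, 0.1])
assert np.allclose((X.T * d) @ X, X.T @ np.diag(d) @ X)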
Total time: 6.3131 s
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py
Function: _eta_mu_score_fisher at line 730
Line # Hits Time Per Hit % Time Line Contents
==============================================================
730 @profile
731 def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, diag_fisher=False):
732 """Compute linear predictor, mean, score function and Fisher matrix.
733
734 It calculates the linear predictor, the mean, the score function
735 (derivative of the log-likelihood) and the Fisher information matrix
736 all in one go as a function of `coef` (:math:`w`) and the data.
737
738 Parameters
739 ----------
740 diag_fisher : boolean, optional (default=False)
741 If ``True``, returns only an array d such that
742 fisher = X.T @ np.diag(d) @ X.
743
744 Returns
745 -------
746 (eta, mu, score, fisher) : tuple with 4 elements
747 The 4 elements are:
748
749 * eta: ndarray, shape (X.shape[0],)
750 * mu: ndarray, shape (X.shape[0],)
751 * score: ndarray, shape (X.shape[0],)
752 * fisher:
753
754 * If diag_fisher is ``False``, the full Fisher matrix,
755 an array of shape (X.shape[1], X.shape[1])
756 * If diag_fisher is ``True``, an array of shape (X.shape[0])
757 """
758 7 22.0 3.1 0.0 intercept = coef.size == X.shape[1] + 1
759 # eta = linear predictor
760 7 87047.0 12435.3 1.4 eta = _safe_lin_pred(X, coef)
761 7 109902.0 15700.3 1.7 mu = link.inverse(eta)
762
763 # FOR TWEEDIE: sigma_inv = weights / (mu ** p) during optimization bc phi = 1
764 7 77727.0 11103.9 1.2 sigma_inv = 1.0 / self.variance(mu, phi=phi, weights=weights)
765 7 107635.0 15376.4 1.7 d1 = link.inverse_derivative(eta) # = h'(eta)
766 # Alternatively:
767 # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g
768 # d1 = 1./link.derivative(mu)
769 7 12412.0 1773.1 0.2 d1_sigma_inv = d1 * sigma_inv
770 7 19982.0 2854.6 0.3 temp = d1_sigma_inv * (y - mu)
771 7 19.0 2.7 0.0 if intercept:
772 7 78613.0 11230.4 1.2 score = np.concatenate(([temp.sum()], temp @ X))
773 else:
774 score = temp @ X
775
776 7 12680.0 1811.4 0.2 d2_sigma_inv = d1 * d1_sigma_inv
777 7 17.0 2.4 0.0 if diag_fisher:
778 fisher_matrix = d2_sigma_inv
779 else:
780 7 5807028.0 829575.4 92.0 fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, intercept=intercept)
781 7 12.0 1.7 0.0 return eta, mu, score, fisher_matrix
Total time: 0.02796 s
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py
Function: _cd_cycle at line 1131
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1131 @profile
1132 def _cd_cycle(
1133 d,
1134 X,
1135 coef,
1136 score,
1137 fisher,
1138 P1,
1139 P2,
1140 n_cycles,
1141 inner_tol,
1142 max_inner_iter=1000,
1143 selection="cyclic",
1144 random_state=None,
1145 diag_fisher=False,
1146 ):
1147 """Compute inner loop of coordinate descent, i.e. cycles through features.
1148
1149 Minimization of 1-d subproblems::
1150
1151 min_z q(d+z*e_j) - q(d)
1152 = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1
1153
1154 A = f'(w) + d*H(w) + (w+d)*P2
1155 B = H+P2
1156 Note: f'=-score and H=fisher are updated at the end of outer iteration.
1157 """
1158 # TODO: use sparsity (coefficient already 0 due to L1 penalty)
1159 # => active set of features for featurelist, see paper
1160 # of Improved GLMNET or Gap Safe Screening Rules
1161 # https://arxiv.org/abs/1611.05780
1162 7 24.0 3.4 0.1 n_samples, n_features = X.shape
1163 7 22.0 3.1 0.1 intercept = coef.size == X.shape[1] + 1
1164 7 10.0 1.4 0.0 idx = 1 if intercept else 0 # offset if coef[0] is intercept
1165 # ES: This is weird. Should this be B = fisher.copy()? Why would you do this?
1166 7 10.0 1.4 0.0 B = fisher
1167 7 14.0 2.0 0.1 if P2.ndim == 1:
1168 7 93.0 13.3 0.3 coef_P2 = coef[idx:] * P2
1169 7 10.0 1.4 0.0 if not diag_fisher:
1170 7 92.0 13.1 0.3 idiag = np.arange(start=idx, stop=B.shape[0])
1171 # B[np.diag_indices_from(B)] += P2
1172 7 198.0 28.3 0.7 B[(idiag, idiag)] += P2
1173 else:
1174 coef_P2 = coef[idx:] @ P2
1175 if not diag_fisher:
1176 if sparse.issparse(P2):
1177 B[idx:, idx:] += P2.toarray()
1178 else:
1179 B[idx:, idx:] += P2
1180 7 32.0 4.6 0.1 A = -score
1181 7 44.0 6.3 0.2 A[idx:] += coef_P2
1182 # A += d @ (H+P2) but so far d=0
1183 # inner loop
1184 7 33.0 4.7 0.1 for inner_iter in range(1, max_inner_iter + 1):
1185 7 13.0 1.9 0.0 inner_iter += 1
1186 7 11.0 1.6 0.0 n_cycles += 1
1187 # cycle through features, update intercept separately at the end
1188 7 15.0 2.1 0.1 if selection == "random":
1189 7 232.0 33.1 0.8 featurelist = random_state.permutation(n_features)
1190 else:
1191 featurelist = np.arange(n_features)
1192 756 1432.0 1.9 5.1 for j in featurelist:
1193 # minimize_z: a z + 1/2 b z^2 + c |d+z|
1194 # a = A_j
1195 # b = B_jj > 0
1196 # c = |P1_j| = P1_j > 0, see 1.3
1197 # d = w_j + d_j
1198 # cf. https://arxiv.org/abs/0708.1485 Eqs. (3) - (4)
1199 # with beta = z+d, beta_hat = d-a/b and gamma = c/b
1200 # z = 1/b * S(bd-a,c) - d
1201 # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding
1202 749 1523.0 2.0 5.4 jdx = j + idx # index for arrays containing entries for intercept
1203 749 1355.0 1.8 4.8 a = A[jdx]
1204 749 1101.0 1.5 3.9 if diag_fisher:
1205 # Note: fisher is ndarray of shape (n_samples,) => no idx
1206 # Calculate Bj = B[j, :] = B[:, j] as it is needed later anyway
1207 Bj = np.zeros_like(A)
1208 if intercept:
1209 Bj[0] = fisher.sum()
1210 if sparse.issparse(X):
1211 Bj[idx:] = _safe_toarray(
1212 X[:, j].transpose() @ X.multiply(fisher[:, np.newaxis])
1213 ).ravel()
1214 else:
1215 Bj[idx:] = (fisher * X[:, j]) @ X
1216
1217 if P2.ndim == 1:
1218 Bj[idx:] += P2[j]
1219 else:
1220 if sparse.issparse(P2):
1221 # slice columns as P2 is csc
1222 Bj[idx:] += P2[:, j].toarray().ravel()
1223 else:
1224 Bj[idx:] += P2[:, j]
1225 b = Bj[jdx]
1226 else:
1227 749 1421.0 1.9 5.1 b = B[jdx, jdx]
1228
1229 # those ten lines are what it is all about
1230 749 1513.0 2.0 5.4 if b <= 0:
1231 z = 0
1232 749 1674.0 2.2 6.0 elif P1[j] == 0:
1233 z = -a / b
1234 749 2334.0 3.1 8.3 elif a + P1[j] < b * (coef[jdx] + d[jdx]):
1235 z = -(a + P1[j]) / b
1236 749 2238.0 3.0 8.0 elif a - P1[j] > b * (coef[jdx] + d[jdx]):
1237 z = -(a - P1[j]) / b
1238 else:
1239 749 1744.0 2.3 6.2 z = -(coef[jdx] + d[jdx])
1240
1241 # update direction d
1242 749 1634.0 2.2 5.8 d[jdx] += z
1243 # update A because d_j is now d_j+z
1244 # A = f'(w) + d*H(w) + (w+d)*P2
1245 # => A += (H+P2)*e_j z = B_j * z
1246 # Note: B is symmetric B = B.transpose
1247 749 1117.0 1.5 4.0 if diag_fisher:
1248 # Bj = B[:, j] calculated above, still valid
1249 A += Bj * z
1250 else:
1251 # B is symmetric, C- or F-contiguous, but never sparse
1252 749 1478.0 2.0 5.3 if B.flags["F_CONTIGUOUS"]:
1253 # slice columns like for sparse csc
1254 749 5177.0 6.9 18.5 A += B[:, jdx] * z
1255 else: # B.flags['C_CONTIGUOUS'] might be true
1256 # slice rows
1257 A += B[jdx, :] * z
1258 # end of cycle over features
1259 # update intercept
1260 7 12.0 1.7 0.0 if intercept:
1261 7 9.0 1.3 0.0 if diag_fisher:
1262 Bj = np.zeros_like(A)
1263 Bj[0] = fisher.sum()
1264 Bj[1:] = fisher @ X
1265 b = Bj[0]
1266 else:
1267 7 15.0 2.1 0.1 b = B[0, 0]
1268 7 24.0 3.4 0.1 z = 0 if b <= 0 else -A[0] / b
1269 7 16.0 2.3 0.1 d[0] += z
1270 7 11.0 1.6 0.0 if diag_fisher:
1271 A += Bj * z
1272 else:
1273 7 12.0 1.7 0.0 if B.flags["F_CONTIGUOUS"]:
1274 7 47.0 6.7 0.2 A += B[:, 0] * z
1275 else:
1276 A += B[0, :] * z
1277 # end of complete cycle
1278 # stopping criterion for inner loop
1279 # sum_i(|minimum of norm of subgrad of q(d)_i|)
1280 # subgrad q(d) = A + subgrad ||P1*(w+d)||_1
1281 7 571.0 81.6 2.0 mn_subgrad = _min_norm_sugrad(coef=coef + d, grad=A, P2=None, P1=P1)
1282 7 605.0 86.4 2.2 mn_subgrad = linalg.norm(mn_subgrad, ord=1)
1283 7 14.0 2.0 0.1 if mn_subgrad <= inner_tol:
1284 7 11.0 1.6 0.0 if inner_iter == 1:
1285 inner_tol = inner_tol / 4.0
1286 7 9.0 1.3 0.0 break
1287 # end of inner loop
1288 7 10.0 1.4 0.0 return d, coef_P2, n_cycles, inner_tol
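For reference, the closed-form minimizer that the cycle above applies per
coordinate (the branch at source lines 1230-1239), restated as a standalone
function; a sketch for illustration only:

def cd_1d_update(a, b, c, wd):
    """Minimize a*z + 0.5*b*z**2 + c*abs(wd + z) over z, with c = P1_j >= 0."""
    if b <= 0:
        return 0.0
    if c == 0:
        return -a / b
    if a + c < b * wd:
        return -(a + c) / b
    if a - c > b * wd:
        return -(a - c) / b
    return -wd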
Total time: 6.94774 s
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py
Function: _cd_solver at line 1291
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1291 @profile
1292 def _cd_solver(
1293 coef,
1294 X,
1295 y,
1296 weights,
1297 P1,
1298 P2,
1299 fit_intercept,
1300 family,
1301 link,
1302 max_iter=100,
1303 max_inner_iter=1000,
1304 tol=1e-4,
1305 selection="cyclic",
1306 random_state=None,
1307 diag_fisher=False,
1308 copy_X=True,
1309 ):
1310 """Solve GLM with L1 and L2 penalty by coordinate descent algorithm.
1311
1312 The objective being minimized in the coefficients w=coef is::
1313
1314 F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1
1315
1316 An Improved GLMNET for L1-regularized Logistic Regression:
1317
1318 1. Find optimal descent direction d by minimizing
1319 min_d F(w+d) = min_d F(w+d) - F(w)
1320 2. Quadratic approximation of F(w+d)-F(w) = q(d):
1321 using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives:
1322 q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d
1323 + ||P1*(w+d)||_1 - ||P1*w||_1
1324 Then minimize q(d): min_d q(d)
1325 3. Coordinate descent by updating coordinate j (d -> d+z*e_j):
1326 min_z q(d+z*e_j)
1327 = min_z q(d+z*e_j) - q(d)
1328 = min_z A_j z + 1/2 B_jj z^2
1329 + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1
1330 A = f'(w) + d*H(w) + (w+d)*P2
1331 B = H + P2
1332
1333 Repeat steps 1-3 until convergence.
1334 Note: Use Fisher matrix instead of Hessian for H.
1335 Note: f' = -score, H = Fisher matrix
1336
1337 Parameters
1338 ----------
1339 coef : ndarray, shape (c,)
1340 If fit_intercept=False, shape c=X.shape[1].
1341 If fit_intercept=True, then c=X.shape[1] + 1.
1342
1343 X : {ndarray, csc sparse matrix}, shape (n_samples, n_features)
1344 Training data (with intercept included if present). If not sparse,
1345 pass directly as Fortran-contiguous data to avoid
1346 unnecessary memory duplication.
1347
1348 y : ndarray, shape (n_samples,)
1349 Target values.
1350
1351 weights: ndarray, shape (n_samples,)
1352 Sample weights with which the deviance is weighted. The weights must
1353 be normalized and sum to 1.
1354
1355 P1 : {ndarray}, shape (n_features,)
1356 The L1-penalty vector (=diagonal matrix)
1357
1358 P2 : {ndarray, csc sparse matrix}, shape (n_features, n_features)
1359 The L2-penalty matrix or vector (=diagonal matrix). If a matrix is
1360 passed, it must be symmetric. If X is sparse, P2 must also be sparse.
1361
1362 fit_intercept : boolean, optional (default=True)
1363 Specifies if a constant (a.k.a. bias or intercept) should be
1364 added to the linear predictor (X*coef+intercept).
1365
1366 family : ExponentialDispersionModel
1367
1368 link : Link
1369
1370 max_iter : int, optional (default=100)
1371 Maximum number of outer (Newton) iterations.
1372
1373 max_inner_iter : int, optional (default=1000)
1374 Maximum number of iterations in each inner loop, i.e. max number of
1375 cycles over all features per inner loop.
1376
1377 tol : float, optional (default=1e-4)
1378 Convergence criterion is
1379 sum_i(|minimum of norm of subgrad of objective_i|)<=tol.
1380
1381 selection : str, optional (default='cyclic')
1382 If 'random', randomly choose features in inner loop.
1383
1384 random_state : {int, RandomState instance, None}, optional (default=None)
1385
1386 diag_fisher : boolean, optional (default=False)
1387 ``False`` calculates full Fisher matrix, ``True`` only diagonal matrix
1388 s.t. fisher = X.T @ diag @ X. This saves storage but needs more
1389 matrix-vector multiplications.
1390
1391 copy_X : boolean, optional (default=True)
1392 If ``True``, X will be copied; else, it may be overwritten.
1393
1394 Returns
1395 -------
1396 coef : ndarray, shape (c,)
1397 If fit_intercept=False, shape c=X.shape[1].
1398 If fit_intercept=True, then c=X.shape[1] + 1.
1399
1400 n_iter : number of outer iterations = Newton iterations
1401
1402 n_cycles : number of cycles over features
1403
1404 References
1405 ----------
1406 Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin
1407 An Improved GLMNET for L1-regularized Logistic Regression,
1408 Journal of Machine Learning Research 13 (2012) 1999-2030
1409 https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf
1410 """
1411 1 4.0 4.0 0.0 if type(X) is not ColScaledSpMat:
1412 # TODO: because copy_X is being passed here and is also being passed in
1413 # check_X_y in fit(...), X is being copied twice. Check if this is true.
1414 2 279886.0 139943.0 4.0 X = check_array(
1415 1 4.0 4.0 0.0 X, "csc", dtype=[np.float64, np.float32], order="F", copy=copy_X
1416 )
1417 1 7.0 7.0 0.0 if P2.ndim == 2:
1418 P2 = check_array(
1419 P2, "csc", dtype=[np.float64, np.float32], order="F", copy=copy_X
1420 )
1421 1 9.0 9.0 0.0 if sparse.issparse(X):
1422 if not sparse.isspmatrix_csc(P2):
1423 raise ValueError(
1424 "If X is sparse, P2 must also be sparse csc "
1425 "format. Got P2 not sparse."
1426 )
1427 1 30.0 30.0 0.0 random_state = check_random_state(random_state)
1428 # Note: we already set P2 = l2*P2, P1 = l1*P1
1429 # Note: we already symmetrized P2 = 1/2 (P2 + P2')
1430 1 2.0 2.0 0.0 n_iter = 0 # number of outer iterations
1431 1 2.0 2.0 0.0 n_cycles = 0 # number of (complete) cycles over features
1432 1 2.0 2.0 0.0 converged = False
1433 1 2.0 2.0 0.0 n_samples, n_features = X.shape
1434 1 2.0 2.0 0.0 idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept
1435 # line search parameters
1436 1 2.0 2.0 0.0 (beta, sigma) = (0.5, 0.01)
1437 # some precalculations
1438 # Note: For diag_fisher=False, fisher = X.T @ fisher @ X and fisher is a
1439 # 1d array representing a diagonal matrix.
1440 2 859869.0 429934.5 12.4 eta, mu, score, fisher = family._eta_mu_score_fisher(
1441 1 3.0 3.0 0.0 coef=coef, phi=1, X=X, y=y, weights=weights, link=link, diag_fisher=diag_fisher
1442 )
1443 # set up space for search direction d for inner loop
1444 1 50.0 50.0 0.0 d = np.zeros_like(coef)
1445
1446 # minimum subgradient norm
1447 1 4.0 4.0 0.0 def calc_mn_subgrad_norm():
1448 return linalg.norm(
1449 _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1), ord=1
1450 )
1451
1452 # the ratio of inner _cd_cycle tolerance to the minimum subgradient norm
1453 # This wasn't explored in the newGLMNET paper linked above.
1454 # That paper essentially uses inner_tol_ratio = 1.0, but using a slightly
1455 # lower value is much faster.
1456 # By comparison, the original GLMNET paper uses inner_tol = tol.
1457 # So, inner_tol_ratio < 1 is sort of a compromise between the two papers.
1458 # The value should probably be between 0.01 and 0.5. 0.1 works well for many problems
1459 1 3.0 3.0 0.0 inner_tol_ratio = 0.1
1460
1461 1 3.0 3.0 0.0 def calc_inner_tol(mn_subgrad_norm):
1462 # Another potential rule limits the inner tol to be no smaller than tol
1463 # return max(mn_subgrad_norm * inner_tol_ratio, tol)
1464 return mn_subgrad_norm * inner_tol_ratio
1465
1466 # initial stopping tolerance of inner loop
1467 # use L1-norm of minimum of norm of subgradient of F
1468 1 197.0 197.0 0.0 inner_tol = calc_inner_tol(calc_mn_subgrad_norm())
1469
1470 1 3.0 3.0 0.0 Fw = None
1471
1472 # outer loop
1473 6 19.0 3.2 0.0 while n_iter < max_iter:
1474 6 23751.0 3958.5 0.3 print(inner_tol, n_iter, n_cycles)
1475 6 45.0 7.5 0.0 n_iter += 1
1476 # initialize search direction d (to be optimized) with zero
1477 6 1800.0 300.0 0.0 d.fill(0)
1478 # inner loop = _cd_cycle
1479 12 41985.0 3498.8 0.6 d, coef_P2, n_cycles, inner_tol = _cd_cycle(
1480 6 18.0 3.0 0.0 d,
1481 6 18.0 3.0 0.0 X,
1482 6 19.0 3.2 0.0 coef,
1483 6 19.0 3.2 0.0 score,
1484 6 17.0 2.8 0.0 fisher,
1485 6 20.0 3.3 0.0 P1,
1486 6 892.0 148.7 0.0 P2,
1487 6 19.0 3.2 0.0 n_cycles,
1488 6 17.0 2.8 0.0 inner_tol,
1489 6 19.0 3.2 0.0 max_inner_iter=max_inner_iter,
1490 6 17.0 2.8 0.0 selection=selection,
1491 6 18.0 3.0 0.0 random_state=random_state,
1492 6 19.0 3.2 0.0 diag_fisher=diag_fisher,
1493 )
1494
1495 # line search by sequence beta^k, k=0, 1, ..
1496 # F(w + lambda d) - F(w) <= lambda * bound
1497 # bound = sigma * (f'(w)*d + w*P2*d
1498 # +||P1 (w+d)||_1 - ||P1 w||_1)
1499 6 303.0 50.5 0.0 P1w_1 = linalg.norm(P1 * coef[idx:], ord=1)
1500 6 282.0 47.0 0.0 P1wd_1 = linalg.norm(P1 * (coef + d)[idx:], ord=1)
1501 # Note: coef_P2 already calculated and still valid
1502 6 109.0 18.2 0.0 bound = sigma * (-(score @ d) + coef_P2 @ d[idx:] + P1wd_1 - P1w_1)
1503
1504 # In the first iteration, we must compute Fw explicitly.
1505 # In later iterations, we just use Fwd from the previous iteration
1506 # as set after the line search loop below.
1507 6 18.0 3.0 0.0 if Fw is None:
1508 1 3.0 3.0 0.0 Fw = (
1509 3 10457.0 3485.7 0.2 0.5 * family.deviance(y, mu, weights)
1510 1 21.0 21.0 0.0 + 0.5 * (coef_P2 @ coef[idx:])
1511 1 3.0 3.0 0.0 + P1w_1
1512 )
1513
1514 6 24.0 4.0 0.0 la = 1.0 / beta
1515
1516 6 83303.0 13883.8 1.2 X_dot_d = _safe_lin_pred(X, d)
1517
1518 # Try progressively shorter line search steps.
1519 6 53.0 8.8 0.0 for k in range(20):
1520 6 21.0 3.5 0.0 la *= beta # starts with la=1
1521 6 86.0 14.3 0.0 coef_wd = coef + la * d
1522
1523 # The simple version of the next line is:
1524 # mu_wd = link.inverse(_safe_lin_pred(X, coef_wd))
1525 # but because coef_wd can be factored as
1526 # coef_wd = coef + la * d
1527 # we can rewrite to only perform one dot product with the data
1528 # matrix per loop which is substantially faster
1529 6 123883.0 20647.2 1.8 mu_wd = link.inverse(eta + la * X_dot_d)
1530
1531 # TODO - optimize: for Tweedie that isn't one of the special cases
1532 # (gaussian, poisson, gamma), family.deviance is quite slow! Can we
1533 # fix that somehow?
1534 12 64803.0 5400.2 0.9 Fwd = 0.5 * family.deviance(y, mu_wd, weights) + linalg.norm(
1535 6 78.0 13.0 0.0 P1 * coef_wd[idx:], ord=1
1536 )
1537 6 19.0 3.2 0.0 if P2.ndim == 1:
1538 6 164.0 27.3 0.0 Fwd += 0.5 * ((coef_wd[idx:] * P2) @ coef_wd[idx:])
1539 else:
1540 Fwd += 0.5 * (coef_wd[idx:] @ (P2 @ coef_wd[idx:]))
1541 6 27.0 4.5 0.0 if Fwd - Fw <= sigma * la * bound:
1542 6 19.0 3.2 0.0 break
1543
1544 # Fw in the next iteration will be equal to Fwd this iteration.
1545 6 18.0 3.0 0.0 Fw = Fwd
1546
1547 # update coefficients
1548 6 76.0 12.7 0.0 coef += la * d
1549
1550 # calculate eta, mu, score, Fisher matrix for next iteration
1551 12 5453705.0 454475.4 78.5 eta, mu, score, fisher = family._eta_mu_score_fisher(
1552 6 17.0 2.8 0.0 coef=coef,
1553 6 17.0 2.8 0.0 phi=1,
1554 6 17.0 2.8 0.0 X=X,
1555 6 16.0 2.7 0.0 y=y,
1556 6 18.0 3.0 0.0 weights=weights,
1557 6 17.0 2.8 0.0 link=link,
1558 6 17.0 2.8 0.0 diag_fisher=diag_fisher,
1559 )
1560
1561 # stopping criterion for outer loop
1562 # sum_i(|minimum-norm of subgrad of F(w)_i|)
1563 # fp_wP2 = f'(w) + w*P2
1564 # Note: eta, mu and score are already updated
1565 # this also updates the inner tolerance for the next loop!
1566 6 1286.0 214.3 0.0 mn_subgrad_norm = calc_mn_subgrad_norm()
1567 6 29.0 4.8 0.0 if mn_subgrad_norm <= tol:
1568 1 3.0 3.0 0.0 converged = True
1569 1 3.0 3.0 0.0 break
1570
1571 5 27.0 5.4 0.0 inner_tol = calc_inner_tol(mn_subgrad_norm)
1572 # end of outer loop
1573
1574 1 3.0 3.0 0.0 if not converged:
1575 warnings.warn(
1576 "Coordinate descent failed to converge. Increase"
1577 " the maximum number of iterations max_iter"
1578 " (currently {})".format(max_iter),
1579 ConvergenceWarning,
1580 )
1581
1582 1 3.0 3.0 0.0 return coef, n_iter, n_cycles
Total time: 8.55864 s
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py
Function: fit at line 1884
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1884 @profile
1885 def fit(self, X, y, sample_weight=None):
1886 """Fit a Generalized Linear Model.
1887
1888 Parameters
1889 ----------
1890 X : {array-like, sparse matrix}, shape (n_samples, n_features)
1891 Training data.
1892
1893 y : array-like, shape (n_samples,)
1894 Target values.
1895
1896 sample_weight : {None, array-like}, shape (n_samples,),\
1897 optional (default=None)
1898 Individual weights w_i for each sample. Note that for an
1899 Exponential Dispersion Model (EDM), one has
1900 Var[Y_i]=phi/w_i * v(mu).
1901 If Y_i ~ EDM(mu, phi/w_i), then
1902 sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a
1903 weighted average with weights=sample_weight.
1904
1905 Returns
1906 -------
1907 self : returns an instance of self.
1908 """
1909 # 99.8% of time is in the _cd_solver function
1910 #######################################################################
1911 # 1. input validation #
1912 #######################################################################
1913 # 1.1 validate arguments of __init__ ##################################
1914 # Guarantee that self._family_instance is an instance of class
1915 # ExponentialDispersionModel
1916 1 56.0 56.0 0.0 if isinstance(self.family, ExponentialDispersionModel):
1917 self._family_instance = self.family
1918 else:
1919 1 5.0 5.0 0.0 if self.family == "normal":
1920 self._family_instance = NormalDistribution()
1921 1 5.0 5.0 0.0 elif self.family == "poisson":
1922 1 27.0 27.0 0.0 self._family_instance = PoissonDistribution()
1923 elif self.family == "gamma":
1924 self._family_instance = GammaDistribution()
1925 elif self.family == "inverse.gaussian":
1926 self._family_instance = InverseGaussianDistribution()
1927 elif self.family == "binomial":
1928 self._family_instance = BinomialDistribution()
1929 else:
1930 raise ValueError(
1931 "The family must be an instance of class"
1932 " ExponentialDispersionModel or an element of"
1933 " ['normal', 'poisson', 'gamma', 'inverse.gaussian', "
1934 "'binomial']; got (family={})".format(self.family)
1935 )
1936
1937 # Guarantee that self._link_instance is set to an instance of
1938 # class Link
1939 1 16.0 16.0 0.0 if isinstance(self.link, Link):
1940 self._link_instance = self.link
1941 else:
1942 1 5.0 5.0 0.0 if self.link == "auto":
1943 1 7.0 7.0 0.0 if isinstance(self._family_instance, TweedieDistribution):
1944 1 6.0 6.0 0.0 if self._family_instance.power <= 0:
1945 self._link_instance = IdentityLink()
1946 1 5.0 5.0 0.0 if self._family_instance.power >= 1:
1947 1 8.0 8.0 0.0 self._link_instance = LogLink()
1948 elif isinstance(self._family_instance, GeneralizedHyperbolicSecant):
1949 self._link_instance = IdentityLink()
1950 elif isinstance(self._family_instance, BinomialDistribution):
1951 self._link_instance = LogitLink()
1952 else:
1953 raise ValueError(
1954 "No default link known for the "
1955 "specified distribution family. Please "
1956 "set link manually, i.e. not to 'auto'; "
1957 "got (link='auto', family={}".format(self.family)
1958 )
1959 elif self.link == "identity":
1960 self._link_instance = IdentityLink()
1961 elif self.link == "log":
1962 self._link_instance = LogLink()
1963 elif self.link == "logit":
1964 self._link_instance = LogitLink()
1965 else:
1966 raise ValueError(
1967 "The link must be an instance of class Link or "
1968 "an element of ['auto', 'identity', 'log', 'logit']; "
1969 "got (link={})".format(self.link)
1970 )
1971
1972 1 15.0 15.0 0.0 if not isinstance(self.alpha, numbers.Number) or self.alpha < 0:
1973 raise ValueError(
1974 "Penalty term must be a non-negative number;"
1975 " got (alpha={})".format(self.alpha)
1976 )
1977 3 15.0 5.0 0.0 if (
1978 1 5.0 5.0 0.0 not isinstance(self.l1_ratio, numbers.Number)
1979 1 5.0 5.0 0.0 or self.l1_ratio < 0
1980 1 5.0 5.0 0.0 or self.l1_ratio > 1
1981 ):
1982 raise ValueError(
1983 "l1_ratio must be a number in interval [0, 1];"
1984 " got (l1_ratio={})".format(self.l1_ratio)
1985 )
1986 1 6.0 6.0 0.0 if not isinstance(self.fit_intercept, bool):
1987 raise ValueError(
1988 "The argument fit_intercept must be bool;"
1989 " got {}".format(self.fit_intercept)
1990 )
1991 1 5.0 5.0 0.0 if self.solver not in ["auto", "irls", "lbfgs", "newton-cg", "cd"]:
1992 raise ValueError(
1993 "GeneralizedLinearRegressor supports only solvers"
1994 " 'auto', 'irls', 'lbfgs', 'newton-cg' and 'cd';"
1995 " got {}".format(self.solver)
1996 )
1997 1 4.0 4.0 0.0 solver = self.solver
1998 1 5.0 5.0 0.0 if self.solver == "auto":
1999 1 5.0 5.0 0.0 if self.l1_ratio == 0:
2000 solver = "irls"
2001 else:
2002 1 5.0 5.0 0.0 solver = "cd"
2003 1 4.0 4.0 0.0 if self.alpha > 0 and self.l1_ratio > 0 and solver not in ["cd"]:
2004 raise ValueError(
2005 "The chosen solver (solver={}) can't deal "
2006 "with L1 penalties, which are included with "
2007 "(alpha={}) and (l1_ratio={}).".format(
2008 solver, self.alpha, self.l1_ratio
2009 )
2010 )
2011 1 5.0 5.0 0.0 if not isinstance(self.max_iter, int) or self.max_iter <= 0:
2012 raise ValueError(
2013 "Maximum number of iterations must be a positive "
2014 "integer;"
2015 " got (max_iter={!r})".format(self.max_iter)
2016 )
2017 1 6.0 6.0 0.0 if not isinstance(self.tol, numbers.Number) or self.tol <= 0:
2018 raise ValueError(
2019 "Tolerance for stopping criteria must be "
2020 "positive; got (tol={!r})".format(self.tol)
2021 )
2022 1 5.0 5.0 0.0 if not isinstance(self.warm_start, bool):
2023 raise ValueError(
2024 "The argument warm_start must be bool;"
2025 " got {}".format(self.warm_start)
2026 )
2027 1 4.0 4.0 0.0 if self.selection not in ["cyclic", "random"]:
2028 raise ValueError(
2029 "The argument selection must be 'cyclic' or "
2030 "'random'; got (selection={})".format(self.selection)
2031 )
2032 1 316.0 316.0 0.0 random_state = check_random_state(self.random_state)
2033 1 6.0 6.0 0.0 if not isinstance(self.diag_fisher, bool):
2034 raise ValueError(
2035 "The argument diag_fisher must be bool;"
2036 " got {}".format(self.diag_fisher)
2037 )
2038 1 5.0 5.0 0.0 if not isinstance(self.copy_X, bool):
2039 raise ValueError(
2040 "The argument copy_X must be bool; got {}".format(self.copy_X)
2041 )
2042 1 5.0 5.0 0.0 if not isinstance(self.check_input, bool):
2043 raise ValueError(
2044 "The argument check_input must be bool; got "
2045 "(check_input={})".format(self.check_input)
2046 )
2047
2048 1 4.0 4.0 0.0 family = self._family_instance
2049 1 5.0 5.0 0.0 link = self._link_instance
2050
2051 # 1.2 validate arguments of fit #######################################
2052 1 6.0 6.0 0.0 _dtype = [np.float64, np.float32]
2053 1 5.0 5.0 0.0 if solver == "cd":
2054 1 5.0 5.0 0.0 _stype = ["csc"]
2055 else:
2056 _stype = ["csc", "csr"]
2057 2 260252.0 130126.0 3.0 X, y = check_X_y(
2058 1 4.0 4.0 0.0 X,
2059 1 4.0 4.0 0.0 y,
2060 1 5.0 5.0 0.0 accept_sparse=_stype,
2061 1 5.0 5.0 0.0 dtype=_dtype,
2062 1 5.0 5.0 0.0 y_numeric=True,
2063 1 5.0 5.0 0.0 multi_output=False,
2064 1 5.0 5.0 0.0 copy=self.copy_X,
2065 )
2066 # Without converting y to float, deviance might raise
2067 # ValueError: Integers to negative integer powers are not allowed.
2068 # Also, y must not be sparse.
2069 1 10.0 10.0 0.0 y = np.asarray(y, dtype=np.float64)
2070
2071 1 1742.0 1742.0 0.0 weights = _check_weights(sample_weight, y.shape[0])
2072
2073 1 7.0 7.0 0.0 n_samples, n_features = X.shape
2074
2075 # 1.3 arguments to take special care ##################################
2076 # P1, P2, start_params
2077 1 6.0 6.0 0.0 if isinstance(self.P1, str) and self.P1 == "identity":
2078 1 25.0 25.0 0.0 P1 = np.ones(n_features)
2079 else:
2080 P1 = np.atleast_1d(self.P1)
2081 try:
2082 P1 = P1.astype(np.float64, casting="safe", copy=False)
2083 except TypeError:
2084 raise TypeError(
2085 "The given P1 cannot be converted to a numeric "
2086 "array; got (P1.dtype={}).".format(P1.dtype)
2087 )
2088 if (P1.ndim != 1) or (P1.shape[0] != n_features):
2089 raise ValueError(
2090 "P1 must be either 'identity' or a 1d array "
2091 "with the length of X.shape[1]; "
2092 "got (P1.shape[0]={}), "
2093 "needed (X.shape[1]={}).".format(P1.shape[0], n_features)
2094 )
2095 # If X is sparse, make P2 sparse, too.
2096 1 5.0 5.0 0.0 if isinstance(self.P2, str) and self.P2 == "identity":
2097 1 8.0 8.0 0.0 if sparse.issparse(X):
2098 P2 = (
2099 sparse.dia_matrix(
2100 (np.ones(n_features), 0), shape=(n_features, n_features)
2101 )
2102 ).tocsc()
2103 else:
2104 1 10.0 10.0 0.0 P2 = np.ones(n_features)
2105 else:
2106 P2 = check_array(
2107 self.P2, copy=True, accept_sparse=_stype, dtype=_dtype, ensure_2d=False
2108 )
2109 if P2.ndim == 1:
2110 P2 = np.asarray(P2)
2111 if P2.shape[0] != n_features:
2112 raise ValueError(
2113 "P2 should be a 1d array of shape "
2114 "(n_features,) with "
2115 "n_features=X.shape[1]; "
2116 "got (P2.shape=({},)), needed ({},)".format(
2117 P2.shape[0], X.shape[1]
2118 )
2119 )
2120 if sparse.issparse(X):
2121 P2 = (
2122 sparse.dia_matrix((P2, 0), shape=(n_features, n_features))
2123 ).tocsc()
2124 elif (
2125 P2.ndim == 2
2126 and P2.shape[0] == P2.shape[1]
2127 and P2.shape[0] == X.shape[1]
2128 ):
2129 if sparse.issparse(X):
2130 P2 = sparse.csc_matrix(P2)
2131 else:
2132 raise ValueError(
2133 "P2 must be either None or an array of shape "
2134 "(n_features, n_features) with "
2135 "n_features=X.shape[1]; "
2136 "got (P2.shape=({0}, {1})), needed ({2}, {2})".format(
2137 P2.shape[0], P2.shape[1], X.shape[1]
2138 )
2139 )
2140
2141 1 6.0 6.0 0.0 start_params = self.start_params
2142 1 5.0 5.0 0.0 if isinstance(start_params, str):
2143 1 5.0 5.0 0.0 if start_params not in ["guess", "zero"]:
2144 raise ValueError(
2145 "The argument start_params must be 'guess', "
2146 "'zero' or an array of correct length; "
2147 "got (start_params={})".format(start_params)
2148 )
2149 else:
2150 start_params = check_array(
2151 start_params,
2152 accept_sparse=False,
2153 force_all_finite=True,
2154 ensure_2d=False,
2155 dtype=_dtype,
2156 copy=True,
2157 )
2158 if (start_params.shape[0] != X.shape[1] + self.fit_intercept) or (
2159 start_params.ndim != 1
2160 ):
2161 raise ValueError(
2162 "Start values for parameters must have the "
2163 "right length and dimension; required (length"
2164 "={}, ndim=1); got (length={}, ndim={}).".format(
2165 X.shape[1] + self.fit_intercept,
2166 start_params.shape[0],
2167 start_params.ndim,
2168 )
2169 )
2170
2171 1 6.0 6.0 0.0 l1 = self.alpha * self.l1_ratio
2172 1 6.0 6.0 0.0 l2 = self.alpha * (1 - self.l1_ratio)
2173 # P1 and P2 are now for sure copies
2174 1 11.0 11.0 0.0 P1 = l1 * P1
2175 1 6.0 6.0 0.0 P2 = l2 * P2
2176 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2')
2177 # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric
2178 1 5.0 5.0 0.0 if P2.ndim == 2:
2179 if sparse.issparse(P2):
2180 if sparse.isspmatrix_csc(P2):
2181 P2 = 0.5 * (P2 + P2.transpose()).tocsc()
2182 else:
2183 P2 = 0.5 * (P2 + P2.transpose()).tocsr()
2184 else:
2185 P2 = 0.5 * (P2 + P2.T)
2186
2187 # For coordinate descent, if X is sparse, P2 must also be csc
2188 1 5.0 5.0 0.0 if solver == "cd" and sparse.issparse(X):
2189 P2 = sparse.csc_matrix(P2)
2190
2191 # 1.4 additional validations ##########################################
2192 1 5.0 5.0 0.0 if self.check_input:
2193 1 1086.0 1086.0 0.0 if not np.all(family.in_y_range(y)):
2194 raise ValueError(
2195 "Some value(s) of y are out of the valid "
2196 "range for family {}".format(family.__class__.__name__)
2197 )
2198 # check if P1 has only non-negative values, negative values might
2199 # indicate group lasso in the future.
2200 1 6.0 6.0 0.0 if not isinstance(self.P1, str): # if self.P1 != 'identity':
2201 if not np.all(P1 >= 0):
2202 raise ValueError("P1 must not have negative values.")
2203 # check if P2 is positive semidefinite
2204 # np.linalg.cholesky(P2) 'only' asserts positive definite
2205 1 5.0 5.0 0.0 if not isinstance(self.P2, str): # self.P2 != 'identity'
2206 # due to numerical precision, we allow eigenvalues to be a
2207 # tiny bit negative
2208 epsneg = -10 * np.finfo(P2.dtype).epsneg
2209 if P2.ndim == 1 or P2.shape[0] == 1:
2210 p2 = P2
2211 if sparse.issparse(P2):
2212 p2 = P2.toarray()
2213 if not np.all(p2 >= 0):
2214 raise ValueError(
2215 "1d array P2 must not have negative " "values."
2216 )
2217 elif sparse.issparse(P2):
2218 # for sparse matrices, not all eigenvals can be computed
2219 # efficiently, use only half of n_features
2220 # k = how many eigenvals to compute
2221 k = np.min([10, n_features // 10 + 1])
2222 sigma = -1000 * epsneg # start searching near this value
2223 which = "SA" # find smallest algebraic eigenvalues first
2224 eigenvalues = splinalg.eigsh(
2225 P2, k=k, sigma=sigma, which=which, return_eigenvectors=False
2226 )
2227 if not np.all(eigenvalues >= epsneg):
2228 raise ValueError("P2 must be positive semi-definite.")
2229 else:
2230 if not np.all(linalg.eigvalsh(P2) >= epsneg):
2231 raise ValueError("P2 must be positive semi-definite.")
2232 # TODO: if alpha=0 check that X is not rank deficient
2233 # TODO: what else to check?
2234
2235 #######################################################################
2236 # 2a. rescaling of weights (sample_weight) #
2237 #######################################################################
2238 # IMPORTANT NOTE: Since we want to minimize
2239 # 1/(2*sum(sample_weight)) * deviance + L1 + L2,
2240 # deviance = sum(sample_weight * unit_deviance),
2241 # we rescale weights such that sum(weights) = 1 and this becomes
2242 # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance)
2243 1 351.0 351.0 0.0 weights_sum = np.sum(weights)
2244 1 1916.0 1916.0 0.0 weights = weights / weights_sum
2245
2246 #######################################################################
2247 # 2b. potentially rescale predictors
2248 #######################################################################
2249 1 7.0 7.0 0.0 if self.standardize:
2250 1 356277.0 356277.0 4.2 X, P1, P2, col_means, col_stds = _standardize(X, P1, P2, weights)
2251
2252 #######################################################################
2253 # 3. initialization of coef = (intercept_, coef_) #
2254 #######################################################################
2255 # Note: Since phi=self.dispersion_ does not enter the estimation
2256 # of mu_i=E[y_i], set it to 1.
2257
2258 # set start values for coef
2259 1 11.0 11.0 0.0 if self.warm_start and hasattr(self, "coef_"):
2260 coef = self.coef_
2261 intercept = self.intercept_
2262 if self.fit_intercept:
2263 coef = np.concatenate((np.array([intercept]), coef))
2264 if self.standardize:
2265 _standardize_warm_start(coef, col_means, col_stds)
2266 1 9.0 9.0 0.0 elif isinstance(start_params, str):
2267 1 8.0 8.0 0.0 if start_params == "guess":
2268 # Set mu=starting_mu of the family and do one Newton step
2269 # If solver=cd use cd, else irls
2270 1 7813.0 7813.0 0.1 mu = family.starting_mu(y, weights=weights)
2271 1 21668.0 21668.0 0.3 eta = link.link(mu) # linear predictor
2272 1 11.0 11.0 0.0 if solver in ["cd", "lbfgs", "newton-cg"]:
2273 # see function _cd_solver
2274 1 14785.0 14785.0 0.2 sigma_inv = 1 / family.variance(mu, phi=1, weights=weights)
2275 1 17775.0 17775.0 0.2 d1 = link.inverse_derivative(eta)
2276 1 7121.0 7121.0 0.1 temp = sigma_inv * d1 * (y - mu)
2277 1 11.0 11.0 0.0 if self.fit_intercept:
2278 1 11419.0 11419.0 0.1 score = np.concatenate(([temp.sum()], temp @ X))
2279 else:
2280 score = temp @ X # same as X.T @ temp
2281
2282 1 1965.0 1965.0 0.0 d2_sigma_inv = d1 * d1 * sigma_inv
2283 1 9.0 9.0 0.0 diag_fisher = self.diag_fisher
2284 1 7.0 7.0 0.0 if diag_fisher:
2285 fisher = d2_sigma_inv
2286 else:
2287 2 766539.0 383269.5 9.0 fisher = _safe_sandwich_dot(
2288 1 7.0 7.0 0.0 X, d2_sigma_inv, intercept=self.fit_intercept
2289 )
2290 # set up space for search direction d for inner loop
2291 1 9.0 9.0 0.0 if self.fit_intercept:
2292 1 20.0 20.0 0.0 coef = np.zeros(n_features + 1)
2293 else:
2294 coef = np.zeros(n_features)
2295 1 65.0 65.0 0.0 d = np.zeros_like(coef)
2296 # initial stopping tolerance of inner loop
2297 # use L1-norm of minimum of norm of subgradient of F
2298 # use less restrictive tolerance for initial guess
2299 1 122.0 122.0 0.0 inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1)
2300 1 114.0 114.0 0.0 inner_tol = 4 * linalg.norm(inner_tol, ord=1)
2301 # just one outer loop = Newton step
2302 1 9.0 9.0 0.0 n_cycles = 0
2303 2 6689.0 3344.5 0.1 d, coef_P2, n_cycles, inner_tol = _cd_cycle(
2304 1 8.0 8.0 0.0 d,
2305 1 7.0 7.0 0.0 X,
2306 1 7.0 7.0 0.0 coef,
2307 1 8.0 8.0 0.0 score,
2308 1 8.0 8.0 0.0 fisher,
2309 1 8.0 8.0 0.0 P1,
2310 1 8.0 8.0 0.0 P2,
2311 1 8.0 8.0 0.0 n_cycles,
2312 1 8.0 8.0 0.0 inner_tol,
2313 1 8.0 8.0 0.0 max_inner_iter=1000,
2314 1 9.0 9.0 0.0 selection=self.selection,
2315 1 8.0 8.0 0.0 random_state=random_state,
2316 1 8.0 8.0 0.0 diag_fisher=self.diag_fisher,
2317 )
2318 1 13.0 13.0 0.0 coef += d # for simplicity no line search here
2319 else:
2320 # See _irls_solver
2321 # h'(eta)
2322 hp = link.inverse_derivative(eta)
2323 # working weights W, in principle a diagonal matrix
2324 # therefore here just as 1d array
2325 W = hp ** 2 / family.variance(mu, phi=1, weights=weights)
2326 # working observations
2327 z = eta + (y - mu) / hp
2328 # solve A*coef = b
2329 # A = X' W X + l2 P2, b = X' W z
2330 coef = _irls_step(X, W, P2, z, fit_intercept=self.fit_intercept)
2331 else: # start_params == 'zero'
2332 if self.fit_intercept:
2333 coef = np.zeros(n_features + 1)
2334 coef[0] = link.link(np.average(y, weights=weights))
2335 else:
2336 coef = np.zeros(n_features)
2337 else: # assign given array as start values
2338 coef = start_params
2339 if self.standardize:
2340 _standardize_warm_start(coef, col_means, col_stds)
2341
2342 #######################################################################
2343 # 4. fit #
2344 #######################################################################
2345 # algorithms for optimization
2346 # TODO: Parallelize it?
2347
2348 # 4.1 IRLS ############################################################ | |
2349 # Note: we already set P2 = l2*P2, see above | |
2350 # Note: we already symmetrized P2 = 1/2 (P2 + P2') | |
2351 1 8.0 8.0 0.0 if solver == "irls": | |
2352 coef, self.n_iter_ = _irls_solver( | |
2353 coef=coef, | |
2354 X=X, | |
2355 y=y, | |
2356 weights=weights, | |
2357 P2=P2, | |
2358 fit_intercept=self.fit_intercept, | |
2359 family=family, | |
2360 link=link, | |
2361 max_iter=self.max_iter, | |
2362 tol=self.tol, | |
2363 ) | |
2364 | |
2365 # 4.2 L-BFGS ########################################################## | |
2366 1 8.0 8.0 0.0 elif solver == "lbfgs": | |
2367 | |
2368 def func(coef, X, y, weights, P2, family, link): | |
2369 mu, devp = family._mu_deviance_derivative(coef, X, y, weights, link) | |
2370 dev = family.deviance(y, mu, weights) | |
2371 intercept = coef.size == X.shape[1] + 1 | |
2372 idx = 1 if intercept else 0 # offset if coef[0] is intercept | |
2373 if P2.ndim == 1: | |
2374 L2 = P2 * coef[idx:] | |
2375 else: | |
2376 L2 = P2 @ coef[idx:] | |
2377 obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2) | |
2378 objp = 0.5 * devp | |
2379 objp[idx:] += L2 | |
2380 return obj, objp | |
2381 | |
2382 args = (X, y, weights, P2, family, link) | |
2383 coef, loss, info = fmin_l_bfgs_b( | |
2384 func, | |
2385 coef, | |
2386 fprime=None, | |
2387 args=args, | |
2388 iprint=(self.verbose > 0) - 1, | |
2389 pgtol=self.tol, | |
2390 maxiter=self.max_iter, | |
2391 factr=1e3, | |
2392 ) | |
2393 if info["warnflag"] == 1: | |
2394 warnings.warn( | |
2395 "lbfgs failed to converge." " Increase the number of iterations.", | |
2396 ConvergenceWarning, | |
2397 ) | |
2398 elif info["warnflag"] == 2: | |
2399 warnings.warn("lbfgs failed for the reason: {}".format(info["task"])) | |
2400 self.n_iter_ = info["nit"] | |
2401 | |
2402 # 4.3 Newton-CG ####################################################### | |
2403 # We again use the Fisher matrix instead of the Hessian; more | |
2404 # precisely, the expected Hessian of the deviance. | |
2405 1 8.0 8.0 0.0 elif solver == "newton-cg": | |
2406 | |
2407 def func(coef, X, y, weights, P2, family, link): | |
2408 intercept = coef.size == X.shape[1] + 1 | |
2409 idx = 1 if intercept else 0 # offset if coef[0] is intercept | |
2410 if P2.ndim == 1: | |
2411 L2 = coef[idx:] @ (P2 * coef[idx:]) | |
2412 else: | |
2413 L2 = coef[idx:] @ (P2 @ coef[idx:]) | |
2414 mu = link.inverse(_safe_lin_pred(X, coef)) | |
2415 return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2 | |
2416 | |
2417 def grad(coef, X, y, weights, P2, family, link): | |
2418 mu, devp = family._mu_deviance_derivative(coef, X, y, weights, link) | |
2419 intercept = coef.size == X.shape[1] + 1 | |
2420 idx = 1 if intercept else 0 # offset if coef[0] is intercept | |
2421 if P2.ndim == 1: | |
2422 L2 = P2 * coef[idx:] | |
2423 else: | |
2424 L2 = P2 @ coef[idx:] | |
2425 objp = 0.5 * devp | |
2426 objp[idx:] += L2 | |
2427 return objp | |
2428 | |
2429 def grad_hess(coef, X, y, weights, P2, family, link): | |
2430 intercept = coef.size == X.shape[1] + 1 | |
2431 idx = 1 if intercept else 0 # offset if coef[0] is intercept | |
2432 if P2.ndim == 1: | |
2433 L2 = P2 * coef[idx:] | |
2434 else: | |
2435 L2 = P2 @ coef[idx:] | |
2436 eta = _safe_lin_pred(X, coef) | |
2437 mu = link.inverse(eta) | |
2438 d1 = link.inverse_derivative(eta) | |
2439 temp = d1 * family.deviance_derivative(y, mu, weights) | |
2440 if intercept: | |
2441 grad = np.concatenate(([0.5 * temp.sum()], 0.5 * temp @ X + L2)) | |
2442 else: | |
2443 grad = 0.5 * temp @ X + L2 # same as 0.5* X.T @ temp + L2 | |
2444 | |
2445 # expected hessian = fisher = X.T @ diag_matrix @ X | |
2446 # calculate only diag_matrix | |
2447 diag = d1 ** 2 / family.variance(mu, phi=1, weights=weights) | |
2448 if intercept: | |
2449 h0i = np.concatenate(([diag.sum()], diag @ X)) | |
2450 | |
2451 def Hs(coef): | |
2452 # return (0.5 * fisher + P2) @ coef | |
2453 # ret = 0.5 * (X.T @ (diag * (X @ coef))) | |
2454 ret = 0.5 * ((diag * (X @ coef[idx:])) @ X) | |
2455 if P2.ndim == 1: | |
2456 ret += P2 * coef[idx:] | |
2457 else: | |
2458 ret += P2 @ coef[idx:] | |
2459 if intercept: | |
2460 ret = np.concatenate( | |
2461 ([0.5 * (h0i @ coef)], ret + 0.5 * coef[0] * h0i[1:]) | |
2462 ) | |
2463 return ret | |
2464 | |
2465 return grad, Hs | |
2466 | |
2467 args = (X, y, weights, P2, family, link) | |
2468 coef, self.n_iter_ = newton_cg( | |
2469 grad_hess, | |
2470 func, | |
2471 grad, | |
2472 coef, | |
2473 args=args, | |
2474 maxiter=self.max_iter, | |
2475 tol=self.tol, | |
2476 ) | |
2477 | |
2478 # 4.4 coordinate descent ############################################## | |
2479 # Note: we already set P1 = l1*P1, see above | |
2480 # Note: we already set P2 = l2*P2, see above | |
2481 # Note: we already symmetrized P2 = 1/2 (P2 + P2') | |
2482 1 8.0 8.0 0.0 elif solver == "cd": | |
2483 2 6952782.0 3476391.0 81.2 coef, self.n_iter_, self._n_cycles = _cd_solver( | |
2484 1 8.0 8.0 0.0 coef=coef, | |
2485 1 8.0 8.0 0.0 X=X, | |
2486 1 8.0 8.0 0.0 y=y, | |
2487 1 9.0 9.0 0.0 weights=weights, | |
2488 1 9.0 9.0 0.0 P1=P1, | |
2489 1 8.0 8.0 0.0 P2=P2, | |
2490 1 8.0 8.0 0.0 fit_intercept=self.fit_intercept, | |
2491 1 8.0 8.0 0.0 family=family, | |
2492 1 8.0 8.0 0.0 link=link, | |
2493 1 8.0 8.0 0.0 max_iter=self.max_iter, | |
2494 1 8.0 8.0 0.0 tol=self.tol, | |
2495 1 8.0 8.0 0.0 selection=self.selection, | |
2496 1 8.0 8.0 0.0 random_state=random_state, | |
2497 1 8.0 8.0 0.0 diag_fisher=self.diag_fisher, | |
2498 1 8.0 8.0 0.0 copy_X=self.copy_X, | |
2499 ) | |
2500 | |
2501 ####################################################################### | |
2502 # 5a. handle intercept | |
2503 ####################################################################### | |
2504 1 12.0 12.0 0.0 if self.fit_intercept: | |
2505 1 12.0 12.0 0.0 self.intercept_ = coef[0] | |
2506 1 13.0 13.0 0.0 self.coef_ = coef[1:] | |
2507 else: | |
2508 # set intercept to zero as the other linear models do | |
2509 self.intercept_ = 0.0 | |
2510 self.coef_ = coef | |
2511 | |
2512 ####################################################################### | |
2513 # 5b. undo standardization | |
2514 ####################################################################### | |
2515 1 9.0 9.0 0.0 if self.standardize: | |
2516 # TODO: Make sure this doesn't copy X | |
2517 2 126923.0 63461.5 1.5 X, self.intercept_, self.coef_ = _unstandardize( | |
2518 1 9.0 9.0 0.0 X, col_means, col_stds, self.intercept_, self.coef_ | |
2519 ) | |
2520 | |
2521 1 13.0 13.0 0.0 if self.fit_dispersion in ["chisqr", "deviance"]: | |
2522 # attention because of rescaling of weights | |
2523 self.dispersion_ = self.estimate_phi(X, y, weights) * weights_sum | |
2524 | |
2525 1 7.0 7.0 0.0 return self | |
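The listing above ends with step 5b mapping the coefficients fit on standardized data back to the raw scale. A minimal sketch of the algebra this step has to apply, assuming `_unstandardize` performs the usual centering/scaling inversion (the helper's actual signature also returns X and is not shown here):

import numpy as np

# If the model was fit on X_std = (X - col_means) / col_stds with
# parameters (intercept, coef), the equivalent model on raw X is:
def unstandardize(col_means, col_stds, intercept, coef):
    coef = coef / col_stds                    # rescale slopes back to raw units
    intercept = intercept - col_means @ coef  # absorb the centering shift
    return intercept, coef

col_means, col_stds = np.array([1.0, 5.0]), np.array([2.0, 0.5])
print(unstandardize(col_means, col_stds, 0.3, np.array([0.4, -0.2])))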
Dim of data: (1000000, 107) | |
Sparseness: 0.683 | |
Size of data frame: 0.80 GB | |
********************************* | |
alpha=1.0, l1_ratio=0.5, sparse=True | |
Creating sparse matrix... | |
Using new implementation... | |
0.09618600747421668 0 0 | |
0.034404141330196404 1 1 | |
0.011721161752972374 2 2 | |
0.0034913993816849454 3 3 | |
0.0007158709705561006 4 4 | |
5.3700190453114465e-05 5 5 | |
44.3958 seconds | |
gini=0.1111423209994666 | |
mean outcome: 0.04787563746463126 | |
mean prediction: 0.046850885751455344 | |
Wrote profile results to glm_qc.py.lprof | |
Timer unit: 1e-06 s | |
Total time: 42.2249 s | |
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py | |
Function: _safe_sandwich_dot at line 105 | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
105 @profile | |
106 def _safe_sandwich_dot(X, d, intercept=False): | |
107 """Compute sandwich product X.T @ diag(d) @ X. | |
108 | |
109 With ``intercept=True``, X is treated as if a column of ones were appended as the | |
110 first column of X. | |
111 X can be sparse, d must be an ndarray. Always returns a ndarray.""" | |
112 8 44.0 5.5 0.0 if sparse.issparse(X): | |
113 # TODO: There's a bit of room to accelerate this by avoiding the | |
114 # allocation of a new sparse matrix every time we pass through this | |
115 # step. The X.multiply creates a new matrix. Avoiding the allocation | |
116 # and updating the rows in place accelerates this line by ~20%. | |
117 temp = X.transpose() @ X.multiply(d[:, np.newaxis]) | |
118 # for older versions of numpy and scipy, temp may be a np.matrix | |
119 temp = _safe_toarray(temp) | |
120 8 15.0 1.9 0.0 elif type(X) is ColScaledSpMat: | |
121 8 41538672.0 5192334.0 98.4 term1 = X.mat.transpose() @ X.mat.multiply(d[:, np.newaxis]) | |
122 8 341964.0 42745.5 0.8 term2 = X.mat.transpose().dot(d)[:, np.newaxis] * X.shift | |
123 8 34.0 4.2 0.0 term3 = term2.T | |
124 8 2622.0 327.8 0.0 term4 = (X.shift.T * X.shift) * d.sum() | |
125 8 3603.0 450.4 0.0 temp = term1 + term2 + term3 + term4 | |
126 8 86.0 10.8 0.0 temp = _safe_toarray(temp) | |
127 else: | |
128 temp = (X.T * d) @ X | |
129 8 7.0 0.9 0.0 if intercept: | |
130 8 11.0 1.4 0.0 dim = X.shape[1] + 1 | |
131 8 10.0 1.2 0.0 if type(X) is ColScaledSpMat: | |
132 8 6.0 0.8 0.0 order = "F" | |
133 elif sparse.issparse(X): | |
134 order = "F" if sparse.isspmatrix_csc(X) else "C" | |
135 else: | |
136 order = "F" if X.flags["F_CONTIGUOUS"] else "C" | |
137 8 42.0 5.2 0.0 res = np.empty((dim, dim), dtype=max(X.dtype, d.dtype), order=order) | |
138 8 2157.0 269.6 0.0 res[0, 0] = d.sum() | |
139 8 335305.0 41913.1 0.8 res[1:, 0] = d @ X | |
140 8 83.0 10.4 0.0 res[0, 1:] = res[1:, 0] | |
141 8 233.0 29.1 0.0 res[1:, 1:] = temp | |
142 else: | |
143 res = temp | |
144 8 7.0 0.9 0.0 return res | |
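Nearly all of this function's time (98.4%) sits in the `X.mat.transpose() @ X.mat.multiply(...)` line, i.e. the sandwich product itself. A minimal standalone sketch of the operation being profiled, for plain dense and sparse matrices; this is just the mathematical operation, not the ColScaledSpMat fast path or the intercept handling:

import numpy as np
import scipy.sparse as sparse

def sandwich_dot(X, d):
    # X.T @ diag(d) @ X without materializing the n x n diagonal matrix.
    if sparse.issparse(X):
        # X.multiply allocates a fresh sparse matrix on every call, which
        # is exactly the overhead the TODO in the listing points at.
        return np.asarray((X.T @ X.multiply(d[:, np.newaxis])).todense())
    return (X.T * d) @ X  # dense case: scale the rows of X, then one GEMM

rng = np.random.default_rng(0)
X = sparse.random(1000, 20, density=0.3, format="csc", random_state=0)
d = rng.uniform(0.1, 1.0, size=1000)
H = sandwich_dot(X, d)  # (20, 20), symmetric positive semi-definite
assert np.allclose(H, H.T)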
Total time: 37.7536 s | |
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py | |
Function: _eta_mu_score_fisher at line 730 | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
730 @profile | |
731 def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, diag_fisher=False): | |
732 """Compute linear predictor, mean, score function and fisher matrix. | |
733 | |
734 It calculates the linear predictor, the mean, score function | |
735 (derivative of log-likelihood) and Fisher information matrix | |
736 all in one go as function of `coef` (:math:`w`) and the data. | |
737 | |
738 Parameters | |
739 ---------- | |
740 diag_fisher : boolean, optional (default=False) | |
741 If ``True``, returns only an array d such that | |
742 fisher = X.T @ np.diag(d) @ X. | |
743 | |
744 Returns | |
745 ------- | |
746 (eta, mu, score, fisher) : tuple with 4 elements | |
747 The 4 elements are: | |
748 | |
749 * eta: ndarray, shape (X.shape[0],) | |
750 * mu: ndarray, shape (X.shape[0],) | |
751 * score: ndarray, shape (X.shape[0],) | |
752 * fisher: | |
753 | |
754 * If diag_fisher is ``False``, the full fisher matrix, | |
755 an array of shape (X.shape[1], X.shape[1]) | |
756 * If diag_fisher is ``True``, an array of shape (X.shape[0],) | |
757 """ | |
758 7 15.0 2.1 0.0 intercept = coef.size == X.shape[1] + 1 | |
759 # eta = linear predictor | |
760 7 292686.0 41812.3 0.8 eta = _safe_lin_pred(X, coef) | |
761 7 71816.0 10259.4 0.2 mu = link.inverse(eta) | |
762 | |
763 # FOR TWEEDIE: sigma_inv = weights / (mu ** p) during optimization bc phi = 1 | |
764 7 61441.0 8777.3 0.2 sigma_inv = 1.0 / self.variance(mu, phi=phi, weights=weights) | |
765 7 70751.0 10107.3 0.2 d1 = link.inverse_derivative(eta) # = h'(eta) | |
766 # Alternatively: | |
767 # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g | |
768 # d1 = 1./link.derivative(mu) | |
769 7 11318.0 1616.9 0.0 d1_sigma_inv = d1 * sigma_inv | |
770 7 17883.0 2554.7 0.0 temp = d1_sigma_inv * (y - mu) | |
771 7 13.0 1.9 0.0 if intercept: | |
772 7 297231.0 42461.6 0.8 score = np.concatenate(([temp.sum()], temp @ X)) | |
773 else: | |
774 score = temp @ X | |
775 | |
776 7 12631.0 1804.4 0.0 d2_sigma_inv = d1 * d1_sigma_inv | |
777 7 13.0 1.9 0.0 if diag_fisher: | |
778 fisher_matrix = d2_sigma_inv | |
779 else: | |
780 7 36917769.0 5273967.0 97.8 fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, intercept=intercept) | |
781 7 9.0 1.3 0.0 return eta, mu, score, fisher_matrix | |
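For the Poisson family with log link used in this benchmark, the quantities computed here simplify considerably: h(eta) = exp(eta) gives h'(eta) = mu and Var(mu) = mu. A hedged sketch of the same four outputs for that special case, with phi = 1 and no intercept handling (not the fork's implementation, which stays family-generic):

import numpy as np

def poisson_eta_mu_score_fisher(coef, X, y, weights):
    # For Poisson/log-link: sigma_inv * d1 == weights and
    # d1**2 * sigma_inv == weights * mu.
    eta = X @ coef                       # linear predictor
    mu = np.exp(eta)                     # inverse of the log link
    score = (weights * (y - mu)) @ X     # gradient of the log-likelihood
    fisher = (X.T * (weights * mu)) @ X  # expected Hessian (full, not diagonal)
    return eta, mu, score, fisher

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 3))
coef = np.array([0.1, -0.2, 0.05])
y = rng.poisson(np.exp(X @ coef)).astype(float)
w = np.full(50, 1 / 50)  # weights rescaled to sum to 1, as in fit()
eta, mu, score, fisher = poisson_eta_mu_score_fisher(coef, X, y, w)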
Total time: 0.019522 s | |
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py | |
Function: _cd_cycle at line 1131 | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
1131 @profile | |
1132 def _cd_cycle( | |
1133 d, | |
1134 X, | |
1135 coef, | |
1136 score, | |
1137 fisher, | |
1138 P1, | |
1139 P2, | |
1140 n_cycles, | |
1141 inner_tol, | |
1142 max_inner_iter=1000, | |
1143 selection="cyclic", | |
1144 random_state=None, | |
1145 diag_fisher=False, | |
1146 ): | |
1147 """Compute inner loop of coordinate descent, i.e. cycles through features. | |
1148 | |
1149 Minimization of 1-d subproblems:: | |
1150 | |
1151 min_z q(d+z*e_j) - q(d) | |
1152 = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1 | |
1153 | |
1154 A = f'(w) + d*H(w) + (w+d)*P2 | |
1155 B = H+P2 | |
1156 Note: f'=-score and H=fisher are updated at the end of outer iteration. | |
1157 """ | |
1158 # TODO: use sparsity (coefficient already 0 due to L1 penalty) | |
1159 # => active set of features for featurelist, see paper | |
1160 # of Improved GLMNET or Gap Safe Screening Rules | |
1161 # https://arxiv.org/abs/1611.05780 | |
1162 7 15.0 2.1 0.1 n_samples, n_features = X.shape | |
1163 7 17.0 2.4 0.1 intercept = coef.size == X.shape[1] + 1 | |
1164 7 10.0 1.4 0.1 idx = 1 if intercept else 0 # offset if coef[0] is intercept | |
1165 # ES: This is weird. Should this be B = fisher.copy()? Why would you do this? | |
1166 7 7.0 1.0 0.0 B = fisher | |
1167 7 10.0 1.4 0.1 if P2.ndim == 1: | |
1168 coef_P2 = coef[idx:] * P2 | |
1169 if not diag_fisher: | |
1170 idiag = np.arange(start=idx, stop=B.shape[0]) | |
1171 # B[np.diag_indices_from(B)] += P2 | |
1172 B[(idiag, idiag)] += P2 | |
1173 else: | |
1174 7 1828.0 261.1 9.4 coef_P2 = coef[idx:] @ P2 | |
1175 7 12.0 1.7 0.1 if not diag_fisher: | |
1176 7 23.0 3.3 0.1 if sparse.issparse(P2): | |
1177 7 602.0 86.0 3.1 B[idx:, idx:] += P2.toarray() | |
1178 else: | |
1179 B[idx:, idx:] += P2 | |
1180 7 28.0 4.0 0.1 A = -score | |
1181 7 37.0 5.3 0.2 A[idx:] += coef_P2 | |
1182 # A += d @ (H+P2) but so far d=0 | |
1183 # inner loop | |
1184 7 21.0 3.0 0.1 for inner_iter in range(1, max_inner_iter + 1): | |
1185 7 9.0 1.3 0.0 inner_iter += 1 | |
1186 7 7.0 1.0 0.0 n_cycles += 1 | |
1187 # cycle through features, update intercept separately at the end | |
1188 7 8.0 1.1 0.0 if selection == "random": | |
1189 7 152.0 21.7 0.8 featurelist = random_state.permutation(n_features) | |
1190 else: | |
1191 featurelist = np.arange(n_features) | |
1192 756 833.0 1.1 4.3 for j in featurelist: | |
1193 # minimize_z: a z + 1/2 b z^2 + c |d+z| | |
1194 # a = A_j | |
1195 # b = B_jj > 0 | |
1196 # c = |P1_j| = P1_j > 0, see 1.3 | |
1197 # d = w_j + d_j | |
1198 # cf. https://arxiv.org/abs/0708.1485 Eqs. (3) - (4) | |
1199 # with beta = z+d, beta_hat = d-a/b and gamma = c/b | |
1200 # z = 1/b * S(bd-a,c) - d | |
1201 # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding | |
1202 749 962.0 1.3 4.9 jdx = j + idx # index for arrays containing entries for intercept | |
1203 749 851.0 1.1 4.4 a = A[jdx] | |
1204 749 688.0 0.9 3.5 if diag_fisher: | |
1205 # Note: fisher is ndarray of shape (n_samples,) => no idx | |
1206 # Calculate Bj = B[j, :] = B[:, j] as it is needed later anyway | |
1207 Bj = np.zeros_like(A) | |
1208 if intercept: | |
1209 Bj[0] = fisher.sum() | |
1210 if sparse.issparse(X): | |
1211 Bj[idx:] = _safe_toarray( | |
1212 X[:, j].transpose() @ X.multiply(fisher[:, np.newaxis]) | |
1213 ).ravel() | |
1214 else: | |
1215 Bj[idx:] = (fisher * X[:, j]) @ X | |
1216 | |
1217 if P2.ndim == 1: | |
1218 Bj[idx:] += P2[j] | |
1219 else: | |
1220 if sparse.issparse(P2): | |
1221 # slice columns as P2 is csc | |
1222 Bj[idx:] += P2[:, j].toarray().ravel() | |
1223 else: | |
1224 Bj[idx:] += P2[:, j] | |
1225 b = Bj[jdx] | |
1226 else: | |
1227 749 878.0 1.2 4.5 b = B[jdx, jdx] | |
1228 | |
1229 # those ten lines are what it is all about | |
1230 749 913.0 1.2 4.7 if b <= 0: | |
1231 z = 0 | |
1232 749 1028.0 1.4 5.3 elif P1[j] == 0: | |
1233 z = -a / b | |
1234 749 1434.0 1.9 7.3 elif a + P1[j] < b * (coef[jdx] + d[jdx]): | |
1235 z = -(a + P1[j]) / b | |
1236 749 1370.0 1.8 7.0 elif a - P1[j] > b * (coef[jdx] + d[jdx]): | |
1237 z = -(a - P1[j]) / b | |
1238 else: | |
1239 749 1096.0 1.5 5.6 z = -(coef[jdx] + d[jdx]) | |
1240 | |
1241 # update direction d | |
1242 749 1004.0 1.3 5.1 d[jdx] += z | |
1243 # update A because d_j is now d_j+z | |
1244 # A = f'(w) + d*H(w) + (w+d)*P2 | |
1245 # => A += (H+P2)*e_j z = B_j * z | |
1246 # Note: B is symmetric B = B.transpose | |
1247 749 681.0 0.9 3.5 if diag_fisher: | |
1248 # Bj = B[:, j] calculated above, still valid | |
1249 A += Bj * z | |
1250 else: | |
1251 # B is symmetric, C- or F-contiguous, but never sparse | |
1252 749 878.0 1.2 4.5 if B.flags["F_CONTIGUOUS"]: | |
1253 # slice columns like for sparse csc | |
1254 749 3088.0 4.1 15.8 A += B[:, jdx] * z | |
1255 else: # B.flags['C_CONTIGUOUS'] might be true | |
1256 # slice rows | |
1257 A += B[jdx, :] * z | |
1258 # end of cycle over features | |
1259 # update intercept | |
1260 7 7.0 1.0 0.0 if intercept: | |
1261 7 7.0 1.0 0.0 if diag_fisher: | |
1262 Bj = np.zeros_like(A) | |
1263 Bj[0] = fisher.sum() | |
1264 Bj[1:] = fisher @ X | |
1265 b = Bj[0] | |
1266 else: | |
1267 7 13.0 1.9 0.1 b = B[0, 0] | |
1268 7 15.0 2.1 0.1 z = 0 if b <= 0 else -A[0] / b | |
1269 7 11.0 1.6 0.1 d[0] += z | |
1270 7 6.0 0.9 0.0 if diag_fisher: | |
1271 A += Bj * z | |
1272 else: | |
1273 7 8.0 1.1 0.0 if B.flags["F_CONTIGUOUS"]: | |
1274 7 30.0 4.3 0.2 A += B[:, 0] * z | |
1275 else: | |
1276 A += B[0, :] * z | |
1277 # end of complete cycle | |
1278 # stopping criterion for inner loop | |
1279 # sum_i(|minimum of norm of subgrad of q(d)_i|) | |
1280 # subgrad q(d) = A + subgrad ||P1*(w+d)||_1 | |
1281 7 442.0 63.1 2.3 mn_subgrad = _min_norm_sugrad(coef=coef + d, grad=A, P2=None, P1=P1) | |
1282 7 454.0 64.9 2.3 mn_subgrad = linalg.norm(mn_subgrad, ord=1) | |
1283 7 13.0 1.9 0.1 if mn_subgrad <= inner_tol: | |
1284 7 8.0 1.1 0.0 if inner_iter == 1: | |
1285 inner_tol = inner_tol / 4.0 | |
1286 7 9.0 1.3 0.0 break | |
1287 # end of inner loop | |
1288 7 9.0 1.3 0.0 return d, coef_P2, n_cycles, inner_tol | |
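The block flagged above as "those ten lines are what it is all about" is the closed-form minimizer of the 1-d subproblem min_z a z + 1/2 b z^2 + c |w + z|, i.e. soft thresholding. A self-contained sketch of that update and its equivalent thresholding form; the names a, b, c, w follow the comments in the listing (a = A[jdx], b = B[jdx, jdx], c = P1[j], w = coef[jdx] + d[jdx]):

import numpy as np

def cd_update(a, b, c, w):
    # Closed-form minimizer of a*z + 0.5*b*z**2 + c*|w + z| over z.
    if b <= 0:
        return 0.0
    if c == 0:
        return -a / b        # unpenalized coordinate: plain Newton step
    if a + c < b * w:
        return -(a + c) / b  # optimum stays on the positive side of the kink
    if a - c > b * w:
        return -(a - c) / b  # optimum stays on the negative side
    return -w                # otherwise the L1 kink wins: snap w + z to zero

def cd_update_soft(a, b, c, w):
    # Equivalent soft-thresholding form z = S(b*w - a, c)/b - w with
    # S(t, c) = sign(t) * max(|t| - c, 0), cf. the comment block above.
    t = b * w - a
    return np.sign(t) * max(abs(t) - c, 0.0) / b - w

assert np.isclose(cd_update(0.3, 2.0, 0.5, 0.1), cd_update_soft(0.3, 2.0, 0.5, 0.1))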
Total time: 38.1615 s | |
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py | |
Function: _cd_solver at line 1291 | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
1291 @profile | |
1292 def _cd_solver( | |
1293 coef, | |
1294 X, | |
1295 y, | |
1296 weights, | |
1297 P1, | |
1298 P2, | |
1299 fit_intercept, | |
1300 family, | |
1301 link, | |
1302 max_iter=100, | |
1303 max_inner_iter=1000, | |
1304 tol=1e-4, | |
1305 selection="cyclic", | |
1306 random_state=None, | |
1307 diag_fisher=False, | |
1308 copy_X=True, | |
1309 ): | |
1310 """Solve GLM with L1 and L2 penalty by coordinate descent algorithm. | |
1311 | |
1312 The objective being minimized in the coefficients w=coef is:: | |
1313 | |
1314 F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1 | |
1315 | |
1316 Following "An Improved GLMNET for L1-regularized Logistic Regression" (see References): | |
1317 | |
1318 1. Find optimal descent direction d by minimizing | |
1319 min_d F(w+d) = min_d F(w+d) - F(w) | |
1320 2. Quadratic approximation of F(w+d)-F(w) = q(d): | |
1321 using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives: | |
1322 q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d | |
1323 + ||P1*(w+d)||_1 - ||P1*w||_1 | |
1324 Then minimize q(d): min_d q(d) | |
1325 3. Coordinate descent by updating coordinate j (d -> d+z*e_j): | |
1326 min_z q(d+z*e_j) | |
1327 = min_z q(d+z*e_j) - q(d) | |
1328 = min_z A_j z + 1/2 B_jj z^2 | |
1329 + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 | |
1330 A = f'(w) + d*H(w) + (w+d)*P2 | |
1331 B = H + P2 | |
1332 | |
1333 Repeat steps 1-3 until convergence. | |
1334 Note: Use Fisher matrix instead of Hessian for H. | |
1335 Note: f' = -score, H = Fisher matrix | |
1336 | |
1337 Parameters | |
1338 ---------- | |
1339 coef : ndarray, shape (c,) | |
1340 If fit_intercept=False, shape c=X.shape[1]. | |
1341 If fit_intercept=True, then c=X.shape[1] + 1. | |
1342 | |
1343 X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) | |
1344 Training data (with intercept included if present). If not sparse, | |
1345 pass directly as Fortran-contiguous data to avoid | |
1346 unnecessary memory duplication. | |
1347 | |
1348 y : ndarray, shape (n_samples,) | |
1349 Target values. | |
1350 | |
1351 weights: ndarray, shape (n_samples,) | |
1352 Sample weights with which the deviance is weighted. The weights must | |
1353 be normalized and sum to 1. | |
1354 | |
1355 P1 : {ndarray}, shape (n_features,) | |
1356 The L1-penalty vector (=diagonal matrix) | |
1357 | |
1358 P2 : {ndarray, csc sparse matrix}, shape (n_features, n_features) | |
1359 The L2-penalty matrix or vector (=diagonal matrix). If a matrix is | |
1360 passed, it must be symmetric. If X is sparse, P2 must also be sparse. | |
1361 | |
1362 fit_intercept : boolean, optional (default=True) | |
1363 Specifies if a constant (a.k.a. bias or intercept) should be | |
1364 added to the linear predictor (X*coef+intercept). | |
1365 | |
1366 family : ExponentialDispersionModel | |
1367 | |
1368 link : Link | |
1369 | |
1370 max_iter : int, optional (default=100) | |
1371 Maximum number of outer (Newton) iterations. | |
1372 | |
1373 max_inner_iter : int, optional (default=1000) | |
1374 Maximum number of iterations in each inner loop, i.e. max number of | |
1375 cycles over all features per inner loop. | |
1376 | |
1377 tol : float, optional (default=1e-4) | |
1378 Convergence criterion is | |
1379 sum_i(|minimum of norm of subgrad of objective_i|)<=tol. | |
1380 | |
1381 selection : str, optional (default='cyclic') | |
1382 If 'random', randomly choose features in the inner loop. | |
1383 | |
1384 random_state : {int, RandomState instance, None}, optional (default=None) | |
1385 | |
1386 diag_fisher : boolean, optional (default=False) | |
1387 ``False`` calculates full fisher matrix, ``True`` only diagonal matrix | |
1388 s.t. fisher = X.T @ diag @ X. This saves storage but needs more | |
1389 matrix-vector multiplications. | |
1390 | |
1391 copy_X : boolean, optional (default=True) | |
1392 If ``True``, X will be copied; else, it may be overwritten. | |
1393 | |
1394 Returns | |
1395 ------- | |
1396 coef : ndarray, shape (c,) | |
1397 If fit_intercept=False, shape c=X.shape[1]. | |
1398 If fit_intercept=True, then c=X.shape[1] + 1. | |
1399 | |
1400 n_iter : number of outer iterations = newton iterations | |
1401 | |
1402 n_cycles : number of cycles over features | |
1403 | |
1404 References | |
1405 ---------- | |
1406 Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin | |
1407 An Improved GLMNET for L1-regularized Logistic Regression, | |
1408 Journal of Machine Learning Research 13 (2012) 1999-2030 | |
1409 https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf | |
1410 """ | |
1411 1 3.0 3.0 0.0 if type(X) is not ColScaledSpMat: | |
1412 # TODO: because copy_X is being passed here and is also being passed in | |
1413 # check_X_y in fit(...), X is being copied twice. Check if this is true. | |
1414 X = check_array( | |
1415 X, "csc", dtype=[np.float64, np.float32], order="F", copy=copy_X | |
1416 ) | |
1417 1 3.0 3.0 0.0 if P2.ndim == 2: | |
1418 2 457.0 228.5 0.0 P2 = check_array( | |
1419 1 4.0 4.0 0.0 P2, "csc", dtype=[np.float64, np.float32], order="F", copy=copy_X | |
1420 ) | |
1421 1 5.0 5.0 0.0 if sparse.issparse(X): | |
1422 if not sparse.isspmatrix_csc(P2): | |
1423 raise ValueError( | |
1424 "If X is sparse, P2 must also be sparse csc" | |
1425 "format. Got P2 not sparse." | |
1426 ) | |
1427 1 12.0 12.0 0.0 random_state = check_random_state(random_state) | |
1428 # Note: we already set P2 = l2*P2, P1 = l1*P1 | |
1429 # Note: we already symmetrized P2 = 1/2 (P2 + P2') | |
1430 1 3.0 3.0 0.0 n_iter = 0 # number of outer iterations | |
1431 1 2.0 2.0 0.0 n_cycles = 0 # number of (complete) cycles over features | |
1432 1 1.0 1.0 0.0 converged = False | |
1433 1 2.0 2.0 0.0 n_samples, n_features = X.shape | |
1434 1 2.0 2.0 0.0 idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept | |
1435 # line search parameters | |
1436 1 2.0 2.0 0.0 (beta, sigma) = (0.5, 0.01) | |
1437 # some precalculations | |
1438 # Note: For diag_fisher=False, fisher = X.T @ fisher @ X and fisher is a | |
1439 # 1d array representing a diagonal matrix. | |
1440 2 5409710.0 2704855.0 14.2 eta, mu, score, fisher = family._eta_mu_score_fisher( | |
1441 1 2.0 2.0 0.0 coef=coef, phi=1, X=X, y=y, weights=weights, link=link, diag_fisher=diag_fisher | |
1442 ) | |
1443 # set up space for search direction d for inner loop | |
1444 1 31.0 31.0 0.0 d = np.zeros_like(coef) | |
1445 | |
1446 # minimum subgradient norm | |
1447 1 3.0 3.0 0.0 def calc_mn_subgrad_norm(): | |
1448 return linalg.norm( | |
1449 _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1), ord=1 | |
1450 ) | |
1451 | |
1452 # the ratio of inner _cd_cycle tolerance to the minimum subgradient norm | |
1453 # This wasn't explored in the newGLMNET paper linked above. | |
1454 # That paper essentially uses inner_tol_ratio = 1.0, but using a slightly | |
1455 # lower value is much faster. | |
1456 # By comparison, the original GLMNET paper uses inner_tol = tol. | |
1457 # So, inner_tol_ratio < 1 is sort of a compromise between the two papers. | |
1458 # The value should probably be between 0.01 and 0.5. 0.1 works well for many problems | |
1459 1 2.0 2.0 0.0 inner_tol_ratio = 0.1 | |
1460 | |
1461 1 2.0 2.0 0.0 def calc_inner_tol(mn_subgrad_norm): | |
1462 # Another potential rule limits the inner tol to be no smaller than tol | |
1463 # return max(mn_subgrad_norm * inner_tol_ratio, tol) | |
1464 return mn_subgrad_norm * inner_tol_ratio | |
1465 | |
1466 # initial stopping tolerance of inner loop | |
1467 # use L1-norm of minimum of norm of subgradient of F | |
1468 1 417.0 417.0 0.0 inner_tol = calc_inner_tol(calc_mn_subgrad_norm()) | |
1469 | |
1470 1 2.0 2.0 0.0 Fw = None | |
1471 | |
1472 # outer loop | |
1473 6 12.0 2.0 0.0 while n_iter < max_iter: | |
1474 6 2301.0 383.5 0.0 print(inner_tol, n_iter, n_cycles) | |
1475 6 19.0 3.2 0.0 n_iter += 1 | |
1476 # initialize search direction d (to be optimized) with zero | |
1477 6 35.0 5.8 0.0 d.fill(0) | |
1478 # inner loop = _cd_cycle | |
1479 12 27764.0 2313.7 0.1 d, coef_P2, n_cycles, inner_tol = _cd_cycle( | |
1480 6 13.0 2.2 0.0 d, | |
1481 6 13.0 2.2 0.0 X, | |
1482 6 14.0 2.3 0.0 coef, | |
1483 6 13.0 2.2 0.0 score, | |
1484 6 13.0 2.2 0.0 fisher, | |
1485 6 14.0 2.3 0.0 P1, | |
1486 6 13.0 2.2 0.0 P2, | |
1487 6 13.0 2.2 0.0 n_cycles, | |
1488 6 13.0 2.2 0.0 inner_tol, | |
1489 6 15.0 2.5 0.0 max_inner_iter=max_inner_iter, | |
1490 6 15.0 2.5 0.0 selection=selection, | |
1491 6 15.0 2.5 0.0 random_state=random_state, | |
1492 6 13.0 2.2 0.0 diag_fisher=diag_fisher, | |
1493 ) | |
1494 | |
1495 # line search by sequence beta^k, k=0, 1, .. | |
1496 # F(w + lambda d) - F(w) <= lambda * bound | |
1497 # bound = sigma * (f'(w)*d + w*P2*d | |
1498 # +||P1 (w+d)||_1 - ||P1 w||_1) | |
1499 6 247.0 41.2 0.0 P1w_1 = linalg.norm(P1 * coef[idx:], ord=1) | |
1500 6 202.0 33.7 0.0 P1wd_1 = linalg.norm(P1 * (coef + d)[idx:], ord=1) | |
1501 # Note: coef_P2 already calculated and still valid | |
1502 6 86.0 14.3 0.0 bound = sigma * (-(score @ d) + coef_P2 @ d[idx:] + P1wd_1 - P1w_1) | |
1503 | |
1504 # In the first iteration, we must compute Fw explicitly. | |
1505 # In later iterations, we just use Fwd from the previous iteration | |
1506 # as set after the line search loop below. | |
1507 6 12.0 2.0 0.0 if Fw is None: | |
1508 1 2.0 2.0 0.0 Fw = ( | |
1509 3 8018.0 2672.7 0.0 0.5 * family.deviance(y, mu, weights) | |
1510 1 17.0 17.0 0.0 + 0.5 * (coef_P2 @ coef[idx:]) | |
1511 1 2.0 2.0 0.0 + P1w_1 | |
1512 ) | |
1513 | |
1514 6 25.0 4.2 0.0 la = 1.0 / beta | |
1515 | |
1516 6 247723.0 41287.2 0.6 X_dot_d = _safe_lin_pred(X, d) | |
1517 | |
1518 # Try progressively shorter line search steps. | |
1519 6 49.0 8.2 0.0 for k in range(20): | |
1520 6 19.0 3.2 0.0 la *= beta # starts with la=1 | |
1521 6 76.0 12.7 0.0 coef_wd = coef + la * d | |
1522 | |
1523 # The simple version of the next line is: | |
1524 # mu_wd = link.inverse(_safe_lin_pred(X, coef_wd)) | |
1525 # but because coef_wd can be factored as | |
1526 # coef_wd = coef + la * d | |
1527 # we can rewrite to only perform one dot product with the data | |
1528 # matrix per loop which is substantially faster | |
1529 6 69558.0 11593.0 0.2 mu_wd = link.inverse(eta + la * X_dot_d) | |
1530 | |
1531 # TODO - optimize: for Tweedie that isn't one of the special cases | |
1532 # (gaussian, poisson, gamma), family.deviance is quite slow! Can we | |
1533 # fix that somehow? | |
1534 12 46967.0 3913.9 0.1 Fwd = 0.5 * family.deviance(y, mu_wd, weights) + linalg.norm( | |
1535 6 61.0 10.2 0.0 P1 * coef_wd[idx:], ord=1 | |
1536 ) | |
1537 6 20.0 3.3 0.0 if P2.ndim == 1: | |
1538 Fwd += 0.5 * ((coef_wd[idx:] * P2) @ coef_wd[idx:]) | |
1539 else: | |
1540 6 514.0 85.7 0.0 Fwd += 0.5 * (coef_wd[idx:] @ (P2 @ coef_wd[idx:])) | |
1541 6 23.0 3.8 0.0 if Fwd - Fw <= sigma * la * bound: | |
1542 6 12.0 2.0 0.0 break | |
1543 | |
1544 # Fw in the next iteration will be equal to Fwd this iteration. | |
1545 6 13.0 2.2 0.0 Fw = Fwd | |
1546 | |
1547 # update coefficients | |
1548 6 77.0 12.8 0.0 coef += la * d | |
1549 | |
1550 # calculate eta, mu, score, Fisher matrix for next iteration | |
1551 12 32344270.0 2695355.8 84.8 eta, mu, score, fisher = family._eta_mu_score_fisher( | |
1552 6 12.0 2.0 0.0 coef=coef, | |
1553 6 11.0 1.8 0.0 phi=1, | |
1554 6 12.0 2.0 0.0 X=X, | |
1555 6 12.0 2.0 0.0 y=y, | |
1556 6 12.0 2.0 0.0 weights=weights, | |
1557 6 12.0 2.0 0.0 link=link, | |
1558 6 12.0 2.0 0.0 diag_fisher=diag_fisher, | |
1559 ) | |
1560 | |
1561 # stopping criterion for outer loop | |
1562 # sum_i(|minimum-norm of subgrad of F(w)_i|) | |
1563 # fp_wP2 = f'(w) + w*P2 | |
1564 # Note: eta, mu and score are already updated | |
1565 # this also updates the inner tolerance for the next loop! | |
1566 6 2366.0 394.3 0.0 mn_subgrad_norm = calc_mn_subgrad_norm() | |
1567 6 24.0 4.0 0.0 if mn_subgrad_norm <= tol: | |
1568 1 2.0 2.0 0.0 converged = True | |
1569 1 2.0 2.0 0.0 break | |
1570 | |
1571 5 18.0 3.6 0.0 inner_tol = calc_inner_tol(mn_subgrad_norm) | |
1572 # end of outer loop | |
1573 | |
1574 1 2.0 2.0 0.0 if not converged: | |
1575 warnings.warn( | |
1576 "Coordinate descent failed to converge. Increase" | |
1577 " the maximum number of iterations max_iter" | |
1578 " (currently {})".format(max_iter), | |
1579 ConvergenceWarning, | |
1580 ) | |
1581 | |
1582 1 2.0 2.0 0.0 return coef, n_iter, n_cycles | |
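The outer loop above is a proximal Newton method: each iteration builds a quadratic model, solves it by coordinate descent, and then backtracks along the resulting direction. A minimal sketch of that Armijo-style line search on a toy penalized objective; here sigma is applied once in the acceptance test, whereas the listing folds it into `bound` as well:

import numpy as np

def backtracking_step(F, w, d, bound, beta=0.5, sigma=0.01, max_halvings=20):
    # Accept the largest step la in {1, beta, beta**2, ...} satisfying
    # F(w + la*d) - F(w) <= sigma * la * bound, where
    # bound = f'(w) @ d + w @ P2 @ d + ||P1*(w+d)||_1 - ||P1*w||_1
    # (negative for a descent direction d).
    Fw = F(w)
    la = 1.0
    for _ in range(max_halvings):
        if F(w + la * d) - Fw <= sigma * la * bound:
            break
        la *= beta
    return la

# Toy penalized objective F(w) = 0.5*||w - 2||^2 + ||w||_1, minimized at w = 1.
F = lambda w: 0.5 * np.sum((w - 2.0) ** 2) + np.sum(np.abs(w))
w, d = np.zeros(3), np.ones(3)                  # step from 0 toward the optimum
bound = (w - 2.0) @ d + np.sum(np.abs(w + d)) - np.sum(np.abs(w))
print(backtracking_step(F, w, d, bound))        # accepts la = 1.0 here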
Total time: 44.3836 s | |
File: /home/X916841/code/glm_benchmarks/src/glm_benchmarks/sklearn_fork/_glm.py | |
Function: fit at line 1884 | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
1884 @profile | |
1885 def fit(self, X, y, sample_weight=None): | |
1886 """Fit a Generalized Linear Model. | |
1887 | |
1888 Parameters | |
1889 ---------- | |
1890 X : {array-like, sparse matrix}, shape (n_samples, n_features) | |
1891 Training data. | |
1892 | |
1893 y : array-like, shape (n_samples,) | |
1894 Target values. | |
1895 | |
1896 sample_weight : {None, array-like}, shape (n_samples,),\ | |
1897 optional (default=None) | |
1898 Individual weights w_i for each sample. Note that for an | |
1899 Exponential Dispersion Model (EDM), one has | |
1900 Var[Y_i]=phi/w_i * v(mu). | |
1901 If Y_i ~ EDM(mu, phi/w_i), then | |
1902 sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a | |
1903 weighted average with weights=sample_weight. | |
1904 | |
1905 Returns | |
1906 ------- | |
1907 self : returns an instance of self. | |
1908 """ | |
1909 # 99.8% of time is in the _cd_solver function | |
1910 ####################################################################### | |
1911 # 1. input validation # | |
1912 ####################################################################### | |
1913 # 1.1 validate arguments of __init__ ################################## | |
1914 # Guarantee that self._family_instance is an instance of class | |
1915 # ExponentialDispersionModel | |
1916 1 57.0 57.0 0.0 if isinstance(self.family, ExponentialDispersionModel): | |
1917 self._family_instance = self.family | |
1918 else: | |
1919 1 5.0 5.0 0.0 if self.family == "normal": | |
1920 self._family_instance = NormalDistribution() | |
1921 1 5.0 5.0 0.0 elif self.family == "poisson": | |
1922 1 25.0 25.0 0.0 self._family_instance = PoissonDistribution() | |
1923 elif self.family == "gamma": | |
1924 self._family_instance = GammaDistribution() | |
1925 elif self.family == "inverse.gaussian": | |
1926 self._family_instance = InverseGaussianDistribution() | |
1927 elif self.family == "binomial": | |
1928 self._family_instance = BinomialDistribution() | |
1929 else: | |
1930 raise ValueError( | |
1931 "The family must be an instance of class" | |
1932 " ExponentialDispersionModel or an element of" | |
1933 " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " | |
1934 "'binomial']; got (family={})".format(self.family) | |
1935 ) | |
1936 | |
1937 # Guarantee that self._link_instance is set to an instance of | |
1938 # class Link | |
1939 1 15.0 15.0 0.0 if isinstance(self.link, Link): | |
1940 self._link_instance = self.link | |
1941 else: | |
1942 1 5.0 5.0 0.0 if self.link == "auto": | |
1943 1 7.0 7.0 0.0 if isinstance(self._family_instance, TweedieDistribution): | |
1944 1 6.0 6.0 0.0 if self._family_instance.power <= 0: | |
1945 self._link_instance = IdentityLink() | |
1946 1 5.0 5.0 0.0 if self._family_instance.power >= 1: | |
1947 1 8.0 8.0 0.0 self._link_instance = LogLink() | |
1948 elif isinstance(self._family_instance, GeneralizedHyperbolicSecant): | |
1949 self._link_instance = IdentityLink() | |
1950 elif isinstance(self._family_instance, BinomialDistribution): | |
1951 self._link_instance = LogitLink() | |
1952 else: | |
1953 raise ValueError( | |
1954 "No default link known for the " | |
1955 "specified distribution family. Please " | |
1956 "set link manually, i.e. not to 'auto'; " | |
1957 "got (link='auto', family={}".format(self.family) | |
1958 ) | |
1959 elif self.link == "identity": | |
1960 self._link_instance = IdentityLink() | |
1961 elif self.link == "log": | |
1962 self._link_instance = LogLink() | |
1963 elif self.link == "logit": | |
1964 self._link_instance = LogitLink() | |
1965 else: | |
1966 raise ValueError( | |
1967 "The link must be an instance of class Link or " | |
1968 "an element of ['auto', 'identity', 'log', 'logit']; " | |
1969 "got (link={})".format(self.link) | |
1970 ) | |
1971 | |
1972 1 14.0 14.0 0.0 if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: | |
1973 raise ValueError( | |
1974 "Penalty term must be a non-negative number;" | |
1975 " got (alpha={})".format(self.alpha) | |
1976 ) | |
1977 3 15.0 5.0 0.0 if ( | |
1978 1 5.0 5.0 0.0 not isinstance(self.l1_ratio, numbers.Number) | |
1979 1 5.0 5.0 0.0 or self.l1_ratio < 0 | |
1980 1 5.0 5.0 0.0 or self.l1_ratio > 1 | |
1981 ): | |
1982 raise ValueError( | |
1983 "l1_ratio must be a number in interval [0, 1];" | |
1984 " got (l1_ratio={})".format(self.l1_ratio) | |
1985 ) | |
1986 1 5.0 5.0 0.0 if not isinstance(self.fit_intercept, bool): | |
1987 raise ValueError( | |
1988 "The argument fit_intercept must be bool;" | |
1989 " got {}".format(self.fit_intercept) | |
1990 ) | |
1991 1 5.0 5.0 0.0 if self.solver not in ["auto", "irls", "lbfgs", "newton-cg", "cd"]: | |
1992 raise ValueError( | |
1993 "GeneralizedLinearRegressor supports only solvers" | |
1994 " 'auto', 'irls', 'lbfgs', 'newton-cg' and 'cd';" | |
1995 " got {}".format(self.solver) | |
1996 ) | |
1997 1 4.0 4.0 0.0 solver = self.solver | |
1998 1 5.0 5.0 0.0 if self.solver == "auto": | |
1999 1 5.0 5.0 0.0 if self.l1_ratio == 0: | |
2000 solver = "irls" | |
2001 else: | |
2002 1 5.0 5.0 0.0 solver = "cd" | |
2003 1 5.0 5.0 0.0 if self.alpha > 0 and self.l1_ratio > 0 and solver not in ["cd"]: | |
2004 raise ValueError( | |
2005 "The chosen solver (solver={}) can't deal " | |
2006 "with L1 penalties, which are included with " | |
2007 "(alpha={}) and (l1_ratio={}).".format( | |
2008 solver, self.alpha, self.l1_ratio | |
2009 ) | |
2010 ) | |
2011 1 6.0 6.0 0.0 if not isinstance(self.max_iter, int) or self.max_iter <= 0: | |
2012 raise ValueError( | |
2013 "Maximum number of iteration must be a positive " | |
2014 "integer;" | |
2015 " got (max_iter={!r})".format(self.max_iter) | |
2016 ) | |
2017 1 6.0 6.0 0.0 if not isinstance(self.tol, numbers.Number) or self.tol <= 0: | |
2018 raise ValueError( | |
2019 "Tolerance for stopping criteria must be " | |
2020 "positive; got (tol={!r})".format(self.tol) | |
2021 ) | |
2022 1 5.0 5.0 0.0 if not isinstance(self.warm_start, bool): | |
2023 raise ValueError( | |
2024 "The argument warm_start must be bool;" | |
2025 " got {}".format(self.warm_start) | |
2026 ) | |
2027 1 4.0 4.0 0.0 if self.selection not in ["cyclic", "random"]: | |
2028 raise ValueError( | |
2029 "The argument selection must be 'cyclic' or " | |
2030 "'random'; got (selection={})".format(self.selection) | |
2031 ) | |
2032 1 379.0 379.0 0.0 random_state = check_random_state(self.random_state) | |
2033 1 6.0 6.0 0.0 if not isinstance(self.diag_fisher, bool): | |
2034 raise ValueError( | |
2035 "The argument diag_fisher must be bool;" | |
2036 " got {}".format(self.diag_fisher) | |
2037 ) | |
2038 1 5.0 5.0 0.0 if not isinstance(self.copy_X, bool): | |
2039 raise ValueError( | |
2040 "The argument copy_X must be bool;" " got {}".format(self.copy_X) | |
2041 ) | |
2042 1 5.0 5.0 0.0 if not isinstance(self.check_input, bool): | |
2043 raise ValueError( | |
2044 "The argument check_input must be bool; got " | |
2045 "(check_input={})".format(self.check_input) | |
2046 ) | |
2047 | |
2048 1 5.0 5.0 0.0 family = self._family_instance | |
2049 1 5.0 5.0 0.0 link = self._link_instance | |
2050 | |
2051 # 1.2 validate arguments of fit ####################################### | |
2052 1 6.0 6.0 0.0 _dtype = [np.float64, np.float32] | |
2053 1 5.0 5.0 0.0 if solver == "cd": | |
2054 1 4.0 4.0 0.0 _stype = ["csc"] | |
2055 else: | |
2056 _stype = ["csc", "csr"] | |
2057 2 122584.0 61292.0 0.3 X, y = check_X_y( | |
2058 1 4.0 4.0 0.0 X, | |
2059 1 5.0 5.0 0.0 y, | |
2060 1 5.0 5.0 0.0 accept_sparse=_stype, | |
2061 1 4.0 4.0 0.0 dtype=_dtype, | |
2062 1 5.0 5.0 0.0 y_numeric=True, | |
2063 1 4.0 4.0 0.0 multi_output=False, | |
2064 1 5.0 5.0 0.0 copy=self.copy_X, | |
2065 ) | |
2066 # Without converting y to float, deviance might raise | |
2067 # ValueError: Integers to negative integer powers are not allowed. | |
2068 # Also, y must not be sparse. | |
2069 1 10.0 10.0 0.0 y = np.asarray(y, dtype=np.float64) | |
2070 | |
2071 1 1781.0 1781.0 0.0 weights = _check_weights(sample_weight, y.shape[0]) | |
2072 | |
2073 1 9.0 9.0 0.0 n_samples, n_features = X.shape | |
2074 | |
2075 # 1.3 arguments to take special care ################################## | |
2076 # P1, P2, start_params | |
2077 1 7.0 7.0 0.0 if isinstance(self.P1, str) and self.P1 == "identity": | |
2078 1 26.0 26.0 0.0 P1 = np.ones(n_features) | |
2079 else: | |
2080 P1 = np.atleast_1d(self.P1) | |
2081 try: | |
2082 P1 = P1.astype(np.float64, casting="safe", copy=False) | |
2083 except TypeError: | |
2084 raise TypeError( | |
2085 "The given P1 cannot be converted to a numeric" | |
2086 "array; got (P1.dtype={}).".format(P1.dtype) | |
2087 ) | |
2088 if (P1.ndim != 1) or (P1.shape[0] != n_features): | |
2089 raise ValueError( | |
2090 "P1 must be either 'identity' or a 1d array " | |
2091 "with the length of X.shape[1]; " | |
2092 "got (P1.shape[0]={}), " | |
2093 "needed (X.shape[1]={}).".format(P1.shape[0], n_features) | |
2094 ) | |
2095 # If X is sparse, make P2 sparse, too. | |
2096 1 5.0 5.0 0.0 if isinstance(self.P2, str) and self.P2 == "identity": | |
2097 1 7.0 7.0 0.0 if sparse.issparse(X): | |
2098 1 281.0 281.0 0.0 P2 = ( | |
2099 2 173.0 86.5 0.0 sparse.dia_matrix( | |
2100 1 10.0 10.0 0.0 (np.ones(n_features), 0), shape=(n_features, n_features) | |
2101 ) | |
2102 ).tocsc() | |
2103 else: | |
2104 P2 = np.ones(n_features) | |
2105 else: | |
2106 P2 = check_array( | |
2107 self.P2, copy=True, accept_sparse=_stype, dtype=_dtype, ensure_2d=False | |
2108 ) | |
2109 if P2.ndim == 1: | |
2110 P2 = np.asarray(P2) | |
2111 if P2.shape[0] != n_features: | |
2112 raise ValueError( | |
2113 "P2 should be a 1d array of shape " | |
2114 "(n_features,) with " | |
2115 "n_features=X.shape[1]; " | |
2116 "got (P2.shape=({},)), needed ({},)".format( | |
2117 P2.shape[0], X.shape[1] | |
2118 ) | |
2119 ) | |
2120 if sparse.issparse(X): | |
2121 P2 = ( | |
2122 sparse.dia_matrix((P2, 0), shape=(n_features, n_features)) | |
2123 ).tocsc() | |
2124 elif ( | |
2125 P2.ndim == 2 | |
2126 and P2.shape[0] == P2.shape[1] | |
2127 and P2.shape[0] == X.shape[1] | |
2128 ): | |
2129 if sparse.issparse(X): | |
2130 P2 = sparse.csc_matrix(P2) | |
2131 else: | |
2132 raise ValueError( | |
2133 "P2 must be either None or an array of shape " | |
2134 "(n_features, n_features) with " | |
2135 "n_features=X.shape[1]; " | |
2136 "got (P2.shape=({0}, {1})), needed ({2}, {2})".format( | |
2137 P2.shape[0], P2.shape[1], X.shape[1] | |
2138 ) | |
2139 ) | |
2140 | |
2141 1 6.0 6.0 0.0 start_params = self.start_params | |
2142 1 6.0 6.0 0.0 if isinstance(start_params, str): | |
2143 1 5.0 5.0 0.0 if start_params not in ["guess", "zero"]: | |
2144 raise ValueError( | |
2145 "The argument start_params must be 'guess', " | |
2146 "'zero' or an array of correct length; " | |
2147 "got(start_params={})".format(start_params) | |
2148 ) | |
2149 else: | |
2150 start_params = check_array( | |
2151 start_params, | |
2152 accept_sparse=False, | |
2153 force_all_finite=True, | |
2154 ensure_2d=False, | |
2155 dtype=_dtype, | |
2156 copy=True, | |
2157 ) | |
2158 if (start_params.shape[0] != X.shape[1] + self.fit_intercept) or ( | |
2159 start_params.ndim != 1 | |
2160 ): | |
2161 raise ValueError( | |
2162 "Start values for parameters must have the" | |
2163 "right length and dimension; required (length" | |
2164 "={}, ndim=1); got (length={}, ndim={}).".format( | |
2165 X.shape[1] + self.fit_intercept, | |
2166 start_params.shape[0], | |
2167 start_params.ndim, | |
2168 ) | |
2169 ) | |
2170 | |
2171 1 5.0 5.0 0.0 l1 = self.alpha * self.l1_ratio | |
2172 1 6.0 6.0 0.0 l2 = self.alpha * (1 - self.l1_ratio) | |
2173 # P1 and P2 are now for sure copies | |
2174 1 8.0 8.0 0.0 P1 = l1 * P1 | |
2175 1 146.0 146.0 0.0 P2 = l2 * P2 | |
2176 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') | |
2177 # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric | |
2178 1 5.0 5.0 0.0 if P2.ndim == 2: | |
2179 1 6.0 6.0 0.0 if sparse.issparse(P2): | |
2180 1 6.0 6.0 0.0 if sparse.isspmatrix_csc(P2): | |
2181 1 811.0 811.0 0.0 P2 = 0.5 * (P2 + P2.transpose()).tocsc() | |
2182 else: | |
2183 P2 = 0.5 * (P2 + P2.transpose()).tocsr() | |
2184 else: | |
2185 P2 = 0.5 * (P2 + P2.T) | |
2186 | |
2187 # For coordinate descent, if X is sparse, P2 must also be csc | |
2188 1 7.0 7.0 0.0 if solver == "cd" and sparse.issparse(X): | |
2189 1 80.0 80.0 0.0 P2 = sparse.csc_matrix(P2) | |
2190 | |
2191 # 1.4 additional validations ########################################## | |
2192 1 5.0 5.0 0.0 if self.check_input: | |
2193 1 1319.0 1319.0 0.0 if not np.all(family.in_y_range(y)): | |
2194 raise ValueError( | |
2195 "Some value(s) of y are out of the valid " | |
2196 "range for family {}".format(family.__class__.__name__) | |
2197 ) | |
2198 # check if P1 has only non-negative values, negative values might | |
2199 # indicate group lasso in the future. | |
2200 1 6.0 6.0 0.0 if not isinstance(self.P1, str): # if self.P1 != 'identity': | |
2201 if not np.all(P1 >= 0): | |
2202 raise ValueError("P1 must not have negative values.") | |
2203 # check if P2 is positive semidefinite | |
2204 # np.linalg.cholesky(P2) 'only' asserts positive definite | |
2205 1 5.0 5.0 0.0 if not isinstance(self.P2, str): # self.P2 != 'identity' | |
2206 # due to numerical precision, we allow eigenvalues to be a | |
2207 # tiny bit negative | |
2208 epsneg = -10 * np.finfo(P2.dtype).epsneg | |
2209 if P2.ndim == 1 or P2.shape[0] == 1: | |
2210 p2 = P2 | |
2211 if sparse.issparse(P2): | |
2212 p2 = P2.toarray() | |
2213 if not np.all(p2 >= 0): | |
2214 raise ValueError( | |
2215 "1d array P2 must not have negative " "values." | |
2216 ) | |
2217 elif sparse.issparse(P2): | |
2218 # for sparse matrices, not all eigenvals can be computed | |
2219 # efficiently, use only half of n_features | |
2220 # k = how many eigenvals to compute | |
2221 k = np.min([10, n_features // 10 + 1]) | |
2222 sigma = -1000 * epsneg # start searching near this value | |
2223 which = "SA" # find smallest algebraic eigenvalues first | |
2224 eigenvalues = splinalg.eigsh( | |
2225 P2, k=k, sigma=sigma, which=which, return_eigenvectors=False | |
2226 ) | |
2227 if not np.all(eigenvalues >= epsneg): | |
2228 raise ValueError("P2 must be positive semi-definite.") | |
2229 else: | |
2230 if not np.all(linalg.eigvalsh(P2) >= epsneg): | |
2231 raise ValueError("P2 must be positive semi-definite.") | |
2232 # TODO: if alpha=0 check that X is not rank deficient | |
2233 # TODO: what else to check? | |
2234 | |
2235 ####################################################################### | |
2236 # 2a. rescaling of weights (sample_weight) # | |
2237 ####################################################################### | |
2238 # IMPORTANT NOTE: Since we want to minimize | |
2239 # 1/(2*sum(sample_weight)) * deviance + L1 + L2, | |
2240 # deviance = sum(sample_weight * unit_deviance), | |
2241 # we rescale weights such that sum(weights) = 1 and this becomes | |
2242 # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) | |
2243 1 428.0 428.0 0.0 weights_sum = np.sum(weights) | |
2244 1 1972.0 1972.0 0.0 weights = weights / weights_sum | |
2245 | |
2246 ####################################################################### | |
2247 # 2b. potentially rescale predictors | |
2248 ####################################################################### | |
2249 1 7.0 7.0 0.0 if self.standardize: | |
2250 1 479789.0 479789.0 1.1 X, P1, P2, col_means, col_stds = _standardize(X, P1, P2, weights) | |
2251 | |
2252 ####################################################################### | |
2253 # 3. initialization of coef = (intercept_, coef_) # | |
2254 ####################################################################### | |
2255 # Note: Since phi=self.dispersion_ does not enter the estimation | |
2256 # of mu_i=E[y_i], set it to 1. | |
2257 | |
2258 # set start values for coef | |
2259 1 26.0 26.0 0.0 if self.warm_start and hasattr(self, "coef_"): | |
2260 coef = self.coef_ | |
2261 intercept = self.intercept_ | |
2262 if self.fit_intercept: | |
2263 coef = np.concatenate((np.array([intercept]), coef)) | |
2264 if self.standardize: | |
2265 _standardize_warm_start(coef, col_means, col_stds) | |
2266 1 7.0 7.0 0.0 elif isinstance(start_params, str): | |
2267 1 5.0 5.0 0.0 if start_params == "guess": | |
2268 # Set mu=starting_mu of the family and do one Newton step | |
2269 # If solver=cd use cd, else irls | |
2270 1 4731.0 4731.0 0.0 mu = family.starting_mu(y, weights=weights) | |
2271 1 12563.0 12563.0 0.0 eta = link.link(mu) # linear predictor | |
2272 1 8.0 8.0 0.0 if solver in ["cd", "lbfgs", "newton-cg"]: | |
2273 # see function _cd_solver | |
2274 1 10930.0 10930.0 0.0 sigma_inv = 1 / family.variance(mu, phi=1, weights=weights) | |
2275 1 10093.0 10093.0 0.0 d1 = link.inverse_derivative(eta) | |
2276 1 5497.0 5497.0 0.0 temp = sigma_inv * d1 * (y - mu) | |
2277 1 8.0 8.0 0.0 if self.fit_intercept: | |
2278 1 42518.0 42518.0 0.1 score = np.concatenate(([temp.sum()], temp @ X)) | |
2279 else: | |
2280 score = temp @ X # same as X.T @ temp | |
2281 | |
2282 1 1980.0 1980.0 0.0 d2_sigma_inv = d1 * d1 * sigma_inv | |
2283 1 7.0 7.0 0.0 diag_fisher = self.diag_fisher | |
2284 1 5.0 5.0 0.0 if diag_fisher: | |
2285 fisher = d2_sigma_inv | |
2286 else: | |
2287 2 5307565.0 2653782.5 12.0 fisher = _safe_sandwich_dot( | |
2288 1 6.0 6.0 0.0 X, d2_sigma_inv, intercept=self.fit_intercept | |
2289 ) | |
2290 # set up space for search direction d for inner loop | |
2291 1 8.0 8.0 0.0 if self.fit_intercept: | |
2292 1 12.0 12.0 0.0 coef = np.zeros(n_features + 1) | |
2293 else: | |
2294 coef = np.zeros(n_features) | |
2295 1 43.0 43.0 0.0 d = np.zeros_like(coef) | |
2296 # initial stopping tolerance of inner loop | |
2297 # use L1-norm of minimum of norm of subgradient of F | |
2298 # use less restrictive tolerance for initial guess | |
2299 1 340.0 340.0 0.0 inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) | |
2300 1 84.0 84.0 0.0 inner_tol = 4 * linalg.norm(inner_tol, ord=1) | |
2301 # just one outer loop = Newton step | |
2302 1 6.0 6.0 0.0 n_cycles = 0 | |
2303 2 4408.0 2204.0 0.0 d, coef_P2, n_cycles, inner_tol = _cd_cycle( | |
2304 1 5.0 5.0 0.0 d, | |
2305 1 6.0 6.0 0.0 X, | |
2306 1 5.0 5.0 0.0 coef, | |
2307 1 5.0 5.0 0.0 score, | |
2308 1 5.0 5.0 0.0 fisher, | |
2309 1 6.0 6.0 0.0 P1, | |
2310 1 6.0 6.0 0.0 P2, | |
2311 1 6.0 6.0 0.0 n_cycles, | |
2312 1 5.0 5.0 0.0 inner_tol, | |
2313 1 5.0 5.0 0.0 max_inner_iter=1000, | |
2314 1 6.0 6.0 0.0 selection=self.selection, | |
2315 1 6.0 6.0 0.0 random_state=random_state, | |
2316 1 6.0 6.0 0.0 diag_fisher=self.diag_fisher, | |
2317 ) | |
2318 1 10.0 10.0 0.0 coef += d # for simplicity no line search here | |
2319 else: | |
2320 # See _irls_solver | |
2321 # h'(eta) | |
2322 hp = link.inverse_derivative(eta) | |
2323 # working weights W, in principle a diagonal matrix | |
2324 # therefore here just as 1d array | |
2325 W = hp ** 2 / family.variance(mu, phi=1, weights=weights) | |
2326 # working observations | |
2327 z = eta + (y - mu) / hp | |
2328 # solve A*coef = b | |
2329 # A = X' W X + l2 P2, b = X' W z | |
2330 coef = _irls_step(X, W, P2, z, fit_intercept=self.fit_intercept) | |
2331 else: # start_params == 'zero' | |
2332 if self.fit_intercept: | |
2333 coef = np.zeros(n_features + 1) | |
2334 coef[0] = link.link(np.average(y, weights=weights)) | |
2335 else: | |
2336 coef = np.zeros(n_features) | |
2337 else: # assign given array as start values | |
2338 coef = start_params | |
2339 if self.standardize: | |
2340 _standardize_warm_start(coef, col_means, col_stds) | |
2341 | |
2342 ####################################################################### | |
2343 # 4. fit # | |
2344 ####################################################################### | |
2345 # algorithms for optimization | |
2346 # TODO: Parallelize it? | |
2347 | |
2348 # 4.1 IRLS ############################################################ | |
2349 # Note: we already set P2 = l2*P2, see above | |
2350 # Note: we already symmetrized P2 = 1/2 (P2 + P2') | |
2351 1 6.0 6.0 0.0 if solver == "irls": | |
2352 coef, self.n_iter_ = _irls_solver( | |
2353 coef=coef, | |
2354 X=X, | |
2355 y=y, | |
2356 weights=weights, | |
2357 P2=P2, | |
2358 fit_intercept=self.fit_intercept, | |
2359 family=family, | |
2360 link=link, | |
2361 max_iter=self.max_iter, | |
2362 tol=self.tol, | |
2363 ) | |
2364 | |
2365 # 4.2 L-BFGS ########################################################## | |
2366 1 5.0 5.0 0.0 elif solver == "lbfgs": | |
2367 | |
2368 def func(coef, X, y, weights, P2, family, link): | |
2369 mu, devp = family._mu_deviance_derivative(coef, X, y, weights, link) | |
2370 dev = family.deviance(y, mu, weights) | |
2371 intercept = coef.size == X.shape[1] + 1 | |
2372 idx = 1 if intercept else 0 # offset if coef[0] is intercept | |
2373 if P2.ndim == 1: | |
2374 L2 = P2 * coef[idx:] | |
2375 else: | |
2376 L2 = P2 @ coef[idx:] | |
2377 obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2) | |
2378 objp = 0.5 * devp | |
2379 objp[idx:] += L2 | |
2380 return obj, objp | |
2381 | |
2382 args = (X, y, weights, P2, family, link) | |
2383 coef, loss, info = fmin_l_bfgs_b( | |
2384 func, | |
2385 coef, | |
2386 fprime=None, | |
2387 args=args, | |
2388 iprint=(self.verbose > 0) - 1, | |
2389 pgtol=self.tol, | |
2390 maxiter=self.max_iter, | |
2391 factr=1e3, | |
2392 ) | |
2393 if info["warnflag"] == 1: | |
2394 warnings.warn( | |
2395 "lbfgs failed to converge." " Increase the number of iterations.", | |
2396 ConvergenceWarning, | |
2397 ) | |
2398 elif info["warnflag"] == 2: | |
2399 warnings.warn("lbfgs failed for the reason: {}".format(info["task"])) | |
2400 self.n_iter_ = info["nit"] | |
2401 | |
2402 # 4.3 Newton-CG ####################################################### | |
2403 # We again use the Fisher matrix instead of the Hessian; more | |
2404 # precisely, the expected Hessian of the deviance. | |
2405 1 6.0 6.0 0.0 elif solver == "newton-cg": | |
2406 | |
2407 def func(coef, X, y, weights, P2, family, link): | |
2408 intercept = coef.size == X.shape[1] + 1 | |
2409 idx = 1 if intercept else 0 # offset if coef[0] is intercept | |
2410 if P2.ndim == 1: | |
2411 L2 = coef[idx:] @ (P2 * coef[idx:]) | |
2412 else: | |
2413 L2 = coef[idx:] @ (P2 @ coef[idx:]) | |
2414 mu = link.inverse(_safe_lin_pred(X, coef)) | |
2415 return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2 | |
2416 | |
2417 def grad(coef, X, y, weights, P2, family, link): | |
2418 mu, devp = family._mu_deviance_derivative(coef, X, y, weights, link) | |
2419 intercept = coef.size == X.shape[1] + 1 | |
2420 idx = 1 if intercept else 0 # offset if coef[0] is intercept | |
2421 if P2.ndim == 1: | |
2422 L2 = P2 * coef[idx:] | |
2423 else: | |
2424 L2 = P2 @ coef[idx:] | |
2425 objp = 0.5 * devp | |
2426 objp[idx:] += L2 | |
2427 return objp | |
2428 | |
2429 def grad_hess(coef, X, y, weights, P2, family, link): | |
2430 intercept = coef.size == X.shape[1] + 1 | |
2431 idx = 1 if intercept else 0 # offset if coef[0] is intercept | |
2432 if P2.ndim == 1: | |
2433 L2 = P2 * coef[idx:] | |
2434 else: | |
2435 L2 = P2 @ coef[idx:] | |
2436 eta = _safe_lin_pred(X, coef) | |
2437 mu = link.inverse(eta) | |
2438 d1 = link.inverse_derivative(eta) | |
2439 temp = d1 * family.deviance_derivative(y, mu, weights) | |
2440 if intercept: | |
2441 grad = np.concatenate(([0.5 * temp.sum()], 0.5 * temp @ X + L2)) | |
2442 else: | |
2443 grad = 0.5 * temp @ X + L2 # same as 0.5* X.T @ temp + L2 | |
2444 | |
2445 # expected hessian = fisher = X.T @ diag_matrix @ X | |
2446 # calculate only diag_matrix | |
2447 diag = d1 ** 2 / family.variance(mu, phi=1, weights=weights) | |
2448 if intercept: | |
2449 h0i = np.concatenate(([diag.sum()], diag @ X)) | |
2450 | |
2451 def Hs(coef): | |
2452 # return (0.5 * fisher + P2) @ coef | |
2453 # ret = 0.5 * (X.T @ (diag * (X @ coef))) | |
2454 ret = 0.5 * ((diag * (X @ coef[idx:])) @ X) | |
2455 if P2.ndim == 1: | |
2456 ret += P2 * coef[idx:] | |
2457 else: | |
2458 ret += P2 @ coef[idx:] | |
2459 if intercept: | |
2460 ret = np.concatenate( | |
2461 ([0.5 * (h0i @ coef)], ret + 0.5 * coef[0] * h0i[1:]) | |
2462 ) | |
2463 return ret | |
2464 | |
2465 return grad, Hs | |
2466 | |
2467 args = (X, y, weights, P2, family, link) | |
2468 coef, self.n_iter_ = newton_cg( | |
2469 grad_hess, | |
2470 func, | |
2471 grad, | |
2472 coef, | |
2473 args=args, | |
2474 maxiter=self.max_iter, | |
2475 tol=self.tol, | |
2476 ) | |
2477 | |
2478 # 4.4 coordinate descent ############################################## | |
2479 # Note: we already set P1 = l1*P1, see above | |
2480 # Note: we already set P2 = l2*P2, see above | |
2481 # Note: we already symmetrized P2 = 1/2 (P2 + P2') | |
2482 1 6.0 6.0 0.0 elif solver == "cd": | |
2483 2 38164237.0 19082118.5 86.0 coef, self.n_iter_, self._n_cycles = _cd_solver( | |
2484 1 6.0 6.0 0.0 coef=coef, | |
2485 1 5.0 5.0 0.0 X=X, | |
2486 1 5.0 5.0 0.0 y=y, | |
2487 1 5.0 5.0 0.0 weights=weights, | |
2488 1 6.0 6.0 0.0 P1=P1, | |
2489 1 6.0 6.0 0.0 P2=P2, | |
2490 1 7.0 7.0 0.0 fit_intercept=self.fit_intercept, | |
2491 1 6.0 6.0 0.0 family=family, | |
2492 1 5.0 5.0 0.0 link=link, | |
2493 1 5.0 5.0 0.0 max_iter=self.max_iter, | |
2494 1 6.0 6.0 0.0 tol=self.tol, | |
2495 1 6.0 6.0 0.0 selection=self.selection, | |
2496 1 6.0 6.0 0.0 random_state=random_state, | |
2497 1 6.0 6.0 0.0 diag_fisher=self.diag_fisher, | |
2498 1 5.0 5.0 0.0 copy_X=self.copy_X, | |
2499 ) | |
2500 | |
2501 ####################################################################### | |
2502 # 5a. handle intercept | |
2503 ####################################################################### | |
2504 1 8.0 8.0 0.0 if self.fit_intercept: | |
2505 1 8.0 8.0 0.0 self.intercept_ = coef[0] | |
2506 1 8.0 8.0 0.0 self.coef_ = coef[1:] | |
2507 else: | |
2508 # set intercept to zero as the other linear models do | |
2509 self.intercept_ = 0.0 | |
2510 self.coef_ = coef | |
2511 | |
2512 ####################################################################### | |
2513 # 5b. undo standardization | |
2514 ####################################################################### | |
2515 1 7.0 7.0 0.0 if self.standardize: | |
2516 # TODO: Make sure this doesn't copy X | |
2517 2 208084.0 104042.0 0.5 X, self.intercept_, self.coef_ = _unstandardize( | |
2518 1 6.0 6.0 0.0 X, col_means, col_stds, self.intercept_, self.coef_ | |
2519 ) | |
2520 | |
2521 1 10.0 10.0 0.0 if self.fit_dispersion in ["chisqr", "deviance"]: | |
2522 # attention because of rescaling of weights | |
2523 self.dispersion_ = self.estimate_phi(X, y, weights) * weights_sum | |
2524 | |
2525 1 5.0 5.0 0.0 return self | |
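To summarize the penalty bookkeeping that `fit` performs before dispatching to a solver: the elastic-net penalty alpha is split into P1 and P2, sample weights are rescaled to sum to one, and P2 is symmetrized. A hedged sketch with made-up shapes (the estimator does this internally; nothing here is part of its public API):

import numpy as np
import scipy.sparse as sparse

alpha, l1_ratio, n_features = 1.0, 0.5, 4
P1 = alpha * l1_ratio * np.ones(n_features)                 # L1 penalty vector
P2 = alpha * (1 - l1_ratio) * sparse.eye(n_features, format="csc")
P2 = (0.5 * (P2 + P2.T)).tocsc()                            # symmetrize (a no-op here)

sample_weight = np.array([1.0, 2.0, 1.0])
weights = sample_weight / sample_weight.sum()               # sum(weights) == 1, so the
# objective becomes 1/2 * sum(weights * unit_deviance(y, mu))
# + ||P1 * w||_1 + 1/2 * w @ P2 @ w, as in the _cd_solver docstring.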