@fabianp
Created September 9, 2014 13:41
profile strong rules
=====================
Lasso and Elastic Net
=====================
Lasso and elastic net (L1 and L2 penalisation) implemented using
coordinate descent.
The coefficients can be forced to be positive.
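
The timings below were collected by running this example under line_profiler. As a rough sketch of the invocation (assuming kernprof is installed and the three functions profiled further down carry the @profile decorator; older line_profiler releases install the script as kernprof.py)::

    kernprof -l -v plot_lasso_coordinate_descent_path.py

The -l flag enables line-by-line profiling and -v prints the report immediately; it also writes the .lprof file mentioned below.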
Computing regularization path using the lasso...
Computing regularization path using the positive lasso...
Computing regularization path using the elastic net...
Computing regularization path using the positive elastic net...
Wrote profile results to plot_lasso_coordinate_descent_path.py.lprof
Timer unit: 1e-06 s
Total time: 0.016946 s
File: /home/fp985994/dev/scikit-learn/sklearn/linear_model/coordinate_descent.py
Function: compute_strong_active_set at line 38
Line # Hits Time Per Hit % Time Line Contents
==============================================================
38 @profile
39 def compute_strong_active_set(X, y, current_alpha, prev_alpha, l1_ratio, coef,
40 residual):
41 """
42 Compute the features that are active along a regularization path.
43
44 An active feature satisfies the following criterion:
45 |X_{j}^T * (y - X*w)| >= 2 * current_alpha - prev_alpha
46
47 Parameters
48 ----------
49 X : {ndarray, sparse matrix}, shape (n_samples, n_features)
50 Training data.
51
52 y : ndarray, shape (n_samples,) or (n_samples, n_outputs)
53 Target values.
54
55 current_alpha : float
56 Current regularization parameter.
57
58 prev_alpha : float
59 Previous regularization parameter while fitting along the grid of
60 alphas.
61
62 l1_ratio : float
63 ElasticNet mixing parameter. An l1_ratio of 1.0 implies an l1 penalty,
64 while 0.0 indicates an l2 penalty.
65
66 coef : ndarray, shape (n_features,)
67 Initial feature vector along the regularization path.
68
69 Returns
70 -------
71 active_features : ndarray
72 Array of active features.
73 """
74 400 657 1.6 3.9 prev_alpha, current_alpha = prev_alpha * l1_ratio, current_alpha * l1_ratio
75
76 # If the right hand side is negative, there is no point doing
77 # the remaining computation as the left hand side is always positive.
78 400 1490 3.7 8.8 if 2*current_alpha - prev_alpha < 0:
79 return np.arange(X.shape[1])
80
81 400 486 1.2 2.9 multi_output = (y.ndim >= 2)
82
83 400 267 0.7 1.6 if multi_output:
84 # coef is of shape n_outputs * n_features.
85 # We need n_features * n_outputs.
86 coef = coef.T
87
88 400 5448 13.6 32.1 active_rule = safe_sparse_dot(X.T, residual)
89 400 1724 4.3 10.2 active_rule = np.abs(active_rule, active_rule)
90 400 307 0.8 1.8 if multi_output:
91 active_rule = row_norms(active_rule)
92
93 400 3664 9.2 21.6 active_features = (active_rule >= 2*current_alpha - prev_alpha)
94 400 2302 5.8 13.6 active_features = np.nonzero(active_features)[0]
95 400 352 0.9 2.1 del active_rule
96 400 249 0.6 1.5 return active_features
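The function above implements the sequential strong-rule screen; most of its time (32%) goes into the X.T-times-residual product. A minimal NumPy sketch of the same screen for the single-output dense case; the name strong_set and its exact signature are ours, not scikit-learn API::

    import numpy as np

    def strong_set(X, residual, current_alpha, prev_alpha):
        # Keep feature j when |X_j^T (y - X w)| >= 2*current_alpha - prev_alpha;
        # features below this threshold are unlikely to become active.
        correlations = np.abs(np.dot(X.T, residual))
        return np.nonzero(correlations >= 2 * current_alpha - prev_alpha)[0]

When 2*current_alpha - prev_alpha is negative every feature passes the screen, which is why the profiled code returns np.arange(X.shape[1]) early in that case.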
Total time: 0.114876 s
File: /home/fp985994/dev/scikit-learn/sklearn/linear_model/coordinate_descent.py
Function: check_kkt_conditions at line 98
Line # Hits Time Per Hit % Time Line Contents
==============================================================
98 @profile
99 def check_kkt_conditions(X, y, coef, strong_active_set, alpha, l1_ratio,
100 residual, active_set=None,
101 return_active_set=True, rtol=1e-3):
102 """
103 Checks if the KKT conditions are satisfied for ElasticNet and Lasso.
104
105 These necessary conditions are used to check if a given feature vector
106 minimizes the loss function.
107 If return_active_set is set to True, then it returns the set of
108 features for which the KKT conditions are violated (active_features).
109 Otherwise it stops at the first violated feature and returns only
110 the boolean kkt_violations.
111
112 Parameters
113 ----------
114 X : {ndarray, sparse matrix}, shape (n_samples, n_features)
115 Training data.
116
117 y : ndarray, shape (n_samples,) or (n_samples, n_outputs)
118 Target values.
119
120 coef : ndarray, shape (n_features,)
121 Feature vector.
122
123 strong_active_set : array-like
124 The feature indices for which the KKT conditions are to be verified.
125
126 alpha : float
127 Regularization parameter.
128
129 l1_ratio : float
130 ElasticNet mixing parameter. An l1_ratio of 1.0 implies an l1 penalty,
131 while 0.0 indicates an l2 penalty.
132
133 active_set : array-like, optional
134 If supplied, the features for which the KKT conditions are violated
135 are appended to this.
136
137 return_active_set : bool
138 Whether or not to return the active_set.
139
140 Returns
141 -------
142 kkt_violations : bool
143 Whether or not the KKT conditions are violated across the
144 strong_active_set.
145
146 active_set : ndarray
147 Active features across the strong_active_set. Returned only if
148 return_active_set is set to True.
149 """
150 718 911 1.3 0.8 kkt_violations = False
151
152 718 915 1.3 0.8 if active_set is None:
153 active_set = []
154 718 1638 2.3 1.4 if not isinstance(active_set, list):
155 459 1024 2.2 0.9 active_set = active_set.tolist()
156
157 718 1070 1.5 0.9 multi_output = (y.ndim >= 2)
158 718 795 1.1 0.7 if multi_output:
159 coef = coef.T
160
161 718 2145 3.0 1.9 X_sparse = sparse.isspmatrix(X)
162 718 813 1.1 0.7 if X_sparse:
163 X_indices = X.indices
164 X_data = X.data
165 X_indptr = X.indptr
166
167 3954 6339 1.6 5.5 for i in strong_active_set:
168 3236 3606 1.1 3.1 if X_sparse:
169 startptr = X_indptr[i]
170 endptr = X_indptr[i + 1]
171 dot_data = X_data[startptr: endptr]
172 residual_data = residual.take(X_indices[startptr: endptr],
173 mode='clip')
174 loss_derivative = np.dot(dot_data, residual_data)
175 else:
176 3236 14044 4.3 12.2 feature = X[:, i]
177 3236 13740 4.2 12.0 loss_derivative = np.dot(feature, residual)
178 3236 5011 1.5 4.4 active_coef = coef[i]
179
180 # XXX: I'm not sure about the multi_output case.
181 3236 3719 1.1 3.2 if multi_output:
182 loss_derivative = norm(loss_derivative)
183 mean_coef = np.mean(active_coef)
184 l1_penalty = alpha * l1_ratio
185 if mean_coef < 0:
186 l1_penalty = -l1_penalty
187 active_coef = norm(active_coef)
188 l2_penalty = alpha * (1 - l1_ratio) * active_coef
189 else:
190 3236 5452 1.7 4.7 l1_penalty = alpha * l1_ratio
191 3236 9169 2.8 8.0 if active_coef < 0:
192 416 618 1.5 0.5 l1_penalty = -l1_penalty
193 3236 7357 2.3 6.4 l2_penalty = alpha * (1 - l1_ratio) * active_coef
194
195 3236 5163 1.6 4.5 penalty = l1_penalty + l2_penalty
196 3236 7399 2.3 6.4 tol = 1e-3 + rtol * abs(penalty)
197
198 3236 8742 2.7 7.6 if active_coef != 0 and (abs(loss_derivative - penalty) > tol):
199 1328 2257 1.7 2.0 active_set.append(i)
200 1328 1585 1.2 1.4 kkt_violations = True
201 1328 1618 1.2 1.4 if not return_active_set:
202 break
203 1908 5612 2.9 4.9 elif active_coef == 0 and abs(loss_derivative) > alpha*l1_ratio:
204 588 1012 1.7 0.9 active_set.append(i)
205 588 717 1.2 0.6 kkt_violations = True
206 588 703 1.2 0.6 if not return_active_set:
207 break
208
209 718 860 1.2 0.7 if return_active_set:
210 718 842 1.2 0.7 return active_set, kkt_violations
211 else:
212 return kkt_violations
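For the pure lasso (l1_ratio=1.0, with alpha pre-scaled by n_samples as in the caller), the conditions checked above are the subgradient optimality conditions: X_j^T (y - Xw) = alpha * sign(w_j) for non-zero coefficients and |X_j^T (y - Xw)| <= alpha for zero ones. A vectorized sketch of the same check; the function name and tolerance handling are ours::

    import numpy as np

    def lasso_kkt_violations(X, residual, coef, alpha, tol=1e-3):
        grad = np.dot(X.T, residual)   # X_j^T (y - X w) for every feature
        zero = coef == 0
        violated = np.zeros(coef.shape[0], dtype=bool)
        # Zero coefficients: the subgradient condition |X_j^T r| <= alpha.
        violated[zero] = np.abs(grad[zero]) > alpha + tol
        # Non-zero coefficients: X_j^T r must equal alpha * sign(w_j).
        violated[~zero] = np.abs(grad[~zero] - alpha * np.sign(coef[~zero])) > tol
        return np.nonzero(violated)[0]

The profiled loop instead visits candidates one at a time so that it can break at the first violation when return_active_set is False.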
Total time: 0.600602 s
File: /home/fp985994/dev/scikit-learn/sklearn/linear_model/coordinate_descent.py
Function: enet_path at line 487
Line # Hits Time Per Hit % Time Line Contents
==============================================================
487 @profile
488 def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
489 precompute='auto', Xy=None, fit_intercept=True,
490 normalize=False, copy_X=True, coef_init=None,
491 verbose=False, return_models=False, return_n_iter=False,
492 **params):
493 """Compute elastic net path with coordinate descent
494
495 The elastic net optimization function varies for mono and multi-outputs.
496
497 For mono-output tasks it is::
498
499 1 / (2 * n_samples) * ||y - Xw||^2_2
500 + alpha * l1_ratio * ||w||_1
501 + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2
502
503 For multi-output tasks it is::
504
505 (1 / (2 * n_samples)) * ||Y - XW||_Fro^2
506 + alpha * l1_ratio * ||W||_21
507 + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2
508
509 Where::
510
511 ||W||_21 = \sum_i \sqrt{\sum_j w_{ij}^2}
512
513 i.e. the sum of the norms of each row.
514
515 Parameters
516 ----------
517 X : {array-like}, shape (n_samples, n_features)
518 Training data. Pass directly as Fortran-contiguous data to avoid
519 unnecessary memory duplication. If ``y`` is mono-output then ``X``
520 can be sparse.
521
522 y : ndarray, shape = (n_samples,) or (n_samples, n_outputs)
523 Target values
524
525 l1_ratio : float, optional
526 float between 0 and 1 passed to elastic net (scaling between
527 l1 and l2 penalties). ``l1_ratio=1`` corresponds to the Lasso
528
529 eps : float
530 Length of the path. ``eps=1e-3`` means that
531 ``alpha_min / alpha_max = 1e-3``
532
533 n_alphas : int, optional
534 Number of alphas along the regularization path
535
536 alphas : ndarray, optional
537 List of alphas where to compute the models.
538 If None alphas are set automatically
539
540 precompute : True | False | 'auto' | array-like
541 Whether to use a precomputed Gram matrix to speed up
542 calculations. If set to ``'auto'`` let us decide. The Gram
543 matrix can also be passed as argument.
544
545 Xy : array-like, optional
546 Xy = np.dot(X.T, y) that can be precomputed. It is useful
547 only when the Gram matrix is precomputed.
548
549 fit_intercept : bool
550 Whether or not to fit an intercept.
551 WARNING : deprecated, will be removed in 0.16.
552
553 normalize : boolean, optional, default False
554 If ``True``, the regressors X will be normalized before regression.
555 WARNING : deprecated, will be removed in 0.16.
556
557 copy_X : boolean, optional, default True
558 If ``True``, X will be copied; else, it may be overwritten.
559
560 coef_init : array, shape (n_features, ) | None
561 The initial values of the coefficients.
562
563 verbose : bool or integer
564 Amount of verbosity.
565
566 return_models : boolean, optional, default False
567 If ``True``, the function will return list of models. Setting it
568 to ``False`` will change the function output returning the values
569 of the alphas and the coefficients along the path. Returning the
570 model list will be removed in version 0.16.
571
572 params : kwargs
573 keyword arguments passed to the coordinate descent solver.
574
575 return_n_iter : bool
576 Whether or not to return the number of iterations.
577
578 Returns
579 -------
580 models : a list of models along the regularization path
581 (Is returned only if ``return_models`` is set to ``True``.)
582
583 alphas : array, shape (n_alphas,)
584 The alphas along the path where models are computed.
585 (Is returned, along with ``coefs``, when ``return_models`` is set
586 to ``False``)
587
588 coefs : array, shape (n_features, n_alphas) or
589 (n_outputs, n_features, n_alphas)
590 Coefficients along the path.
591 (Is returned, along with ``alphas``, when ``return_models`` is set
592 to ``False``).
593
594 dual_gaps : array, shape (n_alphas,)
595 The dual gaps at the end of the optimization for each alpha.
596 (Is returned, along with ``alphas``, when ``return_models`` is set
597 to ``False``).
598
599 n_iters : array-like, shape (n_alphas,)
600 The number of iterations taken by the coordinate descent optimizer to
601 reach the specified tolerance for each alpha.
602 (Is returned, along with ``alphas``, when ``return_models`` is set
603 to ``False`` and ``return_n_iter`` is set to True).
604
605 Notes
606 -----
607 See examples/plot_lasso_coordinate_descent_path.py for an example.
608
609 Deprecation Notice: Setting ``return_models`` to ``False`` will make
610 the Lasso Path return an output in the style used by :func:`lars_path`.
611 This will become the norm as of version 0.15. Leaving ``return_models``
612 set to `True` will let the function return a list of models as before.
613
614 See also
615 --------
616 MultiTaskElasticNet
617 MultiTaskElasticNetCV
618 ElasticNet
619 ElasticNetCV
620 """
621 4 12 3.0 0.0 if return_models:
622 warnings.warn("Use enet_path(return_models=False), as it returns the"
623 " coefficients and alphas instead of just a list of"
624 " models as previously `lasso_path`/`enet_path` did."
625 " `return_models` will eventually be removed in 0.16,"
626 " after which, returning alphas and coefs"
627 " will become the norm.",
628 DeprecationWarning, stacklevel=2)
629
630 4 12 3.0 0.0 if normalize is True:
631 warnings.warn("normalize param will be removed in 0.16."
632 " Intercept fitting and feature normalization will be"
633 " done in estimators.",
634 DeprecationWarning, stacklevel=2)
635 else:
636 4 9 2.2 0.0 normalize = False
637
638 4 11 2.8 0.0 if fit_intercept is True or fit_intercept is None:
639 warnings.warn("fit_intercept param will be removed in 0.16."
640 " Intercept fitting and feature normalization will be"
641 " done in estimators.",
642 DeprecationWarning, stacklevel=2)
643
644 4 11 2.8 0.0 if fit_intercept is None:
645 fit_intercept = True
646
647 4 12 3.0 0.0 X = check_array(X, 'csc', dtype=np.float64, order='F', copy=copy_X and
648 4 338 84.5 0.1 fit_intercept)
649 4 16 4.0 0.0 n_samples, n_features = X.shape
650
651 4 11 2.8 0.0 multi_output = False
652 4 12 3.0 0.0 if y.ndim != 1:
653 multi_output = True
654 _, n_outputs = y.shape
655
656 # MultiTaskElasticNet does not support sparse matrices
657 4 9 2.2 0.0 X_sparse_scaling = None
658 4 22 5.5 0.0 if not multi_output and sparse.isspmatrix(X):
659 if 'X_mean' in params:
660 # As sparse matrices are not actually centered we need this
661 # to be passed to the CD solver.
662 X_sparse_scaling = params['X_mean'] / params['X_std']
663 else:
664 X_sparse_scaling = np.ones(n_features)
665
666 X, y, X_mean, y_mean, X_std, precompute, Xy = \
667 4 445 111.2 0.1 _pre_fit(X, y, Xy, precompute, normalize, fit_intercept, copy=False)
668 4 15 3.8 0.0 if alphas is None:
669 # No need to normalize or fit_intercept: it has been done
670 # above
671 4 12 3.0 0.0 alphas = _alpha_grid(X, y, Xy=Xy, l1_ratio=l1_ratio,
672 4 8 2.0 0.0 fit_intercept=False, eps=eps, n_alphas=n_alphas,
673 4 511 127.8 0.1 normalize=False, copy_X=False)
674 else:
675 alphas = np.sort(alphas)[::-1] # make sure alphas are properly ordered
676
677 4 36 9.0 0.0 feature_array = np.arange(X.shape[1], dtype=np.int32)
678 4 12 3.0 0.0 n_alphas = len(alphas)
679 4 13 3.2 0.0 tol = params.get('tol', 1e-4)
680 4 11 2.8 0.0 positive = params.get('positive', False)
681 4 11 2.8 0.0 max_iter = params.get('max_iter', 1000)
682 4 20 5.0 0.0 dual_gaps = np.empty(n_alphas)
683 4 10 2.5 0.0 n_iters = []
684
685 4 24 6.0 0.0 rng = check_random_state(params.get('random_state', None))
686 4 12 3.0 0.0 selection = params.get('selection', 'cyclic')
687 4 11 2.8 0.0 if selection not in ['random', 'cyclic']:
688 raise ValueError("selection should be either random or cyclic.")
689 4 11 2.8 0.0 random = (selection == 'random')
690 4 11 2.8 0.0 models = []
691
692 4 10 2.5 0.0 if not multi_output:
693 4 22 5.5 0.0 coefs = np.empty((n_features, n_alphas), dtype=np.float64)
694 else:
695 coefs = np.empty((n_outputs, n_features, n_alphas),
696 dtype=np.float64)
697
698 4 10 2.5 0.0 if coef_init is None:
699 4 37 9.2 0.0 coef_ = np.asfortranarray(np.zeros(coefs.shape[:-1]))
700 else:
701 coef_ = np.asfortranarray(coef_init)
702
703 # Compute the non-zero coefficients for the first alpha; we take these
704 # as the initial active set, which is then checked across the other alphas.
705 4 29 7.2 0.0 l1_reg = alphas[0] * l1_ratio * n_samples
706 4 22 5.5 0.0 l2_reg = alphas[0] * (1.0 - l1_ratio) * n_samples
707 4 25 6.2 0.0 active_set = np.arange(n_features, dtype=np.int32)
708 4 10 2.5 0.0 model = _cd_fast_path(
709 4 9 2.2 0.0 X, y, coef_, l1_reg, l2_reg, active_set, max_iter, tol,
710 4 255 63.8 0.0 rng, random, positive, precompute, Xy, X_sparse_scaling
711 )
712 4 12 3.0 0.0 coef_, dual_gap_, eps_, n_iter_ = model
713 4 14 3.5 0.0 n_iters.append(n_iter_)
714 4 13 3.2 0.0 dual_gaps[0] = dual_gap_
715 4 35 8.8 0.0 coefs[..., 0] = coef_
716
717 4 11 2.8 0.0 if multi_output:
718 active_set = np.nonzero(np.abs(coef_.T) > 1e-12)[0]
719 active_set = np.nonzero(np.bincount(active_set) == n_outputs)[0]
720 else:
721 4 81 20.2 0.0 active_set = np.nonzero(np.abs(coef_) > 1e-12)[0]
722
723 400 1309 3.3 0.2 for i, alpha in enumerate(alphas[1:]):
724
725 396 1070 2.7 0.2 kkt_violations = True
726 396 1267 3.2 0.2 residual = None
727 396 2204 5.6 0.4 l1_reg = alpha * l1_ratio * n_samples
728 396 1930 4.9 0.3 l2_reg = alpha * (1.0 - l1_ratio) * n_samples
729
730 796 2384 3.0 0.4 while kkt_violations:
731 400 4083 10.2 0.7 eligible_features = np.copy(active_set)
732 400 1168 2.9 0.2 if residual is None:
733 396 8957 22.6 1.5 residual = compute_residual(X, y, coef_)
734 400 1235 3.1 0.2 strong_active_set = compute_strong_active_set(
735 400 3028 7.6 0.5 X, y, n_samples*alpha, n_samples*alphas[i], l1_ratio,
736 400 24443 61.1 4.1 coef_, residual,
737 )
738 1169 3437 2.9 0.6 while kkt_violations:
739 769 2682 3.5 0.4 if isinstance(eligible_features, list):
740 369 1040 2.8 0.2 eligible_features = np.asarray(eligible_features,
741 369 4200 11.4 0.7 dtype=np.int32)
742 769 2133 2.8 0.4 model = _cd_fast_path(
743 769 2081 2.7 0.3 X, y, coef_, l1_reg, l2_reg, eligible_features,
744 769 2018 2.6 0.3 max_iter, tol, rng, random, positive, precompute,
745 769 136395 177.4 22.7 Xy, X_sparse_scaling
746 )
747 769 21483 27.9 3.6 residual = compute_residual(X, y, coef_)
748 769 2477 3.2 0.4 coef_, dual_gap_, eps_, n_iter_ = model
749
750 # Check KKT conditions for features present in strong_active_set
751 # but not in the eligible features. If the KKT conditions are violated,
752 # add the offending features to the eligible set and optimize again.
753 769 2102 2.7 0.3 kkt_violations = False
754 769 2343 3.0 0.4 common_features = np.in1d(strong_active_set, eligible_features,
755 769 49696 64.6 8.3 assume_unique=True)
756 769 6367 8.3 1.1 not_in_active = strong_active_set[~common_features]
757
758 769 16275 21.2 2.7 if np.any(not_in_active):
759 455 1378 3.0 0.2 eligible_features, kkt_violations = check_kkt_conditions(
760 455 2871 6.3 0.5 X, y, coef_, not_in_active, n_samples*alpha, l1_ratio,
761 455 137453 302.1 22.9 residual, eligible_features, return_active_set=True)
762
763 # Check KKT conditions for all predictors. The strong and eligible
764 # predictors have been removed since they have already been optimized
765 # for in the above while loop.
766 400 1109 2.8 0.2 kkt_violations = False
767 400 1193 3.0 0.2 non_active = np.setdiff1d(feature_array, eligible_features,
768 400 31963 79.9 5.3 assume_unique=True)
769 400 1387 3.5 0.2 non_active = np.setdiff1d(non_active, strong_active_set,
770 400 29590 74.0 4.9 assume_unique=True)
771 400 8007 20.0 1.3 if np.any(non_active):
772 263 802 3.0 0.1 active_set, kkt_violations = check_kkt_conditions(
773 263 1610 6.1 0.3 X, y, coef_, non_active, n_samples*alpha, l1_ratio,
774 263 63788 242.5 10.6 residual, active_set, return_active_set=True)
775
776 396 1158 2.9 0.2 if dual_gap_ > eps_:
777 warnings.warn('Objective did not converge.' +
778 ' You might want' +
779 ' to increase the number of iterations',
780 ConvergenceWarning)
781 396 3356 8.5 0.6 coefs[..., i + 1] = coef_
782 396 1407 3.6 0.2 dual_gaps[i + 1] = dual_gap_
783 396 1320 3.3 0.2 n_iters.append(n_iter_)
784 396 1096 2.8 0.2 if return_models:
785 if not multi_output:
786 model = ElasticNet(
787 alpha=alpha, l1_ratio=l1_ratio,
788 fit_intercept=fit_intercept
789 if sparse.isspmatrix(X) else False,
790 precompute=precompute)
791 else:
792 model = MultiTaskElasticNet(
793 alpha=alpha, l1_ratio=l1_ratio, fit_intercept=False)
794 model.dual_gap_ = dual_gaps[i + 1]
795 model.coef_ = coefs[..., i + 1]
796 model.n_iter_ = n_iters[i + 1]
797 if (fit_intercept and not sparse.isspmatrix(X)) or multi_output:
798 model.fit_intercept = True
799 model._set_intercept(X_mean, y_mean, X_std)
800 models.append(model)
801
802 396 1031 2.6 0.2 if verbose:
803 if verbose > 2:
804 print(model)
805 elif verbose > 1:
806 print('Path: %03i out of %03i' % (i, n_alphas))
807 else:
808 sys.stderr.write('.')
809
810 4 12 3.0 0.0 if return_models:
811 return models
812 4 10 2.5 0.0 elif return_n_iter:
813 return alphas, coefs, dual_gaps, n_iters
814 else:
815 4 11 2.8 0.0 return alphas, coefs, dual_gaps
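For context, a minimal usage sketch of the profiled path function; this assumes a build of this branch, where leaving return_models at its default of False makes the function return the alphas, the coefficients along the path and the dual gaps::

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.linear_model import enet_path

    X, y = make_regression(n_samples=100, n_features=50, random_state=0)
    alphas, coefs, dual_gaps = enet_path(X, y, l1_ratio=0.8, n_alphas=100)
    # coefs has shape (n_features, n_alphas); more coefficients become
    # non-zero as alpha shrinks along the path.
    print(np.count_nonzero(coefs[:, -1]))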