@@ -24,6 +24,9 @@ class ProxNewton(BaseSolver):
     tol : float, default 1e-4
         Tolerance for convergence.
 
+    fit_intercept : bool, default True
+        If ``True``, fits an unpenalized intercept.
+
     verbose : bool, default False
        Amount of verbosity. 0/False is silent.
 
@@ -53,7 +56,8 @@ def __init__(self, p0=10, max_iter=20, max_pn_iter=1000, tol=1e-4,
 
     def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None):
         n_samples, n_features = X.shape
-        w = np.zeros(n_features) if w_init is None else w_init
+        fit_intercept = self.fit_intercept
+        w = np.zeros(n_features + fit_intercept) if w_init is None else w_init
         Xw = np.zeros(n_samples) if Xw_init is None else Xw_init
         all_features = np.arange(n_features)
         stop_crit = 0.
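
A quick illustration of the layout this hunk introduces: `fit_intercept` is a bool, so summing it with `n_features` allocates exactly one extra trailing slot, and that last entry holds the intercept. A minimal sketch (names assumed, not part of the patch):

```python
import numpy as np

n_features, fit_intercept = 5, True
w = np.zeros(n_features + fit_intercept)   # bool sums as 0/1 -> shape (6,)
coefs, intercept = w[:n_features], w[-1]   # trailing entry is the intercept
```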
@@ -63,20 +67,38 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None):
         if is_sparse:
             X_bundles = (X.data, X.indptr, X.indices)
 
+        if len(w) != n_features + self.fit_intercept:
+            if self.fit_intercept:
+                val_error_message = (
+                    "w should be of size n_features + 1 when using fit_intercept=True: "
+                    f"expected {n_features + 1}, got {len(w)}.")
+            else:
+                val_error_message = (
+                    "w should be of size n_features: "
+                    f"expected {n_features}, got {len(w)}.")
+            raise ValueError(val_error_message)
+
         for t in range(self.max_iter):
             # compute scores
             if is_sparse:
                 grad = _construct_grad_sparse(
-                    *X_bundles, y, w, Xw, datafit, all_features)
+                    *X_bundles, y, w[:n_features], Xw, datafit, all_features)
             else:
-                grad = _construct_grad(X, y, w, Xw, datafit, all_features)
 
-            opt = penalty.subdiff_distance(w, grad, all_features)
+                grad = _construct_grad(X, y, w[:n_features], Xw, datafit, all_features)
+
+            opt = penalty.subdiff_distance(w[:n_features], grad, all_features)
+
+            # optimality of intercept
+            if fit_intercept:
+                # gradient w.r.t. intercept (constant feature of ones)
+                intercept_opt = np.abs(np.sum(datafit.raw_grad(y, Xw)))
+            else:
+                intercept_opt = 0.
 
             # check convergence
-            stop_crit = np.max(opt)
+            stop_crit = max(np.max(opt), intercept_opt)
             if self.verbose:
-                p_obj = datafit.value(y, w, Xw) + penalty.value(w)
+                p_obj = datafit.value(y, w, Xw) + penalty.value(w[:n_features])
                 print(
                     "Iteration {}: {:.10f}, ".format(t + 1, p_obj) +
                     "stopping crit: {:.2e}".format(stop_crit)
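
Why `np.abs(np.sum(datafit.raw_grad(y, Xw)))` measures intercept optimality: as the patch's comment says, the intercept behaves like an unpenalized feature whose column is all ones, so the datafit's partial derivative with respect to it is the sum of the sample-wise raw gradients, which must vanish at an optimum. A self-contained finite-difference check on a quadratic datafit (the `raw_grad` here is a stand-in, not the library's class):

```python
import numpy as np

rng = np.random.default_rng(0)
n = 50
y, Xw, b = rng.normal(size=n), rng.normal(size=n), 0.3

def raw_grad(y, z):
    # sample-wise gradient of the quadratic datafit F(z) = ||y - z||^2 / (2n)
    return (z - y) / len(y)

def F(b):
    return np.sum((Xw + b - y) ** 2) / (2 * n)

eps = 1e-6
fd = (F(b + eps) - F(b - eps)) / (2 * eps)     # dF/db by central differences
assert np.isclose(np.sum(raw_grad(y, Xw + b)), fd)
```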
@@ -101,20 +123,22 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None):
             # find descent direction
             if is_sparse:
                 delta_w_ws, X_delta_w_ws = _descent_direction_s(
-                    *X_bundles, y, w, Xw, grad_ws, datafit,
+                    *X_bundles, y, w, Xw, fit_intercept, grad_ws, datafit,
                     penalty, ws, tol=EPS_TOL * tol_in)
             else:
                 delta_w_ws, X_delta_w_ws = _descent_direction(
-                    X, y, w, Xw, grad_ws, datafit, penalty, ws, tol=EPS_TOL * tol_in)
+                    X, y, w, Xw, fit_intercept, grad_ws, datafit,
+                    penalty, ws, tol=EPS_TOL * tol_in)
 
             # backtracking line search with inplace update of w, Xw
             if is_sparse:
                 grad_ws[:] = _backtrack_line_search_s(
-                    *X_bundles, y, w, Xw, datafit, penalty, delta_w_ws,
-                    X_delta_w_ws, ws)
+                    *X_bundles, y, w, Xw, fit_intercept, datafit, penalty,
+                    delta_w_ws, X_delta_w_ws, ws)
             else:
                 grad_ws[:] = _backtrack_line_search(
-                    X, y, w, Xw, datafit, penalty, delta_w_ws, X_delta_w_ws, ws)
+                    X, y, w, Xw, fit_intercept, datafit, penalty,
+                    delta_w_ws, X_delta_w_ws, ws)
 
             # check convergence
             opt_in = penalty.subdiff_distance(w, grad_ws, ws)
@@ -138,7 +162,7 @@ def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None):
 
 
 @njit
-def _descent_direction(X, y, w_epoch, Xw_epoch, grad_ws, datafit,
+def _descent_direction(X, y, w_epoch, Xw_epoch, fit_intercept, grad_ws, datafit,
                        penalty, ws, tol):
     # Given:
     # 1) b = \nabla F(X w_epoch)
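
For context (a gloss on the truncated comment above, not part of the patch): the inner solver minimizes the penalized quadratic model of the datafit around `w_epoch`, whose smooth part is `b @ (X @ delta) + 0.5 * (X @ delta) @ (raw_hess * (X @ delta))`, with `b` the raw gradient and `raw_hess` the diagonal Hessian. A toy finite-difference check of the per-coordinate gradient this model yields (hypothetical names):

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(20, 3))
b = rng.normal(size=20)                    # raw gradient at Xw_epoch
raw_hess = rng.uniform(0.1, 1.0, size=20)  # diagonal of the raw Hessian
delta = rng.normal(size=3)

def q(d):
    # smooth part of the quadratic model
    Xd = X @ d
    return b @ Xd + 0.5 * Xd @ (raw_hess * Xd)

j, eps = 1, 1e-6
e = np.zeros(3)
e[j] = eps
fd = (q(delta + e) - q(delta - e)) / (2 * eps)
analytic = X[:, j] @ b + X[:, j] @ (raw_hess * (X @ delta))
assert np.isclose(fd, analytic)
```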
@@ -152,11 +176,16 @@ def _descent_direction(X, y, w_epoch, Xw_epoch, grad_ws, datafit,
     for idx, j in enumerate(ws):
         lipschitz[idx] = raw_hess @ X[:, j] ** 2
 
-    # for a less costly stopping criterion, we do no compute the exact gradient,
-    # but store each coordinate-wise gradient every time we upate one coordinate:
+    # for a less costly stopping criterion, we do not compute the exact gradient,
+    # but store each coordinate-wise gradient every time we update one coordinate
     past_grads = np.zeros(len(ws))
     X_delta_w_ws = np.zeros(X.shape[0])
-    w_ws = w_epoch[ws]
+    ws_intercept = np.append(ws, -1) if fit_intercept else ws
+    w_ws = w_epoch[ws_intercept]
+
+    if fit_intercept:
+        lipschitz_intercept = np.sum(raw_hess)
+        grad_intercept = np.sum(datafit.raw_grad(y, Xw_epoch))
 
     for cd_iter in range(MAX_CD_ITER):
         for idx, j in enumerate(ws):
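
A note on the two intercept quantities just introduced: treating the intercept as an implicit column of ones, its curvature in the quadratic model is `ones @ (raw_hess * ones) = np.sum(raw_hess)`, the exact analogue of the per-feature rule `raw_hess @ X[:, j] ** 2`, and its gradient is likewise the sum of the raw gradients. A one-line sanity check of the reduction:

```python
import numpy as np

raw_hess = np.random.default_rng(0).uniform(0.1, 1.0, size=30)
ones_col = np.ones_like(raw_hess)          # the implicit intercept "feature"
assert np.isclose(raw_hess @ ones_col ** 2, np.sum(raw_hess))
```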
@@ -174,22 +203,35 @@ def _descent_direction(X, y, w_epoch, Xw_epoch, grad_ws, datafit,
             if w_ws[idx] != old_w_idx:
                 X_delta_w_ws += (w_ws[idx] - old_w_idx) * X[:, j]
 
+        if fit_intercept:
+            past_grads_intercept = grad_intercept + raw_hess @ X_delta_w_ws
+            old_intercept = w_ws[-1]
+            w_ws[-1] -= past_grads_intercept / lipschitz_intercept
+
+            if w_ws[-1] != old_intercept:
+                X_delta_w_ws += w_ws[-1] - old_intercept
+
         if cd_iter % 5 == 0:
             # TODO: can be improved by passing in w_ws but breaks for WeightedL1
             current_w = w_epoch.copy()
-            current_w[ws] = w_ws
+            current_w[ws_intercept] = w_ws
             opt = penalty.subdiff_distance(current_w, past_grads, ws)
-            if np.max(opt) <= tol:
+            stop_crit = np.max(opt)
+
+            if fit_intercept:
+                stop_crit = max(stop_crit, np.abs(past_grads_intercept))
+
+            if stop_crit <= tol:
                 break
 
     # descent direction
-    return w_ws - w_epoch[ws], X_delta_w_ws
+    return w_ws - w_epoch[ws_intercept], X_delta_w_ws
 
 
-# sparse version of _compute_descent_direction
+# sparse version of _descent_direction
 @njit
 def _descent_direction_s(X_data, X_indptr, X_indices, y, w_epoch,
-                         Xw_epoch, grad_ws, datafit, penalty, ws, tol):
+                         Xw_epoch, fit_intercept, grad_ws, datafit, penalty, ws, tol):
     raw_hess = datafit.raw_hessian(y, Xw_epoch)
 
     lipschitz = np.zeros(len(ws))
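
Two details of the intercept update in the hunk above worth spelling out: since the intercept is unpenalized, `w_ws[-1] -= past_grads_intercept / lipschitz_intercept` is a plain Newton step that exactly minimizes the 1-D quadratic model in the intercept; and because its feature column is all ones, the change in `X @ delta_w` is a constant shift, so adding the scalar to `X_delta_w_ws` broadcasts correctly. A minimal sketch of both points:

```python
import numpy as np

g, h = 0.8, 2.0                       # toy model gradient / curvature
shift = -g / h                        # Newton step on a 1-D quadratic
assert np.isclose(g + h * shift, 0.)  # model gradient vanishes afterwards

X_delta_w_ws = np.zeros(5)
X_delta_w_ws += shift                 # scalar broadcast == shift * ones(5)
assert np.allclose(X_delta_w_ws, shift * np.ones(5))
```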
@@ -201,7 +243,12 @@ def _descent_direction_s(X_data, X_indptr, X_indices, y, w_epoch,
     # see _descent_direction() comment
     past_grads = np.zeros(len(ws))
     X_delta_w_ws = np.zeros(len(y))
-    w_ws = w_epoch[ws]
+    ws_intercept = np.append(ws, -1) if fit_intercept else ws
+    w_ws = w_epoch[ws_intercept]
+
+    if fit_intercept:
+        lipschitz_intercept = np.sum(raw_hess)
+        grad_intercept = np.sum(datafit.raw_grad(y, Xw_epoch))
 
     for cd_iter in range(MAX_CD_ITER):
         for idx, j in enumerate(ws):
@@ -224,39 +271,57 @@ def _descent_direction_s(X_data, X_indptr, X_indices, y, w_epoch,
                 _update_X_delta_w(X_data, X_indptr, X_indices, X_delta_w_ws,
                                   w_ws[idx] - old_w_idx, j)
 
+        if fit_intercept:
+            past_grads_intercept = grad_intercept + raw_hess @ X_delta_w_ws
+            old_intercept = w_ws[-1]
+            w_ws[-1] -= past_grads_intercept / lipschitz_intercept
+
+            if w_ws[-1] != old_intercept:
+                X_delta_w_ws += w_ws[-1] - old_intercept
+
         if cd_iter % 5 == 0:
             # TODO: could be improved by passing in w_ws
             current_w = w_epoch.copy()
-            current_w[ws] = w_ws
+            current_w[ws_intercept] = w_ws
             opt = penalty.subdiff_distance(current_w, past_grads, ws)
-            if np.max(opt) <= tol:
+            stop_crit = np.max(opt)
+
+            if fit_intercept:
+                stop_crit = max(stop_crit, np.abs(past_grads_intercept))
+
+            if stop_crit <= tol:
                 break
 
     # descent direction
-    return w_ws - w_epoch[ws], X_delta_w_ws
+    return w_ws - w_epoch[ws_intercept], X_delta_w_ws
 
 
 @njit
-def _backtrack_line_search(X, y, w, Xw, datafit, penalty, delta_w_ws,
+def _backtrack_line_search(X, y, w, Xw, fit_intercept, datafit, penalty, delta_w_ws,
                            X_delta_w_ws, ws):
     # 1) find step in [0, 1] such that:
     #       penalty(w + step * delta_w) - penalty(w) +
     #       step * \nabla datafit(w + step * delta_w) @ delta_w < 0
     # ref: https://www.di.ens.fr/~aspremon/PDF/ENSAE/Newton.pdf
     # 2) inplace update of w and Xw and return grad_ws of the last w and Xw
     step, prev_step = 1., 0.
+    n_features = X.shape[1]
+    ws_intercept = np.append(ws, -1) if fit_intercept else ws
     # TODO: could be improved by passing in w[ws]
-    old_penalty_val = penalty.value(w)
+    old_penalty_val = penalty.value(w[:n_features])
 
     # try step = 1, 1/2, 1/4, ...
     for _ in range(MAX_BACKTRACK_ITER):
-        w[ws] += (step - prev_step) * delta_w_ws
+        w[ws_intercept] += (step - prev_step) * delta_w_ws
         Xw += (step - prev_step) * X_delta_w_ws
 
-        grad_ws = _construct_grad(X, y, w, Xw, datafit, ws)
+        grad_ws = _construct_grad(X, y, w[:n_features], Xw, datafit, ws)
         # TODO: could be improved by passing in w[ws]
-        stop_crit = penalty.value(w) - old_penalty_val
-        stop_crit += step * grad_ws @ delta_w_ws
+        stop_crit = penalty.value(w[:n_features]) - old_penalty_val
+        stop_crit += step * grad_ws @ delta_w_ws[:len(ws)]
+
+        if fit_intercept:
+            stop_crit += step * delta_w_ws[-1] * np.sum(datafit.raw_grad(y, Xw))
 
         if stop_crit < 0:
             break
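
For readers skimming the hunk, the search itself is unchanged: halve the step until the criterion from the comment in `_backtrack_line_search` turns negative; the patch only extends the directional term with the intercept's contribution `delta_w_ws[-1] * sum(raw_grad)`. A stripped-down sketch of the loop with hypothetical callables (`penalty_value`, `datafit_grad`), not the library API:

```python
import numpy as np

def halving_line_search(penalty_value, datafit_grad, w, delta_w, max_iter=20):
    """Try step = 1, 1/2, 1/4, ... until
    penalty(w + step*dw) - penalty(w) + step * grad(w + step*dw) @ dw < 0,
    updating w in place as the solver does."""
    step, prev_step = 1.0, 0.0
    old_penalty = penalty_value(w)
    for _ in range(max_iter):
        w += (step - prev_step) * delta_w  # move to the new trial point
        crit = penalty_value(w) - old_penalty + step * datafit_grad(w) @ delta_w
        if crit < 0:                       # sufficient decrease reached
            return step
        prev_step, step = step, step / 2
    return step
```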
@@ -272,21 +337,26 @@ def _backtrack_line_search(X, y, w, Xw, datafit, penalty, delta_w_ws,
 
 
 # sparse version of _backtrack_line_search
 @njit
-def _backtrack_line_search_s(X_data, X_indptr, X_indices, y, w, Xw, datafit,
-                             penalty, delta_w_ws, X_delta_w_ws, ws):
+def _backtrack_line_search_s(X_data, X_indptr, X_indices, y, w, Xw, fit_intercept,
+                             datafit, penalty, delta_w_ws, X_delta_w_ws, ws):
     step, prev_step = 1., 0.
+    n_features = len(X_indptr) - 1
+    ws_intercept = np.append(ws, -1) if fit_intercept else ws
     # TODO: could be improved by passing in w[ws]
-    old_penalty_val = penalty.value(w)
+    old_penalty_val = penalty.value(w[:n_features])
 
     for _ in range(MAX_BACKTRACK_ITER):
-        w[ws] += (step - prev_step) * delta_w_ws
+        w[ws_intercept] += (step - prev_step) * delta_w_ws
         Xw += (step - prev_step) * X_delta_w_ws
 
         grad_ws = _construct_grad_sparse(X_data, X_indptr, X_indices,
-                                         y, w, Xw, datafit, ws)
+                                         y, w[:n_features], Xw, datafit, ws)
         # TODO: could be improved by passing in w[ws]
-        stop_crit = penalty.value(w) - old_penalty_val
-        stop_crit += step * grad_ws.T @ delta_w_ws
+        stop_crit = penalty.value(w[:n_features]) - old_penalty_val
+        stop_crit += step * grad_ws.T @ delta_w_ws[:len(ws)]
+
+        if fit_intercept:
+            stop_crit += step * delta_w_ws[-1] * np.sum(datafit.raw_grad(y, Xw))
 
         if stop_crit < 0:
             break
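
One last small point: `len(X_indptr) - 1` recovers `n_features` because in CSC format `indptr` stores one offset per column plus a final terminator. A quick check with scipy:

```python
import numpy as np
from scipy.sparse import csc_matrix

X = csc_matrix(np.arange(12.).reshape(3, 4))
assert len(X.indptr) - 1 == X.shape[1] == 4
```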