@@ -59,10 +59,12 @@ def __init__(self, p0=10, max_iter=20, max_pn_iter=1000, tol=1e-4,
         self.verbose = verbose
 
     def solve(self, X, y, datafit, penalty, w_init=None, Xw_init=None):
+        dtype = X.dtype
         n_samples, n_features = X.shape
         fit_intercept = self.fit_intercept
-        w = np.zeros(n_features + fit_intercept) if w_init is None else w_init
-        Xw = np.zeros(n_samples) if Xw_init is None else Xw_init
+
+        w = np.zeros(n_features + fit_intercept, dtype) if w_init is None else w_init
+        Xw = np.zeros(n_samples, dtype) if Xw_init is None else Xw_init
         all_features = np.arange(n_features)
         stop_crit = 0.
         p_objs_out = []
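
A quick NumPy illustration, independent of this code, of why the explicit dtype matters: without it, w and Xw default to float64, and any later product with a float32 X silently upcasts intermediate results.

    import numpy as np

    X = np.random.randn(100, 10).astype(np.float32)

    w_default = np.zeros(10)               # float64, the previous behaviour
    w_typed = np.zeros(10, dtype=X.dtype)  # float32, matching X

    print((X @ w_default).dtype)  # float64 -- silent upcast
    print((X @ w_typed).dtype)    # float32 -- input dtype preserved
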
@@ -181,16 +183,17 @@ def _descent_direction(X, y, w_epoch, Xw_epoch, fit_intercept, grad_ws, datafit,
     # Minimize quadratic approximation for delta_w = w - w_epoch:
     # b.T @ X @ delta_w + \
     #     1/2 * delta_w.T @ (X.T @ D @ X) @ delta_w + penalty(w)
+    dtype = X.dtype
     raw_hess = datafit.raw_hessian(y, Xw_epoch)
 
-    lipschitz = np.zeros(len(ws))
+    lipschitz = np.zeros(len(ws), dtype)
     for idx, j in enumerate(ws):
         lipschitz[idx] = raw_hess @ X[:, j] ** 2
 
     # for a less costly stopping criterion, we do not compute the exact gradient,
     # but store each coordinate-wise gradient every time we update one coordinate
-    past_grads = np.zeros(len(ws))
-    X_delta_w_ws = np.zeros(X.shape[0])
+    past_grads = np.zeros(len(ws), dtype)
+    X_delta_w_ws = np.zeros(X.shape[0], dtype)
     ws_intercept = np.append(ws, -1) if fit_intercept else ws
     w_ws = w_epoch[ws_intercept]
 
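
The lipschitz entries computed above are the diagonal of the Hessian of the quadratic model, i.e. (X.T @ D @ X)[j, j] with D = diag(raw_hess). A small NumPy check of that identity, with raw_hess standing in for datafit.raw_hessian(y, Xw_epoch):

    import numpy as np

    rng = np.random.default_rng(0)
    X = rng.standard_normal((30, 5))
    raw_hess = rng.random(30)        # stand-in for datafit.raw_hessian(y, Xw_epoch)
    ws = np.array([0, 2, 3])         # working set

    lipschitz = np.zeros(len(ws), X.dtype)
    for idx, j in enumerate(ws):
        lipschitz[idx] = raw_hess @ X[:, j] ** 2

    H = X.T @ np.diag(raw_hess) @ X  # Hessian of the quadratic model
    assert np.allclose(lipschitz, np.diag(H)[ws])
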
@@ -243,17 +246,18 @@ def _descent_direction(X, y, w_epoch, Xw_epoch, fit_intercept, grad_ws, datafit,
 @njit
 def _descent_direction_s(X_data, X_indptr, X_indices, y, w_epoch,
                          Xw_epoch, fit_intercept, grad_ws, datafit, penalty, ws, tol):
+    dtype = X_data.dtype
     raw_hess = datafit.raw_hessian(y, Xw_epoch)
 
-    lipschitz = np.zeros(len(ws))
+    lipschitz = np.zeros(len(ws), dtype)
     for idx, j in enumerate(ws):
         # equivalent to: lipschitz[idx] += raw_hess * X[:, j] ** 2
         lipschitz[idx] = _sparse_squared_weighted_norm(
             X_data, X_indptr, X_indices, j, raw_hess)
 
     # see _descent_direction() comment
-    past_grads = np.zeros(len(ws))
-    X_delta_w_ws = np.zeros(Xw_epoch.shape[0])
+    past_grads = np.zeros(len(ws), dtype)
+    X_delta_w_ws = np.zeros(Xw_epoch.shape[0], dtype)
     ws_intercept = np.append(ws, -1) if fit_intercept else ws
     w_ws = w_epoch[ws_intercept]
 
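
The helper _sparse_squared_weighted_norm itself is outside this diff; as a hedged sketch of what the inline comment describes, it amounts to a weighted squared norm of column j computed from the CSC arrays (the function below is an illustrative reimplementation, not the library's code):

    import numpy as np
    from scipy import sparse

    def sparse_squared_weighted_norm(X_data, X_indptr, X_indices, j, weights):
        # equivalent to weights @ X[:, j] ** 2, visiting only column j's nonzeros
        total = 0.
        for ptr in range(X_indptr[j], X_indptr[j + 1]):
            total += weights[X_indices[ptr]] * X_data[ptr] ** 2
        return total

    X = sparse.random(30, 5, density=0.3, format="csc", random_state=0)
    weights = np.random.default_rng(0).random(30)
    dense_value = weights @ (X.toarray()[:, 2] ** 2)
    assert np.isclose(
        sparse_squared_weighted_norm(X.data, X.indptr, X.indices, 2, weights), dense_value)
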
@@ -329,7 +333,11 @@ def _backtrack_line_search(X, y, w, Xw, fit_intercept, datafit, penalty, delta_w
         grad_ws = _construct_grad(X, y, w[:n_features], Xw, datafit, ws)
         # TODO: could be improved by passing in w[ws]
         stop_crit = penalty.value(w[:n_features]) - old_penalty_val
-        stop_crit += step * grad_ws @ delta_w_ws[:len(ws)]
+
+        # it is mandatory to split the two operations, otherwise numba raises an error
+        # cf. https://github.com/numba/numba/issues/9025
+        dot = grad_ws @ delta_w_ws[:len(ws)]
+        stop_crit += step * dot
 
         if fit_intercept:
             stop_crit += step * delta_w_ws[-1] * np.sum(datafit.raw_grad(y, Xw))
@@ -364,7 +372,11 @@ def _backtrack_line_search_s(X_data, X_indptr, X_indices, y, w, Xw, fit_intercep
             y, w[:n_features], Xw, datafit, ws)
         # TODO: could be improved by passing in w[ws]
         stop_crit = penalty.value(w[:n_features]) - old_penalty_val
-        stop_crit += step * grad_ws.T @ delta_w_ws[:len(ws)]
+
+        # it is mandatory to split the two operations, otherwise numba raises an error
+        # cf. https://github.com/numba/numba/issues/9025
+        dot = grad_ws.T @ delta_w_ws[:len(ws)]
+        stop_crit += step * dot
 
         if fit_intercept:
             stop_crit += step * delta_w_ws[-1] * np.sum(datafit.raw_grad(y, Xw))
@@ -385,7 +397,7 @@ def _construct_grad(X, y, w, Xw, datafit, ws):
     # Compute grad of datafit restricted to ws. This function avoids
     # recomputing raw_grad for every j, which is costly for logreg
     raw_grad = datafit.raw_grad(y, Xw)
-    grad = np.zeros(len(ws))
+    grad = np.zeros(len(ws), dtype=X.dtype)
     for idx, j in enumerate(ws):
         grad[idx] = X[:, j] @ raw_grad
     return grad
@@ -395,7 +407,7 @@ def _construct_grad(X, y, w, Xw, datafit, ws):
 def _construct_grad_sparse(X_data, X_indptr, X_indices, y, w, Xw, datafit, ws):
     # Compute grad of datafit restricted to ws in case X sparse
     raw_grad = datafit.raw_grad(y, Xw)
-    grad = np.zeros(len(ws))
+    grad = np.zeros(len(ws), dtype=X_data.dtype)
     for idx, j in enumerate(ws):
         grad[idx] = _sparse_xj_dot(X_data, X_indptr, X_indices, j, raw_grad)
     return grad
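
To see what the patched _construct_grad computes, and that its output now follows X.dtype, a standalone NumPy sketch: the loop is equivalent to X[:, ws].T @ raw_grad restricted to the working set.

    import numpy as np

    rng = np.random.default_rng(0)
    X = rng.standard_normal((20, 8)).astype(np.float32)
    raw_grad = rng.standard_normal(20).astype(np.float32)  # stand-in for datafit.raw_grad(y, Xw)
    ws = np.array([1, 4, 6])                               # working set

    grad = np.zeros(len(ws), dtype=X.dtype)
    for idx, j in enumerate(ws):
        grad[idx] = X[:, j] @ raw_grad

    assert np.allclose(grad, X[:, ws].T @ raw_grad)
    assert grad.dtype == np.float32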