scikit-learn-contrib
diff --git a/examples/plot_lasso_vs_weighted.py b/examples/plot_lasso_vs_weighted.py
Lines changed: 2 additions & 1 deletion
diff --git a/skglm/datafits/group.py b/skglm/datafits/group.py
Lines changed: 3 additions & 0 deletions
diff --git a/skglm/datafits/multi_task.py b/skglm/datafits/multi_task.py
Lines changed: 3 additions & 0 deletions
diff --git a/skglm/datafits/single_task.py b/skglm/datafits/single_task.py
Lines changed: 33 additions & 16 deletions
diff --git a/skglm/estimators.py b/skglm/estimators.py
Lines changed: 29 additions & 18 deletions
@@ -34,7 +34,8 @@
 alpha_max = np.max(np.abs(X.T @ y)) / len(y)
 alpha = alpha_max / 10
 las = Lasso(alpha=alpha, fit_intercept=False).fit(X, y)
-wei = WeightedLasso(alpha=alpha, weights=norm(X, axis=0)).fit(X, y)
+wei = WeightedLasso(
+    alpha=alpha, weights=norm(X, axis=0), fit_intercept=False).fit(X, y)
 
 
 fig, axarr = plt.subplots(1, 3, sharey=True, figsize=(10, 2.4))
 
@@ -68,3 +68,6 @@ def gradient_g(self, X, y, w, Xw, g):
 
     def gradient_scalar(self, X, y, w, Xw, j):
         return X[:, j] @ (Xw - y) / len(y)
+
+    def intercept_update_step(self, y, Xw):
+        return np.mean(Xw - y)
@@ -91,3 +91,6 @@ def full_grad_sparse(self, X_data, X_indptr, X_indices, Y, XW):
                     XjTXW[t] += X_data[i] * XW[X_indices[i], t]
             grad[j, :] = (XjTXW - self.XtY[j, :]) / n_samples
         return grad
+
+    def intercept_update_step(self, Y, XW):
+        return np.sum(XW - Y, axis=0) / len(Y)
@@ -87,6 +87,9 @@ def full_grad_sparse(
             grad[j] = (XjTXw - self.Xty[j]) / n_samples
         return grad
 
+    def intercept_update_step(self, y, Xw):
+        return np.mean(Xw - y)
+
 
 @njit
 def sigmoid(x):
@@ -169,6 +172,9 @@ def gradient_scalar_sparse(self, X_data, X_indptr, X_indices, y, Xw, j):
             grad -= X_data[i] * y[idx_i] * sigmoid(- y[idx_i] * Xw[idx_i])
         return grad / len(Xw)
 
+    def intercept_update_step(self, y, Xw):
+        return np.mean(- y * sigmoid(- y * Xw)) / 4
+
 
 class QuadraticSVC(BaseDatafit):
     """A Quadratic SVC datafit used for classification tasks.
@@ -300,32 +306,32 @@ def value(self, y, w, Xw):
         n_samples = len(y)
         res = 0.
         for i in range(n_samples):
-            tmp = abs(y[i] - Xw[i])
-            if tmp < self.delta:
-                res += 0.5 * tmp ** 2
+            residual = abs(y[i] - Xw[i])
+            if residual < self.delta:
+                res += 0.5 * residual ** 2
             else:
-                res += self.delta * tmp - 0.5 * self.delta ** 2
+                res += self.delta * residual - 0.5 * self.delta ** 2
         return res / n_samples
 
     def gradient_scalar(self, X, y, w, Xw, j):
         n_samples = len(y)
         grad_j = 0.
         for i in range(n_samples):
-            tmp = y[i] - Xw[i]
-            if abs(tmp) < self.delta:
-                grad_j += - X[i, j] * tmp
+            residual = y[i] - Xw[i]
+            if abs(residual) < self.delta:
+                grad_j += - X[i, j] * residual
             else:
-                grad_j += - X[i, j] * np.sign(tmp) * self.delta
+                grad_j += - X[i, j] * np.sign(residual) * self.delta
         return grad_j / n_samples
 
     def gradient_scalar_sparse(self, X_data, X_indptr, X_indices, y, Xw, j):
         grad_j = 0.
         for i in range(X_indptr[j], X_indptr[j + 1]):
-            tmp = y[X_indices[i]] - Xw[X_indices[i]]
-            if np.abs(tmp) < self.delta:
-                grad_j += - X_data[i] * tmp
+            residual = y[X_indices[i]] - Xw[X_indices[i]]
+            if np.abs(residual) < self.delta:
+                grad_j += - X_data[i] * residual
             else:
-                grad_j += - X_data[i] * np.sign(tmp) * self.delta
+                grad_j += - X_data[i] * np.sign(residual) * self.delta
         return grad_j / len(Xw)
 
     def full_grad_sparse(
@@ -336,10 +342,21 @@ def full_grad_sparse(
         for j in range(n_features):
             grad_j = 0.
             for i in range(X_indptr[j], X_indptr[j + 1]):
-                tmp = y[X_indices[i]] - Xw[X_indices[i]]
-                if np.abs(tmp) < self.delta:
-                    grad_j += - X_data[i] * tmp
+                residual = y[X_indices[i]] - Xw[X_indices[i]]
+                if np.abs(residual) < self.delta:
+                    grad_j += - X_data[i] * residual
                 else:
-                    grad_j += - X_data[i] * np.sign(tmp) * self.delta
+                    grad_j += - X_data[i] * np.sign(residual) * self.delta
             grad[j] = grad_j / n_samples
         return grad
+
+    def intercept_update_step(self, y, Xw):
+        n_samples = len(y)
+        update = 0.
+        for i in range(n_samples):
+            residual = y[i] - Xw[i]
+            if abs(residual) < self.delta:
+                update -= residual
+            else:
+                update -= np.sign(residual) * self.delta
+        return update / n_samples
@@ -9,7 +9,7 @@
 from sklearn.utils import check_array, check_consistent_length
 from sklearn.linear_model import MultiTaskLasso as MultiTaskLasso_sklearn
 from sklearn.linear_model._base import (
-    _preprocess_data, LinearModel, RegressorMixin,
+    LinearModel, RegressorMixin,
     LinearClassifierMixin, SparseCoefMixin, BaseEstimator
 )
 from sklearn.utils.extmath import softmax
@@ -98,6 +98,8 @@ def _glm_fit(X, y, model, datafit, penalty):
     else:
         X_ = X
 
+    n_samples, n_features = X_.shape
+
     penalty_jit = compiled_clone(penalty)
     datafit_jit = compiled_clone(datafit, to_float32=X.dtype == np.float32)
     if issparse(X):
@@ -112,22 +114,24 @@ def _glm_fit(X, y, model, datafit, penalty):
             w = model.coef_[0, :].copy()
         else:
             w = model.coef_.copy()
-        Xw = X_ @ w
+        if model.fit_intercept:
+            w = np.hstack([w, model.intercept_])
+        Xw = X_ @ w[:w.shape[0] - model.fit_intercept] + model.fit_intercept * w[-1]
     else:
         # TODO this should be solver.get_init() to delegate the work
         if y.ndim == 1:
-            w = np.zeros(X_.shape[1], dtype=X_.dtype)
-            Xw = np.zeros(X_.shape[0], dtype=X_.dtype)
+            w = np.zeros(n_features + model.fit_intercept, dtype=X_.dtype)
+            Xw = np.zeros(n_samples, dtype=X_.dtype)
         else:  # multitask
-            w = np.zeros((X_.shape[1], y.shape[1]), dtype=X_.dtype)
+            w = np.zeros((n_features + model.fit_intercept, y.shape[1]), dtype=X_.dtype)
             Xw = np.zeros(y.shape, dtype=X_.dtype)
 
     # check consistency of weights for WeightedL1
     if isinstance(penalty, WeightedL1):
-        if len(penalty.weights) != X.shape[1]:
+        if len(penalty.weights) != n_features:
             raise ValueError(
-                "The size of the WeightedL1 penalty weights should be n_features, \
-                expected %i, got %i" % (X_.shape[1], len(penalty.weights)))
+                "The size of the WeightedL1 penalty weights should be n_features, "
+                "expected %i, got %i." % (X_.shape[1], len(penalty.weights)))
 
     if is_classif:
         solver = cd_solver  # TODO to be replaced by an instance of BaseSolver
@@ -141,15 +145,19 @@ def _glm_fit(X, y, model, datafit, penalty):
     coefs, p_obj, kkt = solver(
         X_, y, datafit_jit, penalty_jit, w, Xw, max_iter=model.max_iter,
         max_epochs=model.max_epochs, p0=model.p0,
-        tol=model.tol,  # ws_strategy=model.ws_strategy,
+        tol=model.tol, fit_intercept=model.fit_intercept,
         verbose=model.verbose)
+    model.coef_, model.stop_crit_ = coefs[:n_features], kkt
+    if y.ndim == 1:
+        model.intercept_ = coefs[-1] if model.fit_intercept else 0.
+    else:
+        model.intercept_ = coefs[-1, :] if model.fit_intercept else np.zeros(
+            y.shape[1])
 
-    model.coef_, model.stop_crit_ = coefs, kkt
     model.n_iter_ = len(p_obj)
-    model.intercept_ = 0.
 
     if is_classif and n_classes_ <= 2:
-        model.coef_ = coefs[np.newaxis, :]
+        model.coef_ = coefs[np.newaxis, :n_features]
         if isinstance(datafit, QuadraticSVC):
             if is_sparse:
                 primal_coef = ((yXT).multiply(model.coef_[0, :])).T
@@ -1212,6 +1220,7 @@ def fit(self, X, y):
     # TODO add predict_proba for LinearSVC
 
 
+# TODO we should no longer inherit from sklearn
 class MultiTaskLasso(MultiTaskLasso_sklearn):
     r"""MultiTaskLasso estimator.
 
@@ -1291,7 +1300,6 @@ def fit(self, X, Y):
         self :
             The fitted estimator.
         """
-        # TODO check if we could just patch `bcd_solver_path` as we do in Lasso case.
         # Below is copied from sklearn, with path replaced by our path.
         # Need to validate separately here.
         # We can't pass multi_output=True because that would allow y to be csr.
@@ -1312,9 +1320,10 @@ def fit(self, X, Y):
             raise ValueError("X and Y have inconsistent dimensions (%d != %d)"
                              % (n_samples, Y.shape[0]))
 
-        X, Y, X_offset, Y_offset, X_scale = _preprocess_data(
-            X, Y, self.fit_intercept, copy=False)
+        # X, Y, X_offset, Y_offset, X_scale = _preprocess_data(
+        #     X, Y, self.fit_intercept, copy=False)
 
+        # TODO handle and test warm start for MTL
         if not self.warm_start or not hasattr(self, "coef_"):
             self.coef_ = None
 
@@ -1324,9 +1333,10 @@ def fit(self, X, Y):
             max_epochs=self.max_epochs, p0=self.p0, verbose=self.verbose,
             tol=self.tol)
 
-        self.coef_, self.dual_gap_ = coefs[..., 0], kkt[-1]
+        self.coef_ = coefs[:, :X.shape[1], 0]
+        self.intercept_ = self.fit_intercept * coefs[:, -1, 0]
+        self.stopping_crit = kkt[-1]
         self.n_iter_ = len(kkt)
-        self._set_intercept(X_offset, Y_offset, X_scale)
 
         return self
 
@@ -1368,4 +1378,5 @@ def path(self, X, Y, alphas, coef_init=None, **params):
         penalty = compiled_clone(self.penalty)
 
         return multitask_bcd_solver_path(X, Y, datafit, penalty, alphas=alphas,
-                                         coef_init=coef_init, **params)
+                                         coef_init=coef_init,
+                                         fit_intercept=self.fit_intercept, tol=self.tol)