From 82fc26d1615a574d17f38846c38ef953b39c8a0f Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 6 Oct 2025 10:10:12 -0700 Subject: [PATCH 01/87] refactor: move/delete some methods in neighbors.py --- sklearnex/neighbors/common.py | 68 +++++++++++++++++++++++ sklearnex/neighbors/knn_classification.py | 15 +++-- sklearnex/neighbors/knn_regression.py | 18 ++++-- sklearnex/neighbors/knn_unsupervised.py | 11 +++- 4 files changed, 101 insertions(+), 11 deletions(-) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index ed48c48e77..a3ee1df86b 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -35,6 +35,74 @@ class KNeighborsDispatchingBase(oneDALEstimator): + + def _parse_auto_method(self, method, n_samples, n_features): + """Parse auto method selection for neighbors algorithm.""" + result_method = method + + if method in ["auto", "ball_tree"]: + condition = ( + self.n_neighbors is not None and self.n_neighbors >= n_samples // 2 + ) + if self.metric == "precomputed" or n_features > 15 or condition: + result_method = "brute" + else: + if self.metric == "euclidean": + result_method = "kd_tree" + else: + result_method = "brute" + + return result_method + + def _get_weights(self, dist, weights): + """Get weights for neighbors based on distance and weights parameter.""" + if weights in (None, "uniform"): + return None + if weights == "distance": + # if user attempts to classify a point that was zero distance from one + # or more training points, those training points are weighted as 1.0 + # and the other points as 0.0 + if dist.dtype is np.dtype(object): + for point_dist_i, point_dist in enumerate(dist): + # check if point_dist is iterable + # (ex: RadiusNeighborClassifier.predict may set an element of + # dist to 1e-6 to represent an 'outlier') + if hasattr(point_dist, "__contains__") and 0.0 in point_dist: + dist[point_dist_i] = point_dist == 0.0 + else: + dist[point_dist_i] = 1.0 / point_dist + else: + with np.errstate(divide="ignore"): + dist = 1.0 / dist + inf_mask = np.isinf(dist) + inf_row = np.any(inf_mask, axis=1) + dist[inf_row] = inf_mask[inf_row] + return dist + elif callable(weights): + return weights(dist) + else: + raise ValueError( + "weights not recognized: should be 'uniform', " + "'distance', or a callable function" + ) + + def _validate_targets(self, y, dtype): + """Validate and convert target values.""" + from onedal.utils.validation import _column_or_1d + arr = _column_or_1d(y, warn=True) + + try: + return arr.astype(dtype, copy=False) + except ValueError: + return arr + + def _validate_n_classes(self): + """Validate that we have at least 2 classes for classification.""" + length = 0 if self.classes_ is None else len(self.classes_) + if length < 2: + raise ValueError( + f"The number of classes has to be greater than one; got {length}" + ) def _fit_validation(self, X, y=None): if sklearn_check_version("1.2"): self._validate_params() diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index 7e25fa5ae1..68424f2bee 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -14,6 +14,7 @@ # limitations under the License. 
# =============================================================================== +import numpy as np from sklearn.metrics import accuracy_score from sklearn.neighbors._classification import ( KNeighborsClassifier as _sklearn_KNeighborsClassifier, @@ -24,6 +25,8 @@ from daal4py.sklearn._utils import sklearn_check_version from daal4py.sklearn.utils.validation import get_requires_y_tag from onedal.neighbors import KNeighborsClassifier as onedal_KNeighborsClassifier +from onedal.utils.validation import _check_X_y, _check_classification_targets, _check_n_features +from onedal.common._estimator_checks import _is_classifier from .._device_offload import dispatch, wrap_output_data from ..utils.validation import check_feature_names @@ -141,16 +144,20 @@ def _onedal_fit(self, X, y, queue=None): onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, - "algorithm": self.algorithm, + "algorithm": self._fit_method, # Use parsed method "metric": self.effective_metric_, - "p": self.effective_metric_params_["p"], + "p": self.effective_metric_params_["p"] if self.effective_metric_params_ else 2, } self._onedal_estimator = onedal_KNeighborsClassifier(**onedal_params) - self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ - self._onedal_estimator.fit(X, y, queue=queue) + self._onedal_estimator._fit_method = self._fit_method + self._onedal_estimator.classes_ = self.classes_ + + # Prepare y for onedal + fit_y = self._validate_targets(processed_y, X.dtype).reshape((-1, 1)) + self._onedal_estimator.fit(X, fit_y, queue=queue) self._save_attributes() diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index ba1626b4ff..8d0ed23c53 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -14,6 +14,7 @@ # limitations under the License. 
# ============================================================================== +import numpy as np from sklearn.metrics import r2_score from sklearn.neighbors._regression import ( KNeighborsRegressor as _sklearn_KNeighborsRegressor, @@ -24,6 +25,8 @@ from daal4py.sklearn._utils import sklearn_check_version from daal4py.sklearn.utils.validation import get_requires_y_tag from onedal.neighbors import KNeighborsRegressor as onedal_KNeighborsRegressor +from onedal.utils.validation import _check_X_y, _check_n_features +from onedal.common._estimator_checks import _is_regressor from .._device_offload import dispatch, wrap_output_data from ..utils.validation import check_feature_names @@ -125,16 +128,23 @@ def _onedal_fit(self, X, y, queue=None): onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, - "algorithm": self.algorithm, + "algorithm": self._fit_method, # Use parsed method "metric": self.effective_metric_, - "p": self.effective_metric_params_["p"], + "p": self.effective_metric_params_["p"] if self.effective_metric_params_ else 2, } self._onedal_estimator = onedal_KNeighborsRegressor(**onedal_params) - self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ - self._onedal_estimator.fit(X, y, queue=queue) + self._onedal_estimator._fit_method = self._fit_method + + # For regression, prepare y data + fit_y = self._validate_targets(y, X.dtype).reshape((-1, 1)) + self._onedal_estimator.fit(X, fit_y, queue=queue) + + # Reshape y back if needed + if self._shape is not None: + self._y = np.reshape(y, self._shape) self._save_attributes() diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 80da8bb2cf..3c4dd62a40 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -14,6 +14,7 @@ # limitations under the License. 
# =============================================================================== +import numpy as np from sklearn.neighbors._unsupervised import NearestNeighbors as _sklearn_NearestNeighbors from sklearn.utils.validation import _deprecate_positional_args, check_is_fitted @@ -21,6 +22,7 @@ from daal4py.sklearn._utils import sklearn_check_version from daal4py.sklearn.utils.validation import get_requires_y_tag from onedal.neighbors import NearestNeighbors as onedal_NearestNeighbors +from onedal.utils.validation import _check_array, _check_n_features from .._device_offload import dispatch, wrap_output_data from ..utils.validation import check_feature_names @@ -131,15 +133,18 @@ def radius_neighbors_graph( def _onedal_fit(self, X, y=None, queue=None): onedal_params = { "n_neighbors": self.n_neighbors, - "algorithm": self.algorithm, + "algorithm": self._fit_method, # Use parsed method "metric": self.effective_metric_, - "p": self.effective_metric_params_["p"], + "p": self.effective_metric_params_["p"] if self.effective_metric_params_ else 2, } self._onedal_estimator = onedal_NearestNeighbors(**onedal_params) - self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ + self._onedal_estimator._fit_method = self._fit_method + self._onedal_estimator.fit(X, y, queue=queue) + + self._save_attributes() self._onedal_estimator.fit(X, y, queue=queue) self._save_attributes() From 325753c1b045afa0547f663d7903b91c0c0cf5d7 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 6 Oct 2025 16:13:50 -0700 Subject: [PATCH 02/87] fix: try it again --- onedal/neighbors/neighbors.py | 12 ++-- sklearnex/neighbors/_lof.py | 2 +- sklearnex/neighbors/common.py | 70 +---------------------- sklearnex/neighbors/knn_classification.py | 24 ++++---- sklearnex/neighbors/knn_regression.py | 20 ++----- sklearnex/neighbors/knn_unsupervised.py | 13 ++--- 6 files changed, 31 insertions(+), 110 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index e952dddebf..313a6253a9 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -269,7 +269,6 @@ def _fit(self, X, y): return result def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): - use_raw_input = _get_config().get("use_raw_input", False) is True n_features = getattr(self, "n_features_in_", None) shape = getattr(X, "shape", None) if n_features and shape and len(shape) > 1 and shape[1] != n_features: @@ -296,8 +295,12 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): if X is not None: query_is_train = False +<<<<<<< HEAD if not use_raw_input: X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) +======= + X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) +>>>>>>> e003b37f (fix: try it again) else: query_is_train = True X = self._fit_X @@ -646,6 +649,7 @@ def __init__( self, n_neighbors=5, *, + weights="uniform", algorithm="auto", p=2, metric="minkowski", @@ -660,7 +664,7 @@ def __init__( metric_params=metric_params, **kwargs, ) - self.requires_y = False + self.weights = weights @bind_default_backend("neighbors.search") def train(self, *args, **kwargs): ... 
@@ -682,9 +686,9 @@ def _onedal_predict(self, model, X, params): return self.infer(params, model, X) @supports_queue - def fit(self, X, y=None, queue=None): + def fit(self, X, y, queue=None): return self._fit(X, y) @supports_queue def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): - return self._kneighbors(X, n_neighbors, return_distance) + return self._kneighbors(X, n_neighbors, return_distance) \ No newline at end of file diff --git a/sklearnex/neighbors/_lof.py b/sklearnex/neighbors/_lof.py index 63a98164e7..7c115ce9c5 100644 --- a/sklearnex/neighbors/_lof.py +++ b/sklearnex/neighbors/_lof.py @@ -186,4 +186,4 @@ def score_samples(self, X): return -np.mean(lrd_ratios_array, axis=1) fit.__doc__ = _sklearn_LocalOutlierFactor.fit.__doc__ - kneighbors.__doc__ = _sklearn_LocalOutlierFactor.kneighbors.__doc__ + kneighbors.__doc__ = _sklearn_LocalOutlierFactor.kneighbors.__doc__ \ No newline at end of file diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index a3ee1df86b..11407ce8b4 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -35,74 +35,6 @@ class KNeighborsDispatchingBase(oneDALEstimator): - - def _parse_auto_method(self, method, n_samples, n_features): - """Parse auto method selection for neighbors algorithm.""" - result_method = method - - if method in ["auto", "ball_tree"]: - condition = ( - self.n_neighbors is not None and self.n_neighbors >= n_samples // 2 - ) - if self.metric == "precomputed" or n_features > 15 or condition: - result_method = "brute" - else: - if self.metric == "euclidean": - result_method = "kd_tree" - else: - result_method = "brute" - - return result_method - - def _get_weights(self, dist, weights): - """Get weights for neighbors based on distance and weights parameter.""" - if weights in (None, "uniform"): - return None - if weights == "distance": - # if user attempts to classify a point that was zero distance from one - # or more training points, those training points are weighted as 1.0 - # and the other points as 0.0 - if dist.dtype is np.dtype(object): - for point_dist_i, point_dist in enumerate(dist): - # check if point_dist is iterable - # (ex: RadiusNeighborClassifier.predict may set an element of - # dist to 1e-6 to represent an 'outlier') - if hasattr(point_dist, "__contains__") and 0.0 in point_dist: - dist[point_dist_i] = point_dist == 0.0 - else: - dist[point_dist_i] = 1.0 / point_dist - else: - with np.errstate(divide="ignore"): - dist = 1.0 / dist - inf_mask = np.isinf(dist) - inf_row = np.any(inf_mask, axis=1) - dist[inf_row] = inf_mask[inf_row] - return dist - elif callable(weights): - return weights(dist) - else: - raise ValueError( - "weights not recognized: should be 'uniform', " - "'distance', or a callable function" - ) - - def _validate_targets(self, y, dtype): - """Validate and convert target values.""" - from onedal.utils.validation import _column_or_1d - arr = _column_or_1d(y, warn=True) - - try: - return arr.astype(dtype, copy=False) - except ValueError: - return arr - - def _validate_n_classes(self): - """Validate that we have at least 2 classes for classification.""" - length = 0 if self.classes_ is None else len(self.classes_) - if length < 2: - raise ValueError( - f"The number of classes has to be greater than one; got {length}" - ) def _fit_validation(self, X, y=None): if sklearn_check_version("1.2"): self._validate_params() @@ -378,4 +310,4 @@ def kneighbors_graph(self, X=None, n_neighbors=None, mode="connectivity"): return kneighbors_graph - 
kneighbors_graph.__doc__ = KNeighborsMixin.kneighbors_graph.__doc__ + kneighbors_graph.__doc__ = KNeighborsMixin.kneighbors_graph.__doc__ \ No newline at end of file diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index 68424f2bee..e3f516d932 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -14,7 +14,6 @@ # limitations under the License. # =============================================================================== -import numpy as np from sklearn.metrics import accuracy_score from sklearn.neighbors._classification import ( KNeighborsClassifier as _sklearn_KNeighborsClassifier, @@ -25,8 +24,6 @@ from daal4py.sklearn._utils import sklearn_check_version from daal4py.sklearn.utils.validation import get_requires_y_tag from onedal.neighbors import KNeighborsClassifier as onedal_KNeighborsClassifier -from onedal.utils.validation import _check_X_y, _check_classification_targets, _check_n_features -from onedal.common._estimator_checks import _is_classifier from .._device_offload import dispatch, wrap_output_data from ..utils.validation import check_feature_names @@ -141,23 +138,26 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): ) def _onedal_fit(self, X, y, queue=None): +<<<<<<< HEAD +======= + # import sys + # print("=" * 50, file=sys.stderr, flush=True) + # print("DEBUG: _onedal_fit called!", file=sys.stderr, flush=True) + # print("=" * 50, file=sys.stderr, flush=True) +>>>>>>> e003b37f (fix: try it again) onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, - "algorithm": self._fit_method, # Use parsed method + "algorithm": self.algorithm, "metric": self.effective_metric_, - "p": self.effective_metric_params_["p"] if self.effective_metric_params_ else 2, + "p": self.effective_metric_params_["p"], } self._onedal_estimator = onedal_KNeighborsClassifier(**onedal_params) + self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ - self._onedal_estimator._fit_method = self._fit_method - self._onedal_estimator.classes_ = self.classes_ - - # Prepare y for onedal - fit_y = self._validate_targets(processed_y, X.dtype).reshape((-1, 1)) - self._onedal_estimator.fit(X, fit_y, queue=queue) + self._onedal_estimator.fit(X, y, queue=queue) self._save_attributes() @@ -193,4 +193,4 @@ def _save_attributes(self): predict.__doc__ = _sklearn_KNeighborsClassifier.predict.__doc__ predict_proba.__doc__ = _sklearn_KNeighborsClassifier.predict_proba.__doc__ score.__doc__ = _sklearn_KNeighborsClassifier.score.__doc__ - kneighbors.__doc__ = _sklearn_KNeighborsClassifier.kneighbors.__doc__ + kneighbors.__doc__ = _sklearn_KNeighborsClassifier.kneighbors.__doc__ \ No newline at end of file diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 8d0ed23c53..502dba72c6 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -14,7 +14,6 @@ # limitations under the License. 
# ============================================================================== -import numpy as np from sklearn.metrics import r2_score from sklearn.neighbors._regression import ( KNeighborsRegressor as _sklearn_KNeighborsRegressor, @@ -25,8 +24,6 @@ from daal4py.sklearn._utils import sklearn_check_version from daal4py.sklearn.utils.validation import get_requires_y_tag from onedal.neighbors import KNeighborsRegressor as onedal_KNeighborsRegressor -from onedal.utils.validation import _check_X_y, _check_n_features -from onedal.common._estimator_checks import _is_regressor from .._device_offload import dispatch, wrap_output_data from ..utils.validation import check_feature_names @@ -128,23 +125,16 @@ def _onedal_fit(self, X, y, queue=None): onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, - "algorithm": self._fit_method, # Use parsed method + "algorithm": self.algorithm, "metric": self.effective_metric_, - "p": self.effective_metric_params_["p"] if self.effective_metric_params_ else 2, + "p": self.effective_metric_params_["p"], } self._onedal_estimator = onedal_KNeighborsRegressor(**onedal_params) + self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ - self._onedal_estimator._fit_method = self._fit_method - - # For regression, prepare y data - fit_y = self._validate_targets(y, X.dtype).reshape((-1, 1)) - self._onedal_estimator.fit(X, fit_y, queue=queue) - - # Reshape y back if needed - if self._shape is not None: - self._y = np.reshape(y, self._shape) + self._onedal_estimator.fit(X, y, queue=queue) self._save_attributes() @@ -174,4 +164,4 @@ def _save_attributes(self): fit.__doc__ = _sklearn_KNeighborsRegressor.__doc__ predict.__doc__ = _sklearn_KNeighborsRegressor.predict.__doc__ kneighbors.__doc__ = _sklearn_KNeighborsRegressor.kneighbors.__doc__ - score.__doc__ = _sklearn_KNeighborsRegressor.score.__doc__ + score.__doc__ = _sklearn_KNeighborsRegressor.score.__doc__ \ No newline at end of file diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 3c4dd62a40..19706f812d 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -14,7 +14,6 @@ # limitations under the License. 
# =============================================================================== -import numpy as np from sklearn.neighbors._unsupervised import NearestNeighbors as _sklearn_NearestNeighbors from sklearn.utils.validation import _deprecate_positional_args, check_is_fitted @@ -22,7 +21,6 @@ from daal4py.sklearn._utils import sklearn_check_version from daal4py.sklearn.utils.validation import get_requires_y_tag from onedal.neighbors import NearestNeighbors as onedal_NearestNeighbors -from onedal.utils.validation import _check_array, _check_n_features from .._device_offload import dispatch, wrap_output_data from ..utils.validation import check_feature_names @@ -133,18 +131,15 @@ def radius_neighbors_graph( def _onedal_fit(self, X, y=None, queue=None): onedal_params = { "n_neighbors": self.n_neighbors, - "algorithm": self._fit_method, # Use parsed method + "algorithm": self.algorithm, "metric": self.effective_metric_, - "p": self.effective_metric_params_["p"] if self.effective_metric_params_ else 2, + "p": self.effective_metric_params_["p"], } self._onedal_estimator = onedal_NearestNeighbors(**onedal_params) + self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ - self._onedal_estimator._fit_method = self._fit_method - self._onedal_estimator.fit(X, y, queue=queue) - - self._save_attributes() self._onedal_estimator.fit(X, y, queue=queue) self._save_attributes() @@ -172,4 +167,4 @@ def _save_attributes(self): radius_neighbors.__doc__ = _sklearn_NearestNeighbors.radius_neighbors.__doc__ radius_neighbors_graph.__doc__ = ( _sklearn_NearestNeighbors.radius_neighbors_graph.__doc__ - ) + ) \ No newline at end of file From d17bb340bc6378a6995569350b78e53de65a4db5 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 6 Oct 2025 16:16:46 -0700 Subject: [PATCH 03/87] fix: try it again --- sklearnex/neighbors/common.py | 76 ++++++++++++++++++++++++++++++++-- sklearnex/tests/test_common.py | 2 +- 2 files changed, 73 insertions(+), 5 deletions(-) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 11407ce8b4..8013098247 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -31,14 +31,81 @@ from .._utils import PatchingConditionsChain from ..base import oneDALEstimator from ..utils._array_api import get_namespace -from ..utils.validation import check_feature_names class KNeighborsDispatchingBase(oneDALEstimator): + + def _parse_auto_method(self, method, n_samples, n_features): + """Parse auto method selection for neighbors algorithm.""" + result_method = method + + if method in ["auto", "ball_tree"]: + condition = ( + self.n_neighbors is not None and self.n_neighbors >= n_samples // 2 + ) + if self.metric == "precomputed" or n_features > 15 or condition: + result_method = "brute" + else: + if self.metric == "euclidean": + result_method = "kd_tree" + else: + result_method = "brute" + + return result_method + + def _get_weights(self, dist, weights): + """Get weights for neighbors based on distance and weights parameter.""" + if weights in (None, "uniform"): + return None + if weights == "distance": + # if user attempts to classify a point that was zero distance from one + # or more training points, those training points are weighted as 1.0 + # and the other points as 0.0 + if dist.dtype is np.dtype(object): + for point_dist_i, point_dist in enumerate(dist): + # check if point_dist is iterable + # (ex: 
RadiusNeighborClassifier.predict may set an element of + # dist to 1e-6 to represent an 'outlier') + if hasattr(point_dist, "__contains__") and 0.0 in point_dist: + dist[point_dist_i] = point_dist == 0.0 + else: + dist[point_dist_i] = 1.0 / point_dist + else: + with np.errstate(divide="ignore"): + dist = 1.0 / dist + inf_mask = np.isinf(dist) + inf_row = np.any(inf_mask, axis=1) + dist[inf_row] = inf_mask[inf_row] + return dist + elif callable(weights): + return weights(dist) + else: + raise ValueError( + "weights not recognized: should be 'uniform', " + "'distance', or a callable function" + ) + + def _validate_targets(self, y, dtype): + """Validate and convert target values.""" + from onedal.utils.validation import _column_or_1d + arr = _column_or_1d(y, warn=True) + + try: + return arr.astype(dtype, copy=False) + except ValueError: + return arr + + def _validate_n_classes(self): + """Validate that we have at least 2 classes for classification.""" + length = 0 if self.classes_ is None else len(self.classes_) + if length < 2: + raise ValueError( + f"The number of classes has to be greater than one; got {length}" + ) def _fit_validation(self, X, y=None): if sklearn_check_version("1.2"): self._validate_params() - check_feature_names(self, X, reset=True) + if self.metric_params is not None and "p" in self.metric_params: if self.p is not None: warnings.warn( @@ -67,8 +134,9 @@ def _fit_validation(self, X, y=None): self.effective_metric_ = "chebyshev" if not isinstance(X, (KDTree, BallTree, _sklearn_NeighborsBase)): + xp, _ = get_namespace(X) self._fit_X = _check_array( - X, dtype=[np.float64, np.float32], accept_sparse=True + X, dtype=[xp.float64, xp.float32], accept_sparse=True ) self.n_samples_fit_ = _num_samples(self._fit_X) self.n_features_in_ = _num_features(self._fit_X) @@ -310,4 +378,4 @@ def kneighbors_graph(self, X=None, n_neighbors=None, mode="connectivity"): return kneighbors_graph - kneighbors_graph.__doc__ = KNeighborsMixin.kneighbors_graph.__doc__ \ No newline at end of file + kneighbors_graph.__doc__ = KNeighborsMixin.kneighbors_graph.__doc__ diff --git a/sklearnex/tests/test_common.py b/sklearnex/tests/test_common.py index d8e3cb8188..a0b1d90476 100644 --- a/sklearnex/tests/test_common.py +++ b/sklearnex/tests/test_common.py @@ -601,4 +601,4 @@ def test_estimator(estimator, method, design_pattern, estimator_trace): if key in _DESIGN_RULE_VIOLATIONS: pytest.xfail(_DESIGN_RULE_VIOLATIONS[key]) else: - raise + raise \ No newline at end of file From 0e8b4c66b949645010f4984228db9b5c0a02c97a Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 6 Oct 2025 16:22:41 -0700 Subject: [PATCH 04/87] fix: try it again --- onedal/neighbors/neighbors.py | 2 +- sklearnex/neighbors/common.py | 76 ++--------------------------------- 2 files changed, 5 insertions(+), 73 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index 313a6253a9..6114346b37 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -691,4 +691,4 @@ def fit(self, X, y, queue=None): @supports_queue def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): - return self._kneighbors(X, n_neighbors, return_distance) \ No newline at end of file + return self._kneighbors(X, n_neighbors, return_distance) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 8013098247..11407ce8b4 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -31,81 +31,14 @@ from .._utils import PatchingConditionsChain 
from ..base import oneDALEstimator from ..utils._array_api import get_namespace +from ..utils.validation import check_feature_names class KNeighborsDispatchingBase(oneDALEstimator): - - def _parse_auto_method(self, method, n_samples, n_features): - """Parse auto method selection for neighbors algorithm.""" - result_method = method - - if method in ["auto", "ball_tree"]: - condition = ( - self.n_neighbors is not None and self.n_neighbors >= n_samples // 2 - ) - if self.metric == "precomputed" or n_features > 15 or condition: - result_method = "brute" - else: - if self.metric == "euclidean": - result_method = "kd_tree" - else: - result_method = "brute" - - return result_method - - def _get_weights(self, dist, weights): - """Get weights for neighbors based on distance and weights parameter.""" - if weights in (None, "uniform"): - return None - if weights == "distance": - # if user attempts to classify a point that was zero distance from one - # or more training points, those training points are weighted as 1.0 - # and the other points as 0.0 - if dist.dtype is np.dtype(object): - for point_dist_i, point_dist in enumerate(dist): - # check if point_dist is iterable - # (ex: RadiusNeighborClassifier.predict may set an element of - # dist to 1e-6 to represent an 'outlier') - if hasattr(point_dist, "__contains__") and 0.0 in point_dist: - dist[point_dist_i] = point_dist == 0.0 - else: - dist[point_dist_i] = 1.0 / point_dist - else: - with np.errstate(divide="ignore"): - dist = 1.0 / dist - inf_mask = np.isinf(dist) - inf_row = np.any(inf_mask, axis=1) - dist[inf_row] = inf_mask[inf_row] - return dist - elif callable(weights): - return weights(dist) - else: - raise ValueError( - "weights not recognized: should be 'uniform', " - "'distance', or a callable function" - ) - - def _validate_targets(self, y, dtype): - """Validate and convert target values.""" - from onedal.utils.validation import _column_or_1d - arr = _column_or_1d(y, warn=True) - - try: - return arr.astype(dtype, copy=False) - except ValueError: - return arr - - def _validate_n_classes(self): - """Validate that we have at least 2 classes for classification.""" - length = 0 if self.classes_ is None else len(self.classes_) - if length < 2: - raise ValueError( - f"The number of classes has to be greater than one; got {length}" - ) def _fit_validation(self, X, y=None): if sklearn_check_version("1.2"): self._validate_params() - + check_feature_names(self, X, reset=True) if self.metric_params is not None and "p" in self.metric_params: if self.p is not None: warnings.warn( @@ -134,9 +67,8 @@ def _fit_validation(self, X, y=None): self.effective_metric_ = "chebyshev" if not isinstance(X, (KDTree, BallTree, _sklearn_NeighborsBase)): - xp, _ = get_namespace(X) self._fit_X = _check_array( - X, dtype=[xp.float64, xp.float32], accept_sparse=True + X, dtype=[np.float64, np.float32], accept_sparse=True ) self.n_samples_fit_ = _num_samples(self._fit_X) self.n_features_in_ = _num_features(self._fit_X) @@ -378,4 +310,4 @@ def kneighbors_graph(self, X=None, n_neighbors=None, mode="connectivity"): return kneighbors_graph - kneighbors_graph.__doc__ = KNeighborsMixin.kneighbors_graph.__doc__ + kneighbors_graph.__doc__ = KNeighborsMixin.kneighbors_graph.__doc__ \ No newline at end of file From 9dda937ea7060f48f43142bd6bd95ac9f05edec8 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 6 Oct 2025 16:47:18 -0700 Subject: [PATCH 05/87] fix: first round of refactor move preprocssing function to sklearnex --- onedal/neighbors/neighbors.py | 227 
+++------------------- sklearnex/neighbors/_lof.py | 14 +- sklearnex/neighbors/common.py | 135 ++++++++++++- sklearnex/neighbors/knn_classification.py | 38 +++- sklearnex/neighbors/knn_regression.py | 39 +++- sklearnex/neighbors/knn_unsupervised.py | 23 ++- 6 files changed, 263 insertions(+), 213 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index 6114346b37..39ffe4dd9b 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -15,7 +15,6 @@ # ============================================================================== from abc import ABCMeta, abstractmethod -from numbers import Integral import numpy as np @@ -28,14 +27,7 @@ from ..common._mixin import ClassifierMixin, RegressorMixin from ..datatypes import from_table, to_table from ..utils._array_api import _get_sycl_namespace -from ..utils.validation import ( - _check_array, - _check_classification_targets, - _check_n_features, - _check_X_y, - _column_or_1d, - _num_samples, -) +from ..utils.validation import _num_samples class NeighborsCommonBase(metaclass=ABCMeta): @@ -50,23 +42,6 @@ def __init__(self): self.effective_metric_params_ = None self._onedal_model = None - def _parse_auto_method(self, method, n_samples, n_features): - result_method = method - - if method in ["auto", "ball_tree"]: - condition = ( - self.n_neighbors is not None and self.n_neighbors >= n_samples // 2 - ) - if self.metric == "precomputed" or n_features > 15 or condition: - result_method = "brute" - else: - if self.metric == "euclidean": - result_method = "kd_tree" - else: - result_method = "brute" - - return result_method - @abstractmethod def train(self, *args, **kwargs): ... @@ -76,66 +51,6 @@ def infer(self, *args, **kwargs): ... @abstractmethod def _onedal_fit(self, X, y): ... - def _validate_data( - self, X, y=None, reset=True, validate_separately=None, **check_params - ): - if y is None: - if self.requires_y: - raise ValueError( - f"This {self.__class__.__name__} estimator " - f"requires y to be passed, but the target y is None." 
- ) - X = _check_array(X, **check_params) - out = X, y - else: - if validate_separately: - # We need this because some estimators validate X and y - # separately, and in general, separately calling _check_array() - # on X and y isn't equivalent to just calling _check_X_y() - # :( - check_X_params, check_y_params = validate_separately - X = _check_array(X, **check_X_params) - y = _check_array(y, **check_y_params) - else: - X, y = _check_X_y(X, y, **check_params) - out = X, y - - if check_params.get("ensure_2d", True): - _check_n_features(self, X, reset=reset) - - return out - - def _get_weights(self, dist, weights): - if weights in (None, "uniform"): - return None - if weights == "distance": - # if user attempts to classify a point that was zero distance from one - # or more training points, those training points are weighted as 1.0 - # and the other points as 0.0 - if dist.dtype is np.dtype(object): - for point_dist_i, point_dist in enumerate(dist): - # check if point_dist is iterable - # (ex: RadiusNeighborClassifier.predict may set an element of - # dist to 1e-6 to represent an 'outlier') - if hasattr(point_dist, "__contains__") and 0.0 in point_dist: - dist[point_dist_i] = point_dist == 0.0 - else: - dist[point_dist_i] = 1.0 / point_dist - else: - with np.errstate(divide="ignore"): - dist = 1.0 / dist - inf_mask = np.isinf(dist) - inf_row = np.any(inf_mask, axis=1) - dist[inf_row] = inf_mask[inf_row] - return dist - elif callable(weights): - return weights(dist) - else: - raise ValueError( - "weights not recognized: should be 'uniform', " - "'distance', or a callable function" - ) - def _get_onedal_params(self, X, y=None, n_neighbors=None): class_count = 0 if self.classes_ is None else len(self.classes_) weights = getattr(self, "weights", "uniform") @@ -145,8 +60,18 @@ def _get_onedal_params(self, X, y=None, n_neighbors=None): p = 2.0 else: p = self.p + + # Handle different input types for dtype + try: + fptype = X.dtype + except AttributeError: + # For pandas DataFrames or other types without dtype attribute + import numpy as np + + fptype = np.float64 + return { - "fptype": X.dtype, + "fptype": fptype, "vote_weights": "uniform" if weights == "uniform" else "distance", "method": self._fit_method, "radius": self.radius, @@ -176,21 +101,6 @@ def __init__( self.p = p self.metric_params = metric_params - def _validate_targets(self, y, dtype): - arr = _column_or_1d(y, warn=True) - - try: - return arr.astype(dtype, copy=False) - except ValueError: - return arr - - def _validate_n_classes(self): - length = 0 if self.classes_ is None else len(self.classes_) - if length < 2: - raise ValueError( - f"The number of classes has to be greater than one; got {length}" - ) - def _fit(self, X, y): self._onedal_model = None self._tree = None @@ -202,13 +112,8 @@ def _fit(self, X, y): ) _, xp, _ = _get_sycl_namespace(X) - use_raw_input = _get_config().get("use_raw_input", False) is True if y is not None or self.requires_y: shape = getattr(y, "shape", None) - if not use_raw_input: - X, y = super()._validate_data( - X, y, dtype=[np.float64, np.float32], accept_sparse="csr" - ) self._shape = shape if shape is not None else y.shape if _is_classifier(self): @@ -218,7 +123,6 @@ def _fit(self, X, y): else: self.outputs_2d_ = True - _check_classification_targets(y) self.classes_ = [] self._y = np.empty(y.shape, dtype=int) for k in range(self._y.shape[1]): @@ -228,36 +132,19 @@ def _fit(self, X, y): if not self.outputs_2d_: self.classes_ = self.classes_[0] self._y = self._y.ravel() - - self._validate_n_classes() 
else: self._y = y - elif not use_raw_input: - X, _ = super()._validate_data(X, dtype=[np.float64, np.float32]) self.n_samples_fit_ = X.shape[0] self.n_features_in_ = X.shape[1] self._fit_X = X - if self.n_neighbors is not None: - if self.n_neighbors <= 0: - raise ValueError("Expected n_neighbors > 0. Got %d" % self.n_neighbors) - if not isinstance(self.n_neighbors, Integral): - raise TypeError( - "n_neighbors does not take %s value, " - "enter integer value" % type(self.n_neighbors) - ) - - self._fit_method = super()._parse_auto_method( - self.algorithm, self.n_samples_fit_, self.n_features_in_ - ) - _fit_y = None queue = QM.get_global_queue() gpu_device = queue is not None and queue.sycl_device.is_gpu if _is_classifier(self) or (_is_regressor(self) and gpu_device): - _fit_y = self._validate_targets(self._y, X.dtype).reshape((-1, 1)) + _fit_y = y.astype(X.dtype).reshape((-1, 1)) if y is not None else None result = self._onedal_fit(X, _fit_y) if y is not None and _is_regressor(self): @@ -269,38 +156,22 @@ def _fit(self, X, y): return result def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): - n_features = getattr(self, "n_features_in_", None) - shape = getattr(X, "shape", None) - if n_features and shape and len(shape) > 1 and shape[1] != n_features: - raise ValueError( - ( - f"X has {X.shape[1]} features, " - f"but kneighbors is expecting " - f"{n_features} features as input" - ) - ) - _check_is_fitted(self) if n_neighbors is None: n_neighbors = self.n_neighbors - elif n_neighbors <= 0: - raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors) - else: - if not isinstance(n_neighbors, Integral): - raise TypeError( - "n_neighbors does not take %s value, " - "enter integer value" % type(n_neighbors) - ) if X is not None: query_is_train = False +<<<<<<< HEAD <<<<<<< HEAD if not use_raw_input: X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) ======= X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) >>>>>>> e003b37f (fix: try it again) +======= +>>>>>>> 8cd6f2b2 (fix: first round of refactor move preprocssing function to sklearnex) else: query_is_train = True X = self._fit_X @@ -309,24 +180,12 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors += 1 n_samples_fit = self.n_samples_fit_ - if n_neighbors > n_samples_fit: - if query_is_train: - n_neighbors -= 1 # ok to modify inplace because an error is raised - inequality_str = "n_neighbors < n_samples_fit" - else: - inequality_str = "n_neighbors <= n_samples_fit" - raise ValueError( - f"Expected {inequality_str}, but " - f"n_neighbors = {n_neighbors}, n_samples_fit = {n_samples_fit}, " - f"n_samples = {X.shape[0]}" # include n_samples for common tests - ) chunked_results = None - method = self._parse_auto_method( - self._fit_method, self.n_samples_fit_, n_features - ) + # Use the fit method determined at sklearnex level + method = getattr(self, "_fit_method", "brute") - params = super()._get_onedal_params(X, n_neighbors=n_neighbors) + params = self._get_onedal_params(X, n_neighbors=n_neighbors) prediction_results = self._onedal_predict(self._onedal_model, X, params) distances = from_table(prediction_results.distances) indices = from_table(prediction_results.indices) @@ -434,30 +293,9 @@ def fit(self, X, y, queue=None): @supports_queue def predict(self, X, queue=None): - use_raw_input = _get_config().get("use_raw_input", False) is True - if not use_raw_input: - X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) 
onedal_model = getattr(self, "_onedal_model", None) - n_features = getattr(self, "n_features_in_", None) - n_samples_fit_ = getattr(self, "n_samples_fit_", None) - shape = getattr(X, "shape", None) - if n_features and shape and len(shape) > 1 and shape[1] != n_features: - raise ValueError( - ( - f"X has {X.shape[1]} features, " - f"but KNNClassifier is expecting " - f"{n_features} features as input" - ) - ) - _check_is_fitted(self) - self._fit_method = self._parse_auto_method( - self.algorithm, n_samples_fit_, n_features - ) - - self._validate_n_classes() - params = self._get_onedal_params(X) prediction_result = self._onedal_predict(onedal_model, X, params) responses = from_table(prediction_result.responses) @@ -477,9 +315,8 @@ def predict_proba(self, X, queue=None): n_queries = _num_samples(X) - weights = self._get_weights(neigh_dist, self.weights) - if weights is None: - weights = np.ones_like(neigh_ind) + # Use uniform weights for now - weights calculation should be done at sklearnex level + weights = np.ones_like(neigh_ind) all_rows = np.arange(n_queries) probabilities = [] @@ -580,28 +417,9 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None) return self._kneighbors(X, n_neighbors, return_distance) def _predict_gpu(self, X): - use_raw_input = _get_config().get("use_raw_input", False) is True - if not use_raw_input: - X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) onedal_model = getattr(self, "_onedal_model", None) - n_features = getattr(self, "n_features_in_", None) - n_samples_fit_ = getattr(self, "n_samples_fit_", None) - shape = getattr(X, "shape", None) - if n_features and shape and len(shape) > 1 and shape[1] != n_features: - raise ValueError( - ( - f"X has {X.shape[1]} features, " - f"but KNNClassifier is expecting " - f"{n_features} features as input" - ) - ) - _check_is_fitted(self) - self._fit_method = self._parse_auto_method( - self.algorithm, n_samples_fit_, n_features - ) - params = self._get_onedal_params(X) prediction_result = self._onedal_predict(onedal_model, X, params) @@ -613,7 +431,8 @@ def _predict_gpu(self, X): def _predict_skl(self, X): neigh_dist, neigh_ind = self.kneighbors(X) - weights = self._get_weights(neigh_dist, self.weights) + # Use uniform weights for now - weights calculation should be done at sklearnex level + weights = None _y = self._y if _y.ndim == 1: diff --git a/sklearnex/neighbors/_lof.py b/sklearnex/neighbors/_lof.py index 7c115ce9c5..7f5f2fe840 100644 --- a/sklearnex/neighbors/_lof.py +++ b/sklearnex/neighbors/_lof.py @@ -152,6 +152,18 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): check_is_fitted(self) if X is not None: check_feature_names(self, X, reset=False) + # Perform preprocessing at sklearnex level + import numpy as np + + from onedal.utils.validation import _check_array + + X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + self._validate_feature_count(X, "kneighbors") + + # Validate n_neighbors + if n_neighbors is not None: + self._validate_n_neighbors(n_neighbors) + return dispatch( self, "kneighbors", @@ -186,4 +198,4 @@ def score_samples(self, X): return -np.mean(lrd_ratios_array, axis=1) fit.__doc__ = _sklearn_LocalOutlierFactor.fit.__doc__ - kneighbors.__doc__ = _sklearn_LocalOutlierFactor.kneighbors.__doc__ \ No newline at end of file + kneighbors.__doc__ = _sklearn_LocalOutlierFactor.kneighbors.__doc__ diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 11407ce8b4..417b607253 100644 --- 
a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -15,6 +15,7 @@ # ============================================================================== import warnings +from numbers import Integral import numpy as np from scipy import sparse as sp @@ -26,7 +27,14 @@ from daal4py.sklearn._utils import sklearn_check_version from onedal._device_offload import _transfer_to_host -from onedal.utils.validation import _check_array, _num_features, _num_samples +from onedal.utils.validation import ( + _check_array, + _check_classification_targets, + _check_X_y, + _column_or_1d, + _num_features, + _num_samples, +) from .._utils import PatchingConditionsChain from ..base import oneDALEstimator @@ -35,6 +43,129 @@ class KNeighborsDispatchingBase(oneDALEstimator): + def _parse_auto_method(self, method, n_samples, n_features): + result_method = method + + if method in ["auto", "ball_tree"]: + condition = ( + self.n_neighbors is not None and self.n_neighbors >= n_samples // 2 + ) + if self.metric == "precomputed" or n_features > 15 or condition: + result_method = "brute" + else: + if self.metric == "euclidean": + result_method = "kd_tree" + else: + result_method = "brute" + + return result_method + + def _validate_data( + self, X, y=None, reset=True, validate_separately=None, **check_params + ): + if y is None: + if getattr(self, "requires_y", False): + raise ValueError( + f"This {self.__class__.__name__} estimator " + f"requires y to be passed, but the target y is None." + ) + X = _check_array(X, **check_params) + out = X, y + else: + if validate_separately: + # We need this because some estimators validate X and y + # separately, and in general, separately calling _check_array() + # on X and y isn't equivalent to just calling _check_X_y() + # :( + check_X_params, check_y_params = validate_separately + X = _check_array(X, **check_X_params) + y = _check_array(y, **check_y_params) + else: + X, y = _check_X_y(X, y, **check_params) + out = X, y + + if check_params.get("ensure_2d", True): + from onedal.utils.validation import _check_n_features + + _check_n_features(self, X, reset=reset) + + return out + + def _get_weights(self, dist, weights): + if weights in (None, "uniform"): + return None + if weights == "distance": + # if user attempts to classify a point that was zero distance from one + # or more training points, those training points are weighted as 1.0 + # and the other points as 0.0 + if dist.dtype is np.dtype(object): + for point_dist_i, point_dist in enumerate(dist): + # check if point_dist is iterable + # (ex: RadiusNeighborClassifier.predict may set an element of + # dist to 1e-6 to represent an 'outlier') + if hasattr(point_dist, "__contains__") and 0.0 in point_dist: + dist[point_dist_i] = point_dist == 0.0 + else: + dist[point_dist_i] = 1.0 / point_dist + else: + with np.errstate(divide="ignore"): + dist = 1.0 / dist + inf_mask = np.isinf(dist) + inf_row = np.any(inf_mask, axis=1) + dist[inf_row] = inf_mask[inf_row] + return dist + elif callable(weights): + return weights(dist) + else: + raise ValueError( + "weights not recognized: should be 'uniform', " + "'distance', or a callable function" + ) + + def _validate_targets(self, y, dtype): + arr = _column_or_1d(y, warn=True) + + try: + return arr.astype(dtype, copy=False) + except ValueError: + return arr + + def _validate_n_neighbors(self, n_neighbors): + if n_neighbors is not None: + if n_neighbors <= 0: + raise ValueError("Expected n_neighbors > 0. 
Got %d" % n_neighbors) + if not isinstance(n_neighbors, Integral): + raise TypeError( + "n_neighbors does not take %s value, " + "enter integer value" % type(n_neighbors) + ) + + def _validate_feature_count(self, X, method_name=""): + n_features = getattr(self, "n_features_in_", None) + shape = getattr(X, "shape", None) + if n_features and shape and len(shape) > 1 and shape[1] != n_features: + raise ValueError( + ( + f"X has {X.shape[1]} features, " + f"but {method_name} is expecting " + f"{n_features} features as input" + ) + ) + + def _validate_kneighbors_bounds(self, n_neighbors, query_is_train, X): + n_samples_fit = self.n_samples_fit_ + if n_neighbors > n_samples_fit: + if query_is_train: + n_neighbors -= 1 # ok to modify inplace because an error is raised + inequality_str = "n_neighbors < n_samples_fit" + else: + inequality_str = "n_neighbors <= n_samples_fit" + raise ValueError( + f"Expected {inequality_str}, but " + f"n_neighbors = {n_neighbors}, n_samples_fit = {n_samples_fit}, " + f"n_samples = {X.shape[0]}" # include n_samples for common tests + ) + def _fit_validation(self, X, y=None): if sklearn_check_version("1.2"): self._validate_params() @@ -310,4 +441,4 @@ def kneighbors_graph(self, X=None, n_neighbors=None, mode="connectivity"): return kneighbors_graph - kneighbors_graph.__doc__ = KNeighborsMixin.kneighbors_graph.__doc__ \ No newline at end of file + kneighbors_graph.__doc__ = KNeighborsMixin.kneighbors_graph.__doc__ diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index e3f516d932..17cc642ad3 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -14,6 +14,7 @@ # limitations under the License. # =============================================================================== +import numpy as np from sklearn.metrics import accuracy_score from sklearn.neighbors._classification import ( KNeighborsClassifier as _sklearn_KNeighborsClassifier, @@ -80,6 +81,13 @@ def fit(self, X, y): def predict(self, X): check_is_fitted(self) check_feature_names(self, X, reset=False) + + # Perform preprocessing at sklearnex level + from onedal.utils.validation import _check_array + + X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + self._validate_feature_count(X, "KNNClassifier") + return dispatch( self, "predict", @@ -94,6 +102,13 @@ def predict(self, X): def predict_proba(self, X): check_is_fitted(self) check_feature_names(self, X, reset=False) + + # Perform preprocessing at sklearnex level + from onedal.utils.validation import _check_array + + X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + self._validate_feature_count(X, "predict_proba") + return dispatch( self, "predict_proba", @@ -108,6 +123,13 @@ def predict_proba(self, X): def score(self, X, y, sample_weight=None): check_is_fitted(self) check_feature_names(self, X, reset=False) + + # Perform preprocessing at sklearnex level + from onedal.utils.validation import _check_array + + X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + self._validate_feature_count(X, "score") + return dispatch( self, "score", @@ -125,6 +147,16 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): check_is_fitted(self) if X is not None: check_feature_names(self, X, reset=False) + # Perform preprocessing at sklearnex level + from onedal.utils.validation import _check_array + + X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + 
self._validate_feature_count(X, "kneighbors") + + # Validate n_neighbors + if n_neighbors is not None: + self._validate_n_neighbors(n_neighbors) + return dispatch( self, "kneighbors", @@ -138,13 +170,10 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): ) def _onedal_fit(self, X, y, queue=None): -<<<<<<< HEAD -======= # import sys # print("=" * 50, file=sys.stderr, flush=True) # print("DEBUG: _onedal_fit called!", file=sys.stderr, flush=True) # print("=" * 50, file=sys.stderr, flush=True) ->>>>>>> e003b37f (fix: try it again) onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, @@ -157,6 +186,7 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ + self._onedal_estimator._fit_method = self._fit_method self._onedal_estimator.fit(X, y, queue=queue) self._save_attributes() @@ -193,4 +223,4 @@ def _save_attributes(self): predict.__doc__ = _sklearn_KNeighborsClassifier.predict.__doc__ predict_proba.__doc__ = _sklearn_KNeighborsClassifier.predict_proba.__doc__ score.__doc__ = _sklearn_KNeighborsClassifier.score.__doc__ - kneighbors.__doc__ = _sklearn_KNeighborsClassifier.kneighbors.__doc__ \ No newline at end of file + kneighbors.__doc__ = _sklearn_KNeighborsClassifier.kneighbors.__doc__ diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 502dba72c6..bc3cb54ee1 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -14,6 +14,7 @@ # limitations under the License. # ============================================================================== +import numpy as np from sklearn.metrics import r2_score from sklearn.neighbors._regression import ( KNeighborsRegressor as _sklearn_KNeighborsRegressor, @@ -78,6 +79,13 @@ def fit(self, X, y): def predict(self, X): check_is_fitted(self) check_feature_names(self, X, reset=False) + + # Perform preprocessing at sklearnex level + from onedal.utils.validation import _check_array + + X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + self._validate_feature_count(X, "KNNRegressor") + return dispatch( self, "predict", @@ -92,6 +100,13 @@ def predict(self, X): def score(self, X, y, sample_weight=None): check_is_fitted(self) check_feature_names(self, X, reset=False) + + # Perform preprocessing at sklearnex level + from onedal.utils.validation import _check_array + + X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + self._validate_feature_count(X, "score") + return dispatch( self, "score", @@ -109,6 +124,16 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): check_is_fitted(self) if X is not None: check_feature_names(self, X, reset=False) + # Perform preprocessing at sklearnex level + from onedal.utils.validation import _check_array + + X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + self._validate_feature_count(X, "kneighbors") + + # Validate n_neighbors + if n_neighbors is not None: + self._validate_n_neighbors(n_neighbors) + return dispatch( self, "kneighbors", @@ -122,6 +147,17 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): ) def _onedal_fit(self, X, y, queue=None): + # Perform preprocessing at sklearnex level + X, y = self._validate_data( + X, y, dtype=[np.float64, np.float32], accept_sparse="csr" + ) + + # Validate 
n_neighbors + self._validate_n_neighbors(self.n_neighbors) + + # Parse auto method + self._fit_method = self._parse_auto_method(self.algorithm, X.shape[0], X.shape[1]) + onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, @@ -134,6 +170,7 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ + self._onedal_estimator._fit_method = self._fit_method self._onedal_estimator.fit(X, y, queue=queue) self._save_attributes() @@ -164,4 +201,4 @@ def _save_attributes(self): fit.__doc__ = _sklearn_KNeighborsRegressor.__doc__ predict.__doc__ = _sklearn_KNeighborsRegressor.predict.__doc__ kneighbors.__doc__ = _sklearn_KNeighborsRegressor.kneighbors.__doc__ - score.__doc__ = _sklearn_KNeighborsRegressor.score.__doc__ \ No newline at end of file + score.__doc__ = _sklearn_KNeighborsRegressor.score.__doc__ diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 19706f812d..ad2e5e661f 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -14,6 +14,7 @@ # limitations under the License. # =============================================================================== +import numpy as np from sklearn.neighbors._unsupervised import NearestNeighbors as _sklearn_NearestNeighbors from sklearn.utils.validation import _deprecate_positional_args, check_is_fitted @@ -76,6 +77,16 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): check_is_fitted(self) if X is not None: check_feature_names(self, X, reset=False) + # Perform preprocessing at sklearnex level + from onedal.utils.validation import _check_array + + X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + self._validate_feature_count(X, "kneighbors") + + # Validate n_neighbors + if n_neighbors is not None: + self._validate_n_neighbors(n_neighbors) + return dispatch( self, "kneighbors", @@ -129,6 +140,15 @@ def radius_neighbors_graph( ) def _onedal_fit(self, X, y=None, queue=None): + # Perform preprocessing at sklearnex level + X, _ = self._validate_data(X, dtype=[np.float64, np.float32], accept_sparse=True) + + # Validate n_neighbors + self._validate_n_neighbors(self.n_neighbors) + + # Parse auto method + self._fit_method = self._parse_auto_method(self.algorithm, X.shape[0], X.shape[1]) + onedal_params = { "n_neighbors": self.n_neighbors, "algorithm": self.algorithm, @@ -140,6 +160,7 @@ def _onedal_fit(self, X, y=None, queue=None): self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ + self._onedal_estimator._fit_method = self._fit_method self._onedal_estimator.fit(X, y, queue=queue) self._save_attributes() @@ -167,4 +188,4 @@ def _save_attributes(self): radius_neighbors.__doc__ = _sklearn_NearestNeighbors.radius_neighbors.__doc__ radius_neighbors_graph.__doc__ = ( _sklearn_NearestNeighbors.radius_neighbors_graph.__doc__ - ) \ No newline at end of file + ) From 8bd86c2b162214b3a3b072fa6933ef88ad0ba3f6 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 6 Oct 2025 17:58:05 -0700 Subject: [PATCH 06/87] fix: fix shape --- onedal/neighbors/neighbors.py | 7 +++++ sklearnex/neighbors/common.py | 35 +++++++++++++++++++++++ sklearnex/neighbors/knn_classification.py | 
7 +++++ sklearnex/neighbors/knn_regression.py | 8 ++++++ sklearnex/neighbors/knn_unsupervised.py | 7 +++++ 5 files changed, 64 insertions(+) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index 39ffe4dd9b..f02dea2dc1 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -70,6 +70,13 @@ def _get_onedal_params(self, X, y=None, n_neighbors=None): fptype = np.float64 + # _fit_method should be set by sklearnex level before calling oneDAL + if not hasattr(self, "_fit_method") or self._fit_method is None: + raise ValueError( + "_fit_method must be set by sklearnex level before calling oneDAL. " + "This indicates improper usage - oneDAL neighbors should not be called directly." + ) + return { "fptype": fptype, "vote_weights": "uniform" if weights == "uniform" else "distance", diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 417b607253..e28af4e2e9 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -166,6 +166,41 @@ def _validate_kneighbors_bounds(self, n_neighbors, query_is_train, X): f"n_samples = {X.shape[0]}" # include n_samples for common tests ) + def _process_classification_targets(self, y): + """Process classification targets and set class-related attributes.""" + import numpy as np + + # Handle shape processing + shape = getattr(y, "shape", None) + self._shape = shape if shape is not None else y.shape + + if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1: + self.outputs_2d_ = False + y = y.reshape((-1, 1)) + else: + self.outputs_2d_ = True + + # Process classes + self.classes_ = [] + self._y = np.empty(y.shape, dtype=int) + for k in range(self._y.shape[1]): + classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True) + self.classes_.append(classes) + + if not self.outputs_2d_: + self.classes_ = self.classes_[0] + self._y = self._y.ravel() + + return y + + def _process_regression_targets(self, y): + """Process regression targets and set shape-related attributes.""" + # Handle shape processing for regression + shape = getattr(y, "shape", None) + self._shape = shape if shape is not None else y.shape + self._y = y + return y + def _fit_validation(self, X, y=None): if sklearn_check_version("1.2"): self._validate_params() diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index 17cc642ad3..a9a0fb9d67 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -187,6 +187,13 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ self._onedal_estimator._fit_method = self._fit_method + + # Set shape and class attributes on the onedal estimator + self._onedal_estimator._shape = self._shape + self._onedal_estimator.classes_ = self.classes_ + self._onedal_estimator._y = self._y + self._onedal_estimator.outputs_2d_ = self.outputs_2d_ + self._onedal_estimator.fit(X, y, queue=queue) self._save_attributes() diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index bc3cb54ee1..d6ee39a88f 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -158,6 +158,9 @@ def _onedal_fit(self, X, y, queue=None): # Parse auto method self._fit_method = self._parse_auto_method(self.algorithm, X.shape[0], X.shape[1]) + # Handle shape processing at sklearnex level + y = 
self._process_regression_targets(y) + onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, @@ -171,6 +174,11 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ self._onedal_estimator._fit_method = self._fit_method + + # Set shape attributes on the onedal estimator + self._onedal_estimator._shape = self._shape + self._onedal_estimator._y = self._y + self._onedal_estimator.fit(X, y, queue=queue) self._save_attributes() diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index ad2e5e661f..ddb688d629 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -149,6 +149,9 @@ def _onedal_fit(self, X, y=None, queue=None): # Parse auto method self._fit_method = self._parse_auto_method(self.algorithm, X.shape[0], X.shape[1]) + # Set basic attributes for unsupervised + self.classes_ = None + onedal_params = { "n_neighbors": self.n_neighbors, "algorithm": self.algorithm, @@ -161,6 +164,10 @@ def _onedal_fit(self, X, y=None, queue=None): self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ self._onedal_estimator._fit_method = self._fit_method + + # Set attributes on the onedal estimator + self._onedal_estimator.classes_ = self.classes_ + self._onedal_estimator.fit(X, y, queue=queue) self._save_attributes() From debfcdf845a876ba146ef08cdb9b2a8c22f714de Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 7 Oct 2025 11:03:06 -0700 Subject: [PATCH 07/87] rebase: rebase to main --- onedal/neighbors/neighbors.py | 11 +---------- sklearnex/neighbors/common.py | 2 +- sklearnex/neighbors/knn_classification.py | 22 +++++++++++++++++++++- sklearnex/neighbors/knn_regression.py | 2 +- sklearnex/neighbors/knn_unsupervised.py | 2 +- 5 files changed, 25 insertions(+), 14 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index f02dea2dc1..6ca6c65c29 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -170,15 +170,6 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): if X is not None: query_is_train = False -<<<<<<< HEAD -<<<<<<< HEAD - if not use_raw_input: - X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) -======= - X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) ->>>>>>> e003b37f (fix: try it again) -======= ->>>>>>> 8cd6f2b2 (fix: first round of refactor move preprocssing function to sklearnex) else: query_is_train = True X = self._fit_X @@ -517,4 +508,4 @@ def fit(self, X, y, queue=None): @supports_queue def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): - return self._kneighbors(X, n_neighbors, return_distance) + return self._kneighbors(X, n_neighbors, return_distance) \ No newline at end of file diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index e28af4e2e9..843952ffb0 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -476,4 +476,4 @@ def kneighbors_graph(self, X=None, n_neighbors=None, mode="connectivity"): return kneighbors_graph - kneighbors_graph.__doc__ = KNeighborsMixin.kneighbors_graph.__doc__ + kneighbors_graph.__doc__ = KNeighborsMixin.kneighbors_graph.__doc__ \ No newline at end of file diff --git 
a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index a9a0fb9d67..5a3115a61a 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -174,6 +174,26 @@ def _onedal_fit(self, X, y, queue=None): # print("=" * 50, file=sys.stderr, flush=True) # print("DEBUG: _onedal_fit called!", file=sys.stderr, flush=True) # print("=" * 50, file=sys.stderr, flush=True) + + # Perform preprocessing at sklearnex level + X, y = self._validate_data( + X, y, dtype=[np.float64, np.float32], accept_sparse="csr" + ) + + # Validate n_neighbors + self._validate_n_neighbors(self.n_neighbors) + + # Parse auto method + self._fit_method = self._parse_auto_method(self.algorithm, X.shape[0], X.shape[1]) + + # Validate classification targets + from onedal.utils.validation import _check_classification_targets + + _check_classification_targets(y) + + # Handle shape and class processing at sklearnex level + y = self._process_classification_targets(y) + onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, @@ -230,4 +250,4 @@ def _save_attributes(self): predict.__doc__ = _sklearn_KNeighborsClassifier.predict.__doc__ predict_proba.__doc__ = _sklearn_KNeighborsClassifier.predict_proba.__doc__ score.__doc__ = _sklearn_KNeighborsClassifier.score.__doc__ - kneighbors.__doc__ = _sklearn_KNeighborsClassifier.kneighbors.__doc__ + kneighbors.__doc__ = _sklearn_KNeighborsClassifier.kneighbors.__doc__ \ No newline at end of file diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index d6ee39a88f..ff073a10b0 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -209,4 +209,4 @@ def _save_attributes(self): fit.__doc__ = _sklearn_KNeighborsRegressor.__doc__ predict.__doc__ = _sklearn_KNeighborsRegressor.predict.__doc__ kneighbors.__doc__ = _sklearn_KNeighborsRegressor.kneighbors.__doc__ - score.__doc__ = _sklearn_KNeighborsRegressor.score.__doc__ + score.__doc__ = _sklearn_KNeighborsRegressor.score.__doc__ \ No newline at end of file diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index ddb688d629..2060916699 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -195,4 +195,4 @@ def _save_attributes(self): radius_neighbors.__doc__ = _sklearn_NearestNeighbors.radius_neighbors.__doc__ radius_neighbors_graph.__doc__ = ( _sklearn_NearestNeighbors.radius_neighbors_graph.__doc__ - ) + ) \ No newline at end of file From e9e73067e4d82f48421752f64277c7e5ae89ef40 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 7 Oct 2025 12:01:40 -0700 Subject: [PATCH 08/87] fix: add fit emthod logic in onedla --- onedal/neighbors/neighbors.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index 6ca6c65c29..3be0c58ca3 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -70,17 +70,33 @@ def _get_onedal_params(self, X, y=None, n_neighbors=None): fptype = np.float64 - # _fit_method should be set by sklearnex level before calling oneDAL + # Handle _fit_method: use if set by sklearnex, otherwise determine it ourselves if not hasattr(self, "_fit_method") or self._fit_method is None: - raise ValueError( - "_fit_method must be set by sklearnex level before calling oneDAL. 
" - "This indicates improper usage - oneDAL neighbors should not be called directly." - ) + # Direct oneDAL usage - determine method ourselves + method = getattr(self, "algorithm", "auto") + n_samples, n_features = X.shape + + if method in ["auto", "ball_tree"]: + condition = ( + self.n_neighbors is not None and self.n_neighbors >= n_samples // 2 + ) + if getattr(self, "metric", "minkowski") == "precomputed" or n_features > 15 or condition: + fit_method = "brute" + else: + if getattr(self, "effective_metric_", getattr(self, "metric", "minkowski")) == "euclidean": + fit_method = "kd_tree" + else: + fit_method = "brute" + else: + fit_method = method + else: + # Use the method set by sklearnex level + fit_method = self._fit_method return { "fptype": fptype, "vote_weights": "uniform" if weights == "uniform" else "distance", - "method": self._fit_method, + "method": fit_method, "radius": self.radius, "class_count": class_count, "neighbor_count": self.n_neighbors if n_neighbors is None else n_neighbors, From 02da9e9b6e4730e14c2e59cf4fb28973e3c9bf65 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 7 Oct 2025 14:03:58 -0700 Subject: [PATCH 09/87] fix: fix test --- onedal/neighbors/neighbors.py | 28 ++++--------------- .../tests/test_knn_classification.py | 12 ++++---- 2 files changed, 12 insertions(+), 28 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index 3be0c58ca3..6ca6c65c29 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -70,33 +70,17 @@ def _get_onedal_params(self, X, y=None, n_neighbors=None): fptype = np.float64 - # Handle _fit_method: use if set by sklearnex, otherwise determine it ourselves + # _fit_method should be set by sklearnex level before calling oneDAL if not hasattr(self, "_fit_method") or self._fit_method is None: - # Direct oneDAL usage - determine method ourselves - method = getattr(self, "algorithm", "auto") - n_samples, n_features = X.shape - - if method in ["auto", "ball_tree"]: - condition = ( - self.n_neighbors is not None and self.n_neighbors >= n_samples // 2 - ) - if getattr(self, "metric", "minkowski") == "precomputed" or n_features > 15 or condition: - fit_method = "brute" - else: - if getattr(self, "effective_metric_", getattr(self, "metric", "minkowski")) == "euclidean": - fit_method = "kd_tree" - else: - fit_method = "brute" - else: - fit_method = method - else: - # Use the method set by sklearnex level - fit_method = self._fit_method + raise ValueError( + "_fit_method must be set by sklearnex level before calling oneDAL. " + "This indicates improper usage - oneDAL neighbors should not be called directly." 
+ ) return { "fptype": fptype, "vote_weights": "uniform" if weights == "uniform" else "distance", - "method": fit_method, + "method": self._fit_method, "radius": self.radius, "class_count": class_count, "neighbor_count": self.n_neighbors if n_neighbors is None else n_neighbors, diff --git a/onedal/neighbors/tests/test_knn_classification.py b/onedal/neighbors/tests/test_knn_classification.py index d29bdab345..c0410d8cb1 100755 --- a/onedal/neighbors/tests/test_knn_classification.py +++ b/onedal/neighbors/tests/test_knn_classification.py @@ -19,15 +19,15 @@ from numpy.testing import assert_array_equal from sklearn import datasets -from onedal.neighbors import KNeighborsClassifier +from sklearnex.neighbors import KNeighborsClassifier from onedal.tests.utils._device_selection import get_queues @pytest.mark.parametrize("queue", get_queues()) def test_iris(queue): iris = datasets.load_iris() - clf = KNeighborsClassifier(2).fit(iris.data, iris.target, queue=queue) - assert clf.score(iris.data, iris.target, queue=queue) > 0.9 + clf = KNeighborsClassifier(2).fit(iris.data, iris.target) + assert clf.score(iris.data, iris.target) > 0.9 assert_array_equal(clf.classes_, np.sort(clf.classes_)) @@ -36,8 +36,8 @@ def test_pickle(queue): if queue and queue.sycl_device.is_gpu: pytest.skip("KNN classifier pickling for the GPU sycl_queue is buggy.") iris = datasets.load_iris() - clf = KNeighborsClassifier(2).fit(iris.data, iris.target, queue=queue) - expected = clf.predict(iris.data, queue=queue) + clf = KNeighborsClassifier(2).fit(iris.data, iris.target) + expected = clf.predict(iris.data) import pickle @@ -45,5 +45,5 @@ def test_pickle(queue): clf2 = pickle.loads(dump) assert type(clf2) == clf.__class__ - result = clf2.predict(iris.data, queue=queue) + result = clf2.predict(iris.data) assert_array_equal(expected, result) From 62c8ddd3504d96e1d60d591047d60e9999579175 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Wed, 8 Oct 2025 16:57:13 -0700 Subject: [PATCH 10/87] fix: fix tupleerror --- sklearnex/neighbors/knn_classification.py | 3 ++- sklearnex/neighbors/knn_regression.py | 3 ++- sklearnex/neighbors/knn_unsupervised.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index 5a3115a61a..0394d12c37 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -240,7 +240,8 @@ def _save_attributes(self): self.classes_ = self._onedal_estimator.classes_ self.n_features_in_ = self._onedal_estimator.n_features_in_ self.n_samples_fit_ = self._onedal_estimator.n_samples_fit_ - self._fit_X = self._onedal_estimator._fit_X + fit_x = self._onedal_estimator._fit_X + self._fit_X = fit_x[0] if isinstance(fit_x, tuple) else fit_x self._y = self._onedal_estimator._y self._fit_method = self._onedal_estimator._fit_method self.outputs_2d_ = self._onedal_estimator.outputs_2d_ diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index ff073a10b0..93884b41b5 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -201,7 +201,8 @@ def _onedal_score(self, X, y, sample_weight=None, queue=None): def _save_attributes(self): self.n_features_in_ = self._onedal_estimator.n_features_in_ self.n_samples_fit_ = self._onedal_estimator.n_samples_fit_ - self._fit_X = self._onedal_estimator._fit_X + fit_x = self._onedal_estimator._fit_X + self._fit_X = fit_x[0] if isinstance(fit_x, tuple) else fit_x self._y = 
self._onedal_estimator._y self._fit_method = self._onedal_estimator._fit_method self._tree = self._onedal_estimator._tree diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 2060916699..eac9dea5ae 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -186,7 +186,8 @@ def _save_attributes(self): self.classes_ = self._onedal_estimator.classes_ self.n_features_in_ = self._onedal_estimator.n_features_in_ self.n_samples_fit_ = self._onedal_estimator.n_samples_fit_ - self._fit_X = self._onedal_estimator._fit_X + fit_x = self._onedal_estimator._fit_X + self._fit_X = fit_x[0] if isinstance(fit_x, tuple) else fit_x self._fit_method = self._onedal_estimator._fit_method self._tree = self._onedal_estimator._tree From fc296b534586327e11bfab913685e37daaf7a4c6 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Thu, 9 Oct 2025 16:13:13 -0700 Subject: [PATCH 11/87] fix: fix tuple issue --- onedal/neighbors/neighbors.py | 2 +- sklearnex/neighbors/knn_classification.py | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index 6ca6c65c29..440a94ff57 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -503,7 +503,7 @@ def _onedal_predict(self, model, X, params): return self.infer(params, model, X) @supports_queue - def fit(self, X, y, queue=None): + def fit(self, X, y=None, queue=None): return self._fit(X, y) @supports_queue diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index 0394d12c37..0912c09464 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -121,6 +121,8 @@ def predict_proba(self, X): @wrap_output_data def score(self, X, y, sample_weight=None): + import sys + print("DEBUG: score called11111!", X, y, file=sys.stderr, flush=True) check_is_fitted(self) check_feature_names(self, X, reset=False) @@ -144,6 +146,8 @@ def score(self, X, y, sample_weight=None): @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): + import sys + print("DEBUG: kneighbors called11111!", X, file=sys.stderr, flush=True) check_is_fitted(self) if X is not None: check_feature_names(self, X, reset=False) @@ -170,10 +174,8 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): ) def _onedal_fit(self, X, y, queue=None): - # import sys - # print("=" * 50, file=sys.stderr, flush=True) - # print("DEBUG: _onedal_fit called!", file=sys.stderr, flush=True) - # print("=" * 50, file=sys.stderr, flush=True) + import sys + print("DEBUG: _onedal_fit called11111!", X, y, file=sys.stderr, flush=True) # Perform preprocessing at sklearnex level X, y = self._validate_data( @@ -232,11 +234,17 @@ def _onedal_kneighbors( ) def _onedal_score(self, X, y, sample_weight=None, queue=None): + import sys + print("DEBUG: _onedal_score called11111!", X, y, file=sys.stderr, flush=True) + return accuracy_score( y, self._onedal_predict(X, queue=queue), sample_weight=sample_weight ) def _save_attributes(self): + import sys + print("DEBUG: _save_attributes called11111!", self._onedal_estimator, file=sys.stderr, flush=True) + self.classes_ = self._onedal_estimator.classes_ self.n_features_in_ = self._onedal_estimator.n_features_in_ self.n_samples_fit_ = self._onedal_estimator.n_samples_fit_ From fe0abbb037f414e9617b8f85752c38ff254b2b17 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Thu, 9 Oct 
2025 23:42:04 -0700 Subject: [PATCH 12/87] print: print fit_x --- sklearnex/neighbors/knn_unsupervised.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index eac9dea5ae..41cffbf139 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -108,6 +108,13 @@ def radius_neighbors( or getattr(self, "_tree", 0) is None and self._fit_method == "kd_tree" ): + # Debug: Check what _fit_X actually is at time of error + import sys + print(f"DEBUG radius_neighbors: self._fit_X type: {type(self._fit_X)}", file=sys.stderr, flush=True) + if isinstance(self._fit_X, tuple): + print(f"DEBUG radius_neighbors: _fit_X is tuple of length {len(self._fit_X)}", file=sys.stderr, flush=True) + print(f"DEBUG radius_neighbors: tuple contents: {[type(x) for x in self._fit_X]}", file=sys.stderr, flush=True) + _sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, "_y", None)) check_is_fitted(self) return dispatch( @@ -187,6 +194,13 @@ def _save_attributes(self): self.n_features_in_ = self._onedal_estimator.n_features_in_ self.n_samples_fit_ = self._onedal_estimator.n_samples_fit_ fit_x = self._onedal_estimator._fit_X + + # Debug: Check if fit_x is unexpectedly a tuple + if isinstance(fit_x, tuple): + import sys + print(f"DEBUG: _onedal_estimator._fit_X is a tuple: {type(fit_x)}, length: {len(fit_x)}", file=sys.stderr, flush=True) + print(f"DEBUG: fit_x[0] type: {type(fit_x[0])}, fit_x[1]: {fit_x[1]}", file=sys.stderr, flush=True) + self._fit_X = fit_x[0] if isinstance(fit_x, tuple) else fit_x self._fit_method = self._onedal_estimator._fit_method self._tree = self._onedal_estimator._tree From e202e6502b4d0d47bc4f76b7ae744a66deac2815 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Fri, 10 Oct 2025 11:52:30 -0700 Subject: [PATCH 13/87] fix: fixed tuple --- onedal/neighbors/neighbors.py | 4 ++-- sklearnex/neighbors/knn_unsupervised.py | 14 -------------- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index 440a94ff57..7b6f7d642d 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -493,8 +493,8 @@ def _onedal_fit(self, X, y): # global queue is set as per user configuration (`target_offload`) or from data prior to calling this internal function queue = QM.get_global_queue() params = self._get_onedal_params(X, y) - X, y = to_table(X, y, queue=queue) - return self.train(params, X).model + X_table, y_table = to_table(X, y, queue=queue) + return self.train(params, X_table).model def _onedal_predict(self, model, X, params): X = to_table(X, queue=QM.get_global_queue()) diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 41cffbf139..eac9dea5ae 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -108,13 +108,6 @@ def radius_neighbors( or getattr(self, "_tree", 0) is None and self._fit_method == "kd_tree" ): - # Debug: Check what _fit_X actually is at time of error - import sys - print(f"DEBUG radius_neighbors: self._fit_X type: {type(self._fit_X)}", file=sys.stderr, flush=True) - if isinstance(self._fit_X, tuple): - print(f"DEBUG radius_neighbors: _fit_X is tuple of length {len(self._fit_X)}", file=sys.stderr, flush=True) - print(f"DEBUG radius_neighbors: tuple contents: {[type(x) for x in self._fit_X]}", file=sys.stderr, flush=True) - _sklearn_NearestNeighbors.fit(self, 
self._fit_X, getattr(self, "_y", None)) check_is_fitted(self) return dispatch( @@ -194,13 +187,6 @@ def _save_attributes(self): self.n_features_in_ = self._onedal_estimator.n_features_in_ self.n_samples_fit_ = self._onedal_estimator.n_samples_fit_ fit_x = self._onedal_estimator._fit_X - - # Debug: Check if fit_x is unexpectedly a tuple - if isinstance(fit_x, tuple): - import sys - print(f"DEBUG: _onedal_estimator._fit_X is a tuple: {type(fit_x)}, length: {len(fit_x)}", file=sys.stderr, flush=True) - print(f"DEBUG: fit_x[0] type: {type(fit_x[0])}, fit_x[1]: {fit_x[1]}", file=sys.stderr, flush=True) - self._fit_X = fit_x[0] if isinstance(fit_x, tuple) else fit_x self._fit_method = self._onedal_estimator._fit_method self._tree = self._onedal_estimator._tree From 649fc5d6a574086c7751ecec9c1b5b6f1aade860 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Fri, 10 Oct 2025 14:52:53 -0700 Subject: [PATCH 14/87] fix: fix tuple --- onedal/neighbors/neighbors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index 7b6f7d642d..440a94ff57 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -493,8 +493,8 @@ def _onedal_fit(self, X, y): # global queue is set as per user configuration (`target_offload`) or from data prior to calling this internal function queue = QM.get_global_queue() params = self._get_onedal_params(X, y) - X_table, y_table = to_table(X, y, queue=queue) - return self.train(params, X_table).model + X, y = to_table(X, y, queue=queue) + return self.train(params, X).model def _onedal_predict(self, model, X, params): X = to_table(X, queue=QM.get_global_queue()) From a1f95f1217a5ea5ca1fec63cc42f88b58f77bb8a Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Fri, 10 Oct 2025 15:28:01 -0700 Subject: [PATCH 15/87] print: print in save attributes --- sklearnex/neighbors/knn_unsupervised.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index eac9dea5ae..0376c34af1 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -183,6 +183,10 @@ def _onedal_kneighbors( ) def _save_attributes(self): + print(f"DEBUG: _save_attributes - _fit_X type: {type(self._onedal_estimator._fit_X)}") + if hasattr(self._onedal_estimator, '_fit_X'): + print(f"DEBUG: _fit_X value preview: {str(self._onedal_estimator._fit_X)[:200]}") + self.classes_ = self._onedal_estimator.classes_ self.n_features_in_ = self._onedal_estimator.n_features_in_ self.n_samples_fit_ = self._onedal_estimator.n_samples_fit_ From 939a4f6575e23783840fc8cef4f8d37a72aeefd2 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Fri, 10 Oct 2025 15:54:43 -0700 Subject: [PATCH 16/87] fix: tuple handling --- sklearnex/neighbors/knn_unsupervised.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 0376c34af1..1c5676ec68 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -108,7 +108,10 @@ def radius_neighbors( or getattr(self, "_tree", 0) is None and self._fit_method == "kd_tree" ): - _sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, "_y", None)) + # Handle potential tuple in _fit_X (same as _save_attributes logic) + fit_x = self._fit_X + fit_x_array = fit_x[0] if isinstance(fit_x, tuple) else fit_x + _sklearn_NearestNeighbors.fit(self, fit_x_array, 
getattr(self, "_y", None)) check_is_fitted(self) return dispatch( self, From a4b1351aa87a5d91ed09ff5369dd6456776175d1 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Fri, 10 Oct 2025 16:16:47 -0700 Subject: [PATCH 17/87] print: add print --- sklearnex/neighbors/knn_unsupervised.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 1c5676ec68..1d8be9280a 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -14,6 +14,7 @@ # limitations under the License. # =============================================================================== +import sys import numpy as np from sklearn.neighbors._unsupervised import NearestNeighbors as _sklearn_NearestNeighbors from sklearn.utils.validation import _deprecate_positional_args, check_is_fitted @@ -103,15 +104,25 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): def radius_neighbors( self, X=None, radius=None, return_distance=True, sort_results=False ): + print(f"DEBUG radius_neighbors start: hasattr _onedal_estimator: {hasattr(self, '_onedal_estimator')}", file=sys.stderr) + print(f"DEBUG radius_neighbors start: _tree: {getattr(self, '_tree', 'NOT_SET')}", file=sys.stderr) + print(f"DEBUG radius_neighbors start: _fit_method: {getattr(self, '_fit_method', 'NOT_SET')}", file=sys.stderr) + if ( hasattr(self, "_onedal_estimator") or getattr(self, "_tree", 0) is None and self._fit_method == "kd_tree" ): + print("DEBUG: Entering the fit_x handling block", file=sys.stderr) # Handle potential tuple in _fit_X (same as _save_attributes logic) fit_x = self._fit_X + print(f"DEBUG radius_neighbors: _fit_X type: {type(fit_x)}", file=sys.stderr) + print(f"DEBUG radius_neighbors: _fit_X shape/content: {fit_x.shape if hasattr(fit_x, 'shape') else fit_x}", file=sys.stderr) fit_x_array = fit_x[0] if isinstance(fit_x, tuple) else fit_x + print(f"DEBUG radius_neighbors: fit_x_array type: {type(fit_x_array)}", file=sys.stderr) _sklearn_NearestNeighbors.fit(self, fit_x_array, getattr(self, "_y", None)) + else: + print("DEBUG: NOT entering the fit_x handling block - using default path", file=sys.stderr) check_is_fitted(self) return dispatch( self, @@ -186,9 +197,9 @@ def _onedal_kneighbors( ) def _save_attributes(self): - print(f"DEBUG: _save_attributes - _fit_X type: {type(self._onedal_estimator._fit_X)}") + print(f"DEBUG: _save_attributes - _fit_X type: {type(self._onedal_estimator._fit_X)}", file=sys.stderr) if hasattr(self._onedal_estimator, '_fit_X'): - print(f"DEBUG: _fit_X value preview: {str(self._onedal_estimator._fit_X)[:200]}") + print(f"DEBUG: _fit_X value preview: {str(self._onedal_estimator._fit_X)[:200]}", file=sys.stderr) self.classes_ = self._onedal_estimator.classes_ self.n_features_in_ = self._onedal_estimator.n_features_in_ From 39ae6c532fad052f2cdc4962c479de211377f4f8 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Fri, 10 Oct 2025 16:49:17 -0700 Subject: [PATCH 18/87] print: test print --- onedal/neighbors/neighbors.py | 8 +++++++ sklearnex/neighbors/knn_unsupervised.py | 28 ++++++++++++++++++++----- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index 440a94ff57..67e4b36033 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -17,6 +17,7 @@ from abc import ABCMeta, abstractmethod import numpy as np +import sys from onedal._device_offload import 
supports_queue from onedal.common._backend import bind_default_backend @@ -109,6 +110,7 @@ def __init__( self.metric_params = metric_params def _fit(self, X, y): + print(f"DEBUG oneDAL _fit start: X type = {type(X)}", file=sys.stderr) self._onedal_model = None self._tree = None self._shape = None @@ -145,6 +147,7 @@ def _fit(self, X, y): self.n_samples_fit_ = X.shape[0] self.n_features_in_ = X.shape[1] self._fit_X = X + print(f"DEBUG oneDAL _fit: setting _fit_X = {type(X)}, shape = {X.shape}", file=sys.stderr) _fit_y = None queue = QM.get_global_queue() @@ -153,6 +156,7 @@ def _fit(self, X, y): if _is_classifier(self) or (_is_regressor(self) and gpu_device): _fit_y = y.astype(X.dtype).reshape((-1, 1)) if y is not None else None result = self._onedal_fit(X, _fit_y) + print(f"DEBUG oneDAL _fit: after _onedal_fit, _fit_X type = {type(self._fit_X)}", file=sys.stderr) if y is not None and _is_regressor(self): self._y = y if self._shape is None else xp.reshape(y, self._shape) @@ -490,10 +494,14 @@ def train(self, *args, **kwargs): ... def infer(self, *arg, **kwargs): ... def _onedal_fit(self, X, y): + print(f"DEBUG NearestNeighbors _onedal_fit: X type = {type(X)}, y type = {type(y)}", file=sys.stderr) # global queue is set as per user configuration (`target_offload`) or from data prior to calling this internal function queue = QM.get_global_queue() params = self._get_onedal_params(X, y) + print(f"DEBUG NearestNeighbors _onedal_fit: before to_table - X type = {type(X)}, y type = {type(y)}", file=sys.stderr) X, y = to_table(X, y, queue=queue) + print(f"DEBUG NearestNeighbors _onedal_fit: after to_table - X type = {type(X)}, y type = {type(y)}", file=sys.stderr) + print(f"DEBUG NearestNeighbors _onedal_fit: self._fit_X type = {type(self._fit_X)}", file=sys.stderr) return self.train(params, X).model def _onedal_predict(self, model, X, params): diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 1d8be9280a..406a5b66e1 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -108,11 +108,16 @@ def radius_neighbors( print(f"DEBUG radius_neighbors start: _tree: {getattr(self, '_tree', 'NOT_SET')}", file=sys.stderr) print(f"DEBUG radius_neighbors start: _fit_method: {getattr(self, '_fit_method', 'NOT_SET')}", file=sys.stderr) - if ( - hasattr(self, "_onedal_estimator") - or getattr(self, "_tree", 0) is None - and self._fit_method == "kd_tree" - ): + # Check the condition logic + has_onedal = hasattr(self, "_onedal_estimator") + tree_is_none = getattr(self, "_tree", 0) is None + is_kd_tree = getattr(self, "_fit_method", None) == "kd_tree" + print(f"DEBUG: has_onedal={has_onedal}, tree_is_none={tree_is_none}, is_kd_tree={is_kd_tree}", file=sys.stderr) + + condition_met = has_onedal or (tree_is_none and is_kd_tree) + print(f"DEBUG: condition_met={condition_met}", file=sys.stderr) + + if condition_met: print("DEBUG: Entering the fit_x handling block", file=sys.stderr) # Handle potential tuple in _fit_X (same as _save_attributes logic) fit_x = self._fit_X @@ -123,6 +128,13 @@ def radius_neighbors( _sklearn_NearestNeighbors.fit(self, fit_x_array, getattr(self, "_y", None)) else: print("DEBUG: NOT entering the fit_x handling block - using default path", file=sys.stderr) + # ALWAYS handle potential tuple in _fit_X for robustness + if hasattr(self, '_fit_X'): + fit_x = self._fit_X + print(f"DEBUG fallback path: _fit_X type: {type(fit_x)}", file=sys.stderr) + if isinstance(fit_x, tuple): + print("DEBUG fallback path: 
_fit_X is tuple, extracting first element", file=sys.stderr) + self._fit_X = fit_x[0] check_is_fitted(self) return dispatch( self, @@ -140,6 +152,12 @@ def radius_neighbors( def radius_neighbors_graph( self, X=None, radius=None, mode="connectivity", sort_results=False ): + print(f"DEBUG radius_neighbors_graph start: _fit_X type: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + # Handle potential tuple in _fit_X before calling dispatch + if hasattr(self, '_fit_X') and isinstance(self._fit_X, tuple): + print("DEBUG radius_neighbors_graph: _fit_X is tuple, extracting first element", file=sys.stderr) + self._fit_X = self._fit_X[0] + return dispatch( self, "radius_neighbors_graph", From aa98829112d689069ba402c0c65a21dc1084bb96 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 13 Oct 2025 11:18:38 -0700 Subject: [PATCH 19/87] test: test fix for typle --- onedal/neighbors/neighbors.py | 9 +++++++-- sklearnex/neighbors/knn_unsupervised.py | 25 ++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index 67e4b36033..9512521db0 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -146,8 +146,13 @@ def _fit(self, X, y): self.n_samples_fit_ = X.shape[0] self.n_features_in_ = X.shape[1] - self._fit_X = X - print(f"DEBUG oneDAL _fit: setting _fit_X = {type(X)}, shape = {X.shape}", file=sys.stderr) + # Ensure _fit_X is always an array, never a tuple + if isinstance(X, tuple): + print(f"DEBUG oneDAL _fit: X is tuple, extracting first element: {type(X)}", file=sys.stderr) + self._fit_X = X[0] + else: + self._fit_X = X + print(f"DEBUG oneDAL _fit: setting _fit_X = {type(self._fit_X)}, shape = {self._fit_X.shape}", file=sys.stderr) _fit_y = None queue = QM.get_global_queue() diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 406a5b66e1..58978e65c5 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -125,7 +125,30 @@ def radius_neighbors( print(f"DEBUG radius_neighbors: _fit_X shape/content: {fit_x.shape if hasattr(fit_x, 'shape') else fit_x}", file=sys.stderr) fit_x_array = fit_x[0] if isinstance(fit_x, tuple) else fit_x print(f"DEBUG radius_neighbors: fit_x_array type: {type(fit_x_array)}", file=sys.stderr) - _sklearn_NearestNeighbors.fit(self, fit_x_array, getattr(self, "_y", None)) + + # Additional safety check - ensure fit_x_array is not a tuple + if isinstance(fit_x_array, tuple): + print(f"DEBUG radius_neighbors: fit_x_array is still tuple after extraction: {type(fit_x_array)}", file=sys.stderr) + fit_x_array = fit_x_array[0] # Extract again if needed + print(f"DEBUG radius_neighbors: fit_x_array after second extraction: {type(fit_x_array)}", file=sys.stderr) + + # Temporarily set _fit_X to the extracted array since sklearn accesses it directly + original_fit_x = self._fit_X + self._fit_X = fit_x_array + + # Debug the _y value and handle potential tuple + y_value = getattr(self, "_y", None) + if isinstance(y_value, tuple): + print(f"DEBUG: _y is tuple, extracting: {type(y_value)}", file=sys.stderr) + y_value = y_value[0] if y_value[0] is not None else None + print(f"DEBUG: _y value type: {type(y_value)}, value: {y_value}", file=sys.stderr) + + try: + # Call _fit directly to avoid any preprocessing in fit() that might create tuples + _sklearn_NearestNeighbors._fit(self, fit_x_array, y_value) + finally: + # Restore original _fit_X + self._fit_X = original_fit_x 
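The tuple guards added across the hunks above all normalize the same failure mode: a conversion helper that returns an (X, y) pair gets stored whole into _fit_X, so later consumers that expect a bare array receive a tuple such as (array, None). A minimal, self-contained sketch of that failure and of the normalization rule these patches apply (pack_xy and ensure_array are illustrative names, not sklearnex or oneDAL API):

    import numpy as np

    def pack_xy(X, y=None):
        # stands in for a conversion step that always returns a pair
        return np.asarray(X), y

    def ensure_array(value):
        # same rule used in the patches: fit_x[0] if isinstance(fit_x, tuple) else fit_x
        return value[0] if isinstance(value, tuple) else value

    X = np.arange(6.0).reshape(3, 2)
    fit_X = pack_xy(X)           # stored without unpacking -> (array, None)
    assert isinstance(fit_X, tuple)
    fit_X = ensure_array(fit_X)  # defensive extraction restores the array
    assert fit_X.shape == (3, 2)
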
else: print("DEBUG: NOT entering the fit_x handling block - using default path", file=sys.stderr) # ALWAYS handle potential tuple in _fit_X for robustness From 2f834d05ab0b379dd7de5a8b700571eefd8ff534 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 13 Oct 2025 12:16:52 -0700 Subject: [PATCH 20/87] fix: more print --- onedal/neighbors/neighbors.py | 48 ++++++++++++++--- sklearnex/neighbors/knn_unsupervised.py | 72 ++++++++++++++++++++++--- 2 files changed, 106 insertions(+), 14 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index 9512521db0..bd8b9f67f9 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -110,7 +110,10 @@ def __init__( self.metric_params = metric_params def _fit(self, X, y): - print(f"DEBUG oneDAL _fit start: X type = {type(X)}", file=sys.stderr) + print(f"DEBUG oneDAL _fit START - ENTRY PARAMETERS:", file=sys.stderr) + print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f" y type: {type(y)}, y shape: {getattr(y, 'shape', 'NO_SHAPE')}", file=sys.stderr) + self._onedal_model = None self._tree = None self._shape = None @@ -146,22 +149,34 @@ def _fit(self, X, y): self.n_samples_fit_ = X.shape[0] self.n_features_in_ = X.shape[1] + + print(f"DEBUG oneDAL _fit BEFORE setting _fit_X:", file=sys.stderr) + print(f" X type: {type(X)}, isinstance(X, tuple): {isinstance(X, tuple)}", file=sys.stderr) + # Ensure _fit_X is always an array, never a tuple if isinstance(X, tuple): print(f"DEBUG oneDAL _fit: X is tuple, extracting first element: {type(X)}", file=sys.stderr) self._fit_X = X[0] else: self._fit_X = X - print(f"DEBUG oneDAL _fit: setting _fit_X = {type(self._fit_X)}, shape = {self._fit_X.shape}", file=sys.stderr) + + print(f"DEBUG oneDAL _fit AFTER setting _fit_X:", file=sys.stderr) + print(f" self._fit_X type: {type(self._fit_X)}, shape: {getattr(self._fit_X, 'shape', 'NO_SHAPE')}", file=sys.stderr) _fit_y = None queue = QM.get_global_queue() gpu_device = queue is not None and queue.sycl_device.is_gpu + print(f"DEBUG oneDAL _fit BEFORE calling _onedal_fit:", file=sys.stderr) + print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f" _fit_y type: {type(_fit_y)}, _fit_y shape: {getattr(_fit_y, 'shape', 'NO_SHAPE')}", file=sys.stderr) + if _is_classifier(self) or (_is_regressor(self) and gpu_device): _fit_y = y.astype(X.dtype).reshape((-1, 1)) if y is not None else None result = self._onedal_fit(X, _fit_y) - print(f"DEBUG oneDAL _fit: after _onedal_fit, _fit_X type = {type(self._fit_X)}", file=sys.stderr) + + print(f"DEBUG oneDAL _fit AFTER _onedal_fit:", file=sys.stderr) + print(f" self._fit_X type: {type(self._fit_X)}, shape: {getattr(self._fit_X, 'shape', 'NO_SHAPE')}", file=sys.stderr) if y is not None and _is_regressor(self): self._y = y if self._shape is None else xp.reshape(y, self._shape) @@ -499,15 +514,32 @@ def train(self, *args, **kwargs): ... def infer(self, *arg, **kwargs): ... 
def _onedal_fit(self, X, y): - print(f"DEBUG NearestNeighbors _onedal_fit: X type = {type(X)}, y type = {type(y)}", file=sys.stderr) + print(f"DEBUG NearestNeighbors _onedal_fit START - ENTRY PARAMETERS:", file=sys.stderr) + print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f" y type: {type(y)}, y shape: {getattr(y, 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f" self._fit_X BEFORE to_table: type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + # global queue is set as per user configuration (`target_offload`) or from data prior to calling this internal function queue = QM.get_global_queue() params = self._get_onedal_params(X, y) - print(f"DEBUG NearestNeighbors _onedal_fit: before to_table - X type = {type(X)}, y type = {type(y)}", file=sys.stderr) + + print(f"DEBUG NearestNeighbors _onedal_fit BEFORE to_table:", file=sys.stderr) + print(f" X type: {type(X)}, isinstance(X, tuple): {isinstance(X, tuple)}", file=sys.stderr) + print(f" y type: {type(y)}, isinstance(y, tuple): {isinstance(y, tuple)}", file=sys.stderr) + X, y = to_table(X, y, queue=queue) - print(f"DEBUG NearestNeighbors _onedal_fit: after to_table - X type = {type(X)}, y type = {type(y)}", file=sys.stderr) - print(f"DEBUG NearestNeighbors _onedal_fit: self._fit_X type = {type(self._fit_X)}", file=sys.stderr) - return self.train(params, X).model + + print(f"DEBUG NearestNeighbors _onedal_fit AFTER to_table - CRITICAL POINT:", file=sys.stderr) + print(f" X type: {type(X)}, isinstance(X, tuple): {isinstance(X, tuple)}", file=sys.stderr) + print(f" y type: {type(y)}, isinstance(y, tuple): {isinstance(y, tuple)}", file=sys.stderr) + print(f" self._fit_X AFTER to_table: type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + + result = self.train(params, X).model + + print(f"DEBUG NearestNeighbors _onedal_fit AFTER train:", file=sys.stderr) + print(f" self._fit_X type: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + + return result def _onedal_predict(self, model, X, params): X = to_table(X, queue=QM.get_global_queue()) diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 58978e65c5..6093e60acf 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -61,6 +61,10 @@ def __init__( ) def fit(self, X, y=None): + print(f"DEBUG fit START - ENTRY PARAMETERS:", file=sys.stderr) + print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f" y type: {type(y)}, y shape: {getattr(y, 'shape', 'NO_SHAPE')}", file=sys.stderr) + dispatch( self, "fit", @@ -71,6 +75,11 @@ def fit(self, X, y=None): X, None, ) + + print(f"DEBUG fit AFTER dispatch:", file=sys.stderr) + print(f" self._fit_X type: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print(f" self._fit_X shape: {getattr(getattr(self, '_fit_X', None), 'shape', 'NO_SHAPE')}", file=sys.stderr) + return self @wrap_output_data @@ -104,9 +113,14 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): def radius_neighbors( self, X=None, radius=None, return_distance=True, sort_results=False ): - print(f"DEBUG radius_neighbors start: hasattr _onedal_estimator: {hasattr(self, '_onedal_estimator')}", file=sys.stderr) - print(f"DEBUG radius_neighbors start: _tree: {getattr(self, '_tree', 'NOT_SET')}", file=sys.stderr) - print(f"DEBUG radius_neighbors start: _fit_method: {getattr(self, '_fit_method', 'NOT_SET')}", file=sys.stderr) + print(f"DEBUG 
radius_neighbors START - ENTRY PARAMETERS:", file=sys.stderr) + print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f" radius: {radius}, return_distance: {return_distance}, sort_results: {sort_results}", file=sys.stderr) + print(f" self._fit_X type: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print(f" self._fit_X shape: {getattr(getattr(self, '_fit_X', None), 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f" hasattr _onedal_estimator: {hasattr(self, '_onedal_estimator')}", file=sys.stderr) + print(f" _tree: {getattr(self, '_tree', 'NOT_SET')}", file=sys.stderr) + print(f" _fit_method: {getattr(self, '_fit_method', 'NOT_SET')}", file=sys.stderr) # Check the condition logic has_onedal = hasattr(self, "_onedal_estimator") @@ -159,6 +173,13 @@ def radius_neighbors( print("DEBUG fallback path: _fit_X is tuple, extracting first element", file=sys.stderr) self._fit_X = fit_x[0] check_is_fitted(self) + + print(f"DEBUG radius_neighbors BEFORE DISPATCH:", file=sys.stderr) + print(f" self._fit_X type: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print(f" self._fit_X shape: {getattr(getattr(self, '_fit_X', None), 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f" radius: {radius}, return_distance: {return_distance}, sort_results: {sort_results}", file=sys.stderr) + return dispatch( self, "radius_neighbors", @@ -175,12 +196,23 @@ def radius_neighbors( def radius_neighbors_graph( self, X=None, radius=None, mode="connectivity", sort_results=False ): - print(f"DEBUG radius_neighbors_graph start: _fit_X type: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print(f"DEBUG radius_neighbors_graph START - ENTRY PARAMETERS:", file=sys.stderr) + print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f" radius: {radius}, mode: {mode}, sort_results: {sort_results}", file=sys.stderr) + print(f" self._fit_X type: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print(f" self._fit_X shape: {getattr(getattr(self, '_fit_X', None), 'shape', 'NO_SHAPE')}", file=sys.stderr) + # Handle potential tuple in _fit_X before calling dispatch if hasattr(self, '_fit_X') and isinstance(self._fit_X, tuple): print("DEBUG radius_neighbors_graph: _fit_X is tuple, extracting first element", file=sys.stderr) self._fit_X = self._fit_X[0] + print(f"DEBUG radius_neighbors_graph BEFORE DISPATCH:", file=sys.stderr) + print(f" self._fit_X type: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print(f" self._fit_X shape: {getattr(getattr(self, '_fit_X', None), 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f" radius: {radius}, mode: {mode}, sort_results: {sort_results}", file=sys.stderr) + return dispatch( self, "radius_neighbors_graph", @@ -195,8 +227,16 @@ def radius_neighbors_graph( ) def _onedal_fit(self, X, y=None, queue=None): + print(f"DEBUG _onedal_fit START - ENTRY PARAMETERS:", file=sys.stderr) + print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f" y type: {type(y)}, y shape: {getattr(y, 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f" queue: {queue}", file=sys.stderr) + # Perform preprocessing at sklearnex level X, _ = self._validate_data(X, dtype=[np.float64, np.float32], accept_sparse=True) + + print(f"DEBUG _onedal_fit AFTER 
_validate_data:", file=sys.stderr) + print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) # Validate n_neighbors self._validate_n_neighbors(self.n_neighbors) @@ -223,8 +263,16 @@ def _onedal_fit(self, X, y=None, queue=None): # Set attributes on the onedal estimator self._onedal_estimator.classes_ = self.classes_ + print(f"DEBUG _onedal_fit BEFORE calling onedal_estimator.fit:", file=sys.stderr) + print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f" y type: {type(y)}, y shape: {getattr(y, 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f" queue: {queue}", file=sys.stderr) + self._onedal_estimator.fit(X, y, queue=queue) + print(f"DEBUG _onedal_fit AFTER calling onedal_estimator.fit:", file=sys.stderr) + print(f" onedal_estimator._fit_X type: {type(getattr(self._onedal_estimator, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + self._save_attributes() def _onedal_predict(self, X, queue=None): @@ -238,15 +286,27 @@ def _onedal_kneighbors( ) def _save_attributes(self): - print(f"DEBUG: _save_attributes - _fit_X type: {type(self._onedal_estimator._fit_X)}", file=sys.stderr) + print(f"DEBUG _save_attributes START:", file=sys.stderr) + print(f" onedal_estimator._fit_X type: {type(getattr(self._onedal_estimator, '_fit_X', 'NOT_SET'))}", file=sys.stderr) if hasattr(self._onedal_estimator, '_fit_X'): - print(f"DEBUG: _fit_X value preview: {str(self._onedal_estimator._fit_X)[:200]}", file=sys.stderr) + fit_x_preview = str(self._onedal_estimator._fit_X)[:200] + print(f" onedal_estimator._fit_X value preview: {fit_x_preview}", file=sys.stderr) self.classes_ = self._onedal_estimator.classes_ self.n_features_in_ = self._onedal_estimator.n_features_in_ self.n_samples_fit_ = self._onedal_estimator.n_samples_fit_ fit_x = self._onedal_estimator._fit_X + + print(f"DEBUG _save_attributes processing _fit_X:", file=sys.stderr) + print(f" fit_x type: {type(fit_x)}", file=sys.stderr) + print(f" isinstance(fit_x, tuple): {isinstance(fit_x, tuple)}", file=sys.stderr) + self._fit_X = fit_x[0] if isinstance(fit_x, tuple) else fit_x + + print(f"DEBUG _save_attributes AFTER processing:", file=sys.stderr) + print(f" self._fit_X type: {type(self._fit_X)}", file=sys.stderr) + print(f" self._fit_X shape: {getattr(self._fit_X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + self._fit_method = self._onedal_estimator._fit_method self._tree = self._onedal_estimator._tree From dcf5b43b812d968ccf9ec4ce56344aa216cdf62f Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 13 Oct 2025 14:13:12 -0700 Subject: [PATCH 21/87] fix: test fix for tuyple issue --- onedal/neighbors/neighbors.py | 6 +- sklearnex/neighbors/knn_unsupervised.py | 79 ++++++++++--------------- 2 files changed, 36 insertions(+), 49 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index bd8b9f67f9..7512ede0bc 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -153,10 +153,12 @@ def _fit(self, X, y): print(f"DEBUG oneDAL _fit BEFORE setting _fit_X:", file=sys.stderr) print(f" X type: {type(X)}, isinstance(X, tuple): {isinstance(X, tuple)}", file=sys.stderr) - # Ensure _fit_X is always an array, never a tuple + # CRITICAL FIX: Ensure _fit_X is always an array, never a tuple + # This is essential because sklearn's _fit method reads from self._fit_X directly if isinstance(X, tuple): print(f"DEBUG oneDAL _fit: X is tuple, extracting first element: {type(X)}", file=sys.stderr) - self._fit_X = X[0] + # Extract the actual array 
from tuple created by from_table/to_table + self._fit_X = X[0] if X[0] is not None else X[1] else: self._fit_X = X diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 6093e60acf..433b2f31a2 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -122,56 +122,35 @@ def radius_neighbors( print(f" _tree: {getattr(self, '_tree', 'NOT_SET')}", file=sys.stderr) print(f" _fit_method: {getattr(self, '_fit_method', 'NOT_SET')}", file=sys.stderr) - # Check the condition logic - has_onedal = hasattr(self, "_onedal_estimator") - tree_is_none = getattr(self, "_tree", 0) is None - is_kd_tree = getattr(self, "_fit_method", None) == "kd_tree" - print(f"DEBUG: has_onedal={has_onedal}, tree_is_none={tree_is_none}, is_kd_tree={is_kd_tree}", file=sys.stderr) - - condition_met = has_onedal or (tree_is_none and is_kd_tree) - print(f"DEBUG: condition_met={condition_met}", file=sys.stderr) + # Preprocessing for X parameter (same as kneighbors) + if X is not None: + check_feature_names(self, X, reset=False) + # Perform preprocessing at sklearnex level + from onedal.utils.validation import _check_array + + X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + self._validate_feature_count(X, "radius_neighbors") - if condition_met: - print("DEBUG: Entering the fit_x handling block", file=sys.stderr) - # Handle potential tuple in _fit_X (same as _save_attributes logic) - fit_x = self._fit_X - print(f"DEBUG radius_neighbors: _fit_X type: {type(fit_x)}", file=sys.stderr) - print(f"DEBUG radius_neighbors: _fit_X shape/content: {fit_x.shape if hasattr(fit_x, 'shape') else fit_x}", file=sys.stderr) - fit_x_array = fit_x[0] if isinstance(fit_x, tuple) else fit_x - print(f"DEBUG radius_neighbors: fit_x_array type: {type(fit_x_array)}", file=sys.stderr) - - # Additional safety check - ensure fit_x_array is not a tuple - if isinstance(fit_x_array, tuple): - print(f"DEBUG radius_neighbors: fit_x_array is still tuple after extraction: {type(fit_x_array)}", file=sys.stderr) - fit_x_array = fit_x_array[0] # Extract again if needed - print(f"DEBUG radius_neighbors: fit_x_array after second extraction: {type(fit_x_array)}", file=sys.stderr) - - # Temporarily set _fit_X to the extracted array since sklearn accesses it directly - original_fit_x = self._fit_X - self._fit_X = fit_x_array + # Original OneDAL refactoring condition with debug + if ( + hasattr(self, "_onedal_estimator") + or getattr(self, "_tree", 0) is None + and getattr(self, "_fit_method", None) == "kd_tree" + ): + print("DEBUG: Condition met - calling sklearn fit for preprocessing", file=sys.stderr) - # Debug the _y value and handle potential tuple - y_value = getattr(self, "_y", None) - if isinstance(y_value, tuple): - print(f"DEBUG: _y is tuple, extracting: {type(y_value)}", file=sys.stderr) - y_value = y_value[0] if y_value[0] is not None else None - print(f"DEBUG: _y value type: {type(y_value)}, value: {y_value}", file=sys.stderr) + # Ensure _fit_X is not a tuple before sklearn accesses it + fit_x_for_sklearn = self._fit_X + if isinstance(self._fit_X, tuple): + print("DEBUG radius_neighbors: _fit_X is tuple, extracting first element for sklearn fit", file=sys.stderr) + fit_x_for_sklearn = self._fit_X[0] - try: - # Call _fit directly to avoid any preprocessing in fit() that might create tuples - _sklearn_NearestNeighbors._fit(self, fit_x_array, y_value) - finally: - # Restore original _fit_X - self._fit_X = original_fit_x + print(f"DEBUG: Calling 
_sklearn_NearestNeighbors.fit with fit_x_for_sklearn type: {type(fit_x_for_sklearn)}", file=sys.stderr) + _sklearn_NearestNeighbors.fit(self, fit_x_for_sklearn, getattr(self, "_y", None)) + print("DEBUG: sklearn fit completed", file=sys.stderr) else: - print("DEBUG: NOT entering the fit_x handling block - using default path", file=sys.stderr) - # ALWAYS handle potential tuple in _fit_X for robustness - if hasattr(self, '_fit_X'): - fit_x = self._fit_X - print(f"DEBUG fallback path: _fit_X type: {type(fit_x)}", file=sys.stderr) - if isinstance(fit_x, tuple): - print("DEBUG fallback path: _fit_X is tuple, extracting first element", file=sys.stderr) - self._fit_X = fit_x[0] + print("DEBUG: Condition NOT met - skipping sklearn fit", file=sys.stderr) + check_is_fitted(self) print(f"DEBUG radius_neighbors BEFORE DISPATCH:", file=sys.stderr) @@ -301,7 +280,13 @@ def _save_attributes(self): print(f" fit_x type: {type(fit_x)}", file=sys.stderr) print(f" isinstance(fit_x, tuple): {isinstance(fit_x, tuple)}", file=sys.stderr) - self._fit_X = fit_x[0] if isinstance(fit_x, tuple) else fit_x + # CRITICAL FIX: OneDAL's to_table() can return tuples (array, None) in recursive calls + # We must extract the actual array for sklearn compatibility + if isinstance(fit_x, tuple): + print(f"DEBUG _save_attributes: fit_x is tuple, extracting array from: {fit_x}", file=sys.stderr) + self._fit_X = fit_x[0] # Extract the array from (array, None) tuple + else: + self._fit_X = fit_x print(f"DEBUG _save_attributes AFTER processing:", file=sys.stderr) print(f" self._fit_X type: {type(self._fit_X)}", file=sys.stderr) From 9c656478313d2d44caabbd4760dc547f9a5d4abd Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 13 Oct 2025 14:46:58 -0700 Subject: [PATCH 22/87] fix: test fix for tuyple issue --- sklearnex/neighbors/knn_unsupervised.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 433b2f31a2..ebf0391264 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -139,14 +139,14 @@ def radius_neighbors( ): print("DEBUG: Condition met - calling sklearn fit for preprocessing", file=sys.stderr) - # Ensure _fit_X is not a tuple before sklearn accesses it - fit_x_for_sklearn = self._fit_X + # CRITICAL FIX: Ensure _fit_X is properly extracted from tuple if needed + # This is essential because sklearn's fit method accesses self._fit_X directly if isinstance(self._fit_X, tuple): - print("DEBUG radius_neighbors: _fit_X is tuple, extracting first element for sklearn fit", file=sys.stderr) - fit_x_for_sklearn = self._fit_X[0] + print("DEBUG radius_neighbors: _fit_X is tuple, permanently extracting first element", file=sys.stderr) + self._fit_X = self._fit_X[0] # Fix the attribute permanently - print(f"DEBUG: Calling _sklearn_NearestNeighbors.fit with fit_x_for_sklearn type: {type(fit_x_for_sklearn)}", file=sys.stderr) - _sklearn_NearestNeighbors.fit(self, fit_x_for_sklearn, getattr(self, "_y", None)) + print(f"DEBUG: Calling _sklearn_NearestNeighbors.fit with self._fit_X type: {type(self._fit_X)}", file=sys.stderr) + _sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, "_y", None)) print("DEBUG: sklearn fit completed", file=sys.stderr) else: print("DEBUG: Condition NOT met - skipping sklearn fit", file=sys.stderr) @@ -183,8 +183,8 @@ def radius_neighbors_graph( # Handle potential tuple in _fit_X before calling dispatch if hasattr(self, '_fit_X') and 
isinstance(self._fit_X, tuple): - print("DEBUG radius_neighbors_graph: _fit_X is tuple, extracting first element", file=sys.stderr) - self._fit_X = self._fit_X[0] + print("DEBUG radius_neighbors_graph: _fit_X is tuple, permanently extracting first element", file=sys.stderr) + self._fit_X = self._fit_X[0] # Fix the attribute permanently print(f"DEBUG radius_neighbors_graph BEFORE DISPATCH:", file=sys.stderr) print(f" self._fit_X type: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) From b33834d0d212713405b3e8b2cc1955475748e27a Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 13 Oct 2025 16:35:19 -0700 Subject: [PATCH 23/87] fix: try add validation --- sklearnex/neighbors/knn_unsupervised.py | 42 ++++++++++++++++++++----- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index ebf0391264..d00d8bdedf 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -85,6 +85,12 @@ def fit(self, X, y=None): @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): check_is_fitted(self) + + # CRITICAL FIRST: Ensure _fit_X is always an array before any sklearn operations + if hasattr(self, '_fit_X') and isinstance(self._fit_X, tuple): + print("DEBUG kneighbors: PREVENTIVE FIX - _fit_X is tuple, permanently extracting first element", file=sys.stderr) + self._fit_X = self._fit_X[0] # Fix the attribute permanently + if X is not None: check_feature_names(self, X, reset=False) # Perform preprocessing at sklearnex level @@ -122,6 +128,11 @@ def radius_neighbors( print(f" _tree: {getattr(self, '_tree', 'NOT_SET')}", file=sys.stderr) print(f" _fit_method: {getattr(self, '_fit_method', 'NOT_SET')}", file=sys.stderr) + # CRITICAL FIRST: Ensure _fit_X is always an array before any sklearn operations + if hasattr(self, '_fit_X') and isinstance(self._fit_X, tuple): + print("DEBUG radius_neighbors: PREVENTIVE FIX - _fit_X is tuple, permanently extracting first element", file=sys.stderr) + self._fit_X = self._fit_X[0] # Fix the attribute permanently + # Preprocessing for X parameter (same as kneighbors) if X is not None: check_feature_names(self, X, reset=False) @@ -131,7 +142,7 @@ def radius_neighbors( X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) self._validate_feature_count(X, "radius_neighbors") - # Original OneDAL refactoring condition with debug + # Original OneDAL refactoring condition with proper validation if ( hasattr(self, "_onedal_estimator") or getattr(self, "_tree", 0) is None @@ -139,14 +150,31 @@ def radius_neighbors( ): print("DEBUG: Condition met - calling sklearn fit for preprocessing", file=sys.stderr) + # Use sklearnex-level validation instead of raw OneDAL data + # This ensures we have proper arrays, not tuples from OneDAL processing + fit_x_for_sklearn = getattr(self, "_fit_X", None) + fit_y_for_sklearn = getattr(self, "_y", None) + + # Apply sklearnex-level validation to ensure proper data format + if fit_x_for_sklearn is not None: + # Use the refactored _validate_data method from KNeighborsDispatchingBase + fit_x_for_sklearn, _ = self._validate_data( + fit_x_for_sklearn, dtype=[np.float64, np.float32], accept_sparse=True + ) + # CRITICAL FIX: Ensure _fit_X is properly extracted from tuple if needed - # This is essential because sklearn's fit method accesses self._fit_X directly - if isinstance(self._fit_X, tuple): - print("DEBUG radius_neighbors: _fit_X is tuple, permanently 
extracting first element", file=sys.stderr) - self._fit_X = self._fit_X[0] # Fix the attribute permanently + if isinstance(fit_x_for_sklearn, tuple): + print("DEBUG radius_neighbors: fit_x_for_sklearn is tuple, extracting first element", file=sys.stderr) + fit_x_for_sklearn = fit_x_for_sklearn[0] + + # Update the main attribute to ensure consistency + self._fit_X = fit_x_for_sklearn + + print(f"DEBUG: Calling _sklearn_NearestNeighbors.fit with validated data", file=sys.stderr) + print(f" fit_x_for_sklearn type: {type(fit_x_for_sklearn)}", file=sys.stderr) + print(f" fit_y_for_sklearn type: {type(fit_y_for_sklearn)}", file=sys.stderr) - print(f"DEBUG: Calling _sklearn_NearestNeighbors.fit with self._fit_X type: {type(self._fit_X)}", file=sys.stderr) - _sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, "_y", None)) + _sklearn_NearestNeighbors.fit(self, fit_x_for_sklearn, fit_y_for_sklearn) print("DEBUG: sklearn fit completed", file=sys.stderr) else: print("DEBUG: Condition NOT met - skipping sklearn fit", file=sys.stderr) From 96762db04add7a074bb99c0c24db9be2ed3733fa Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 13 Oct 2025 17:10:08 -0700 Subject: [PATCH 24/87] fix: try restore neighbors funcitons --- onedal/neighbors/neighbors.py | 228 +++++++++++++++++++++++++--------- 1 file changed, 166 insertions(+), 62 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index 7512ede0bc..e43d4b7339 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -15,6 +15,7 @@ # ============================================================================== from abc import ABCMeta, abstractmethod +from numbers import Integral import numpy as np import sys @@ -28,7 +29,14 @@ from ..common._mixin import ClassifierMixin, RegressorMixin from ..datatypes import from_table, to_table from ..utils._array_api import _get_sycl_namespace -from ..utils.validation import _num_samples +from ..utils.validation import ( + _check_array, + _check_classification_targets, + _check_n_features, + _check_X_y, + _column_or_1d, + _num_samples, +) class NeighborsCommonBase(metaclass=ABCMeta): @@ -43,6 +51,23 @@ def __init__(self): self.effective_metric_params_ = None self._onedal_model = None + def _parse_auto_method(self, method, n_samples, n_features): + result_method = method + + if method in ["auto", "ball_tree"]: + condition = ( + self.n_neighbors is not None and self.n_neighbors >= n_samples // 2 + ) + if self.metric == "precomputed" or n_features > 15 or condition: + result_method = "brute" + else: + if self.metric == "euclidean": + result_method = "kd_tree" + else: + result_method = "brute" + + return result_method + @abstractmethod def train(self, *args, **kwargs): ... @@ -52,6 +77,66 @@ def infer(self, *args, **kwargs): ... @abstractmethod def _onedal_fit(self, X, y): ... + def _validate_data( + self, X, y=None, reset=True, validate_separately=None, **check_params + ): + if y is None: + if self.requires_y: + raise ValueError( + f"This {self.__class__.__name__} estimator " + f"requires y to be passed, but the target y is None." 
+ ) + X = _check_array(X, **check_params) + out = X, y + else: + if validate_separately: + # We need this because some estimators validate X and y + # separately, and in general, separately calling _check_array() + # on X and y isn't equivalent to just calling _check_X_y() + # :( + check_X_params, check_y_params = validate_separately + X = _check_array(X, **check_X_params) + y = _check_array(y, **check_y_params) + else: + X, y = _check_X_y(X, y, **check_params) + out = X, y + + if check_params.get("ensure_2d", True): + _check_n_features(self, X, reset=reset) + + return out + + def _get_weights(self, dist, weights): + if weights in (None, "uniform"): + return None + if weights == "distance": + # if user attempts to classify a point that was zero distance from one + # or more training points, those training points are weighted as 1.0 + # and the other points as 0.0 + if dist.dtype is np.dtype(object): + for point_dist_i, point_dist in enumerate(dist): + # check if point_dist is iterable + # (ex: RadiusNeighborClassifier.predict may set an element of + # dist to 1e-6 to represent an 'outlier') + if hasattr(point_dist, "__contains__") and 0.0 in point_dist: + dist[point_dist_i] = point_dist == 0.0 + else: + dist[point_dist_i] = 1.0 / point_dist + else: + with np.errstate(divide="ignore"): + dist = 1.0 / dist + inf_mask = np.isinf(dist) + inf_row = np.any(inf_mask, axis=1) + dist[inf_row] = inf_mask[inf_row] + return dist + elif callable(weights): + return weights(dist) + else: + raise ValueError( + "weights not recognized: should be 'uniform', " + "'distance', or a callable function" + ) + def _get_onedal_params(self, X, y=None, n_neighbors=None): class_count = 0 if self.classes_ is None else len(self.classes_) weights = getattr(self, "weights", "uniform") @@ -61,25 +146,8 @@ def _get_onedal_params(self, X, y=None, n_neighbors=None): p = 2.0 else: p = self.p - - # Handle different input types for dtype - try: - fptype = X.dtype - except AttributeError: - # For pandas DataFrames or other types without dtype attribute - import numpy as np - - fptype = np.float64 - - # _fit_method should be set by sklearnex level before calling oneDAL - if not hasattr(self, "_fit_method") or self._fit_method is None: - raise ValueError( - "_fit_method must be set by sklearnex level before calling oneDAL. " - "This indicates improper usage - oneDAL neighbors should not be called directly." 
- ) - return { - "fptype": fptype, + "fptype": X.dtype, "vote_weights": "uniform" if weights == "uniform" else "distance", "method": self._fit_method, "radius": self.radius, @@ -109,6 +177,21 @@ def __init__( self.p = p self.metric_params = metric_params + def _validate_targets(self, y, dtype): + arr = _column_or_1d(y, warn=True) + + try: + return arr.astype(dtype, copy=False) + except ValueError: + return arr + + def _validate_n_classes(self): + length = 0 if self.classes_ is None else len(self.classes_) + if length < 2: + raise ValueError( + f"The number of classes has to be greater than one; got {length}" + ) + def _fit(self, X, y): print(f"DEBUG oneDAL _fit START - ENTRY PARAMETERS:", file=sys.stderr) print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) @@ -124,8 +207,13 @@ def _fit(self, X, y): ) _, xp, _ = _get_sycl_namespace(X) + use_raw_input = _get_config().get("use_raw_input", False) is True if y is not None or self.requires_y: shape = getattr(y, "shape", None) + if not use_raw_input: + X, y = super()._validate_data( + X, y, dtype=[np.float64, np.float32], accept_sparse="csr" + ) self._shape = shape if shape is not None else y.shape if _is_classifier(self): @@ -135,6 +223,7 @@ def _fit(self, X, y): else: self.outputs_2d_ = True + _check_classification_targets(y) self.classes_ = [] self._y = np.empty(y.shape, dtype=int) for k in range(self._y.shape[1]): @@ -144,26 +233,29 @@ def _fit(self, X, y): if not self.outputs_2d_: self.classes_ = self.classes_[0] self._y = self._y.ravel() + + self._validate_n_classes() else: self._y = y + elif not use_raw_input: + X, _ = super()._validate_data(X, dtype=[np.float64, np.float32]) self.n_samples_fit_ = X.shape[0] self.n_features_in_ = X.shape[1] - - print(f"DEBUG oneDAL _fit BEFORE setting _fit_X:", file=sys.stderr) - print(f" X type: {type(X)}, isinstance(X, tuple): {isinstance(X, tuple)}", file=sys.stderr) - - # CRITICAL FIX: Ensure _fit_X is always an array, never a tuple - # This is essential because sklearn's _fit method reads from self._fit_X directly - if isinstance(X, tuple): - print(f"DEBUG oneDAL _fit: X is tuple, extracting first element: {type(X)}", file=sys.stderr) - # Extract the actual array from tuple created by from_table/to_table - self._fit_X = X[0] if X[0] is not None else X[1] - else: - self._fit_X = X - - print(f"DEBUG oneDAL _fit AFTER setting _fit_X:", file=sys.stderr) - print(f" self._fit_X type: {type(self._fit_X)}, shape: {getattr(self._fit_X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + self._fit_X = X + + if self.n_neighbors is not None: + if self.n_neighbors <= 0: + raise ValueError("Expected n_neighbors > 0. 
Got %d" % self.n_neighbors) + if not isinstance(self.n_neighbors, Integral): + raise TypeError( + "n_neighbors does not take %s value, " + "enter integer value" % type(self.n_neighbors) + ) + + self._fit_method = super()._parse_auto_method( + self.algorithm, self.n_samples_fit_, self.n_features_in_ + ) _fit_y = None queue = QM.get_global_queue() @@ -174,7 +266,7 @@ def _fit(self, X, y): print(f" _fit_y type: {type(_fit_y)}, _fit_y shape: {getattr(_fit_y, 'shape', 'NO_SHAPE')}", file=sys.stderr) if _is_classifier(self) or (_is_regressor(self) and gpu_device): - _fit_y = y.astype(X.dtype).reshape((-1, 1)) if y is not None else None + _fit_y = self._validate_targets(self._y, X.dtype).reshape((-1, 1)) result = self._onedal_fit(X, _fit_y) print(f"DEBUG oneDAL _fit AFTER _onedal_fit:", file=sys.stderr) @@ -189,13 +281,35 @@ def _fit(self, X, y): return result def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): + use_raw_input = _get_config().get("use_raw_input", False) is True + n_features = getattr(self, "n_features_in_", None) + shape = getattr(X, "shape", None) + if n_features and shape and len(shape) > 1 and shape[1] != n_features: + raise ValueError( + ( + f"X has {X.shape[1]} features, " + f"but kneighbors is expecting " + f"{n_features} features as input" + ) + ) + _check_is_fitted(self) if n_neighbors is None: n_neighbors = self.n_neighbors + elif n_neighbors <= 0: + raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors) + else: + if not isinstance(n_neighbors, Integral): + raise TypeError( + "n_neighbors does not take %s value, " + "enter integer value" % type(n_neighbors) + ) if X is not None: query_is_train = False + if not use_raw_input: + X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) else: query_is_train = True X = self._fit_X @@ -204,12 +318,24 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors += 1 n_samples_fit = self.n_samples_fit_ + if n_neighbors > n_samples_fit: + if query_is_train: + n_neighbors -= 1 # ok to modify inplace because an error is raised + inequality_str = "n_neighbors < n_samples_fit" + else: + inequality_str = "n_neighbors <= n_samples_fit" + raise ValueError( + f"Expected {inequality_str}, but " + f"n_neighbors = {n_neighbors}, n_samples_fit = {n_samples_fit}, " + f"n_samples = {X.shape[0]}" # include n_samples for common tests + ) chunked_results = None - # Use the fit method determined at sklearnex level - method = getattr(self, "_fit_method", "brute") + method = self._parse_auto_method( + self._fit_method, self.n_samples_fit_, n_features + ) - params = self._get_onedal_params(X, n_neighbors=n_neighbors) + params = super()._get_onedal_params(X, n_neighbors=n_neighbors) prediction_results = self._onedal_predict(self._onedal_model, X, params) distances = from_table(prediction_results.distances) indices = from_table(prediction_results.indices) @@ -492,7 +618,6 @@ def __init__( self, n_neighbors=5, *, - weights="uniform", algorithm="auto", p=2, metric="minkowski", @@ -507,7 +632,7 @@ def __init__( metric_params=metric_params, **kwargs, ) - self.weights = weights + self.requires_y = False @bind_default_backend("neighbors.search") def train(self, *args, **kwargs): ... @@ -516,32 +641,11 @@ def train(self, *args, **kwargs): ... def infer(self, *arg, **kwargs): ... 
def _onedal_fit(self, X, y): - print(f"DEBUG NearestNeighbors _onedal_fit START - ENTRY PARAMETERS:", file=sys.stderr) - print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f" y type: {type(y)}, y shape: {getattr(y, 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f" self._fit_X BEFORE to_table: type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) - # global queue is set as per user configuration (`target_offload`) or from data prior to calling this internal function queue = QM.get_global_queue() params = self._get_onedal_params(X, y) - - print(f"DEBUG NearestNeighbors _onedal_fit BEFORE to_table:", file=sys.stderr) - print(f" X type: {type(X)}, isinstance(X, tuple): {isinstance(X, tuple)}", file=sys.stderr) - print(f" y type: {type(y)}, isinstance(y, tuple): {isinstance(y, tuple)}", file=sys.stderr) - X, y = to_table(X, y, queue=queue) - - print(f"DEBUG NearestNeighbors _onedal_fit AFTER to_table - CRITICAL POINT:", file=sys.stderr) - print(f" X type: {type(X)}, isinstance(X, tuple): {isinstance(X, tuple)}", file=sys.stderr) - print(f" y type: {type(y)}, isinstance(y, tuple): {isinstance(y, tuple)}", file=sys.stderr) - print(f" self._fit_X AFTER to_table: type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) - - result = self.train(params, X).model - - print(f"DEBUG NearestNeighbors _onedal_fit AFTER train:", file=sys.stderr) - print(f" self._fit_X type: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) - - return result + return self.train(params, X).model def _onedal_predict(self, model, X, params): X = to_table(X, queue=QM.get_global_queue()) From cc2293cbce016e62399908f1377c5d1ec3f950fe Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 13 Oct 2025 23:40:17 -0700 Subject: [PATCH 25/87] fix: test restore --- sklearnex/neighbors/knn_unsupervised.py | 73 +++---------------------- 1 file changed, 7 insertions(+), 66 deletions(-) diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index d00d8bdedf..2195254a69 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -93,15 +93,6 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): if X is not None: check_feature_names(self, X, reset=False) - # Perform preprocessing at sklearnex level - from onedal.utils.validation import _check_array - - X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) - self._validate_feature_count(X, "kneighbors") - - # Validate n_neighbors - if n_neighbors is not None: - self._validate_n_neighbors(n_neighbors) return dispatch( self, @@ -133,51 +124,20 @@ def radius_neighbors( print("DEBUG radius_neighbors: PREVENTIVE FIX - _fit_X is tuple, permanently extracting first element", file=sys.stderr) self._fit_X = self._fit_X[0] # Fix the attribute permanently - # Preprocessing for X parameter (same as kneighbors) - if X is not None: - check_feature_names(self, X, reset=False) - # Perform preprocessing at sklearnex level - from onedal.utils.validation import _check_array - - X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) - self._validate_feature_count(X, "radius_neighbors") - - # Original OneDAL refactoring condition with proper validation + # Original main branch logic - simple conditional fit if ( hasattr(self, "_onedal_estimator") or getattr(self, "_tree", 0) is None - and getattr(self, "_fit_method", None) == "kd_tree" + and self._fit_method == "kd_tree" ): - print("DEBUG: 
Condition met - calling sklearn fit for preprocessing", file=sys.stderr) - - # Use sklearnex-level validation instead of raw OneDAL data - # This ensures we have proper arrays, not tuples from OneDAL processing - fit_x_for_sklearn = getattr(self, "_fit_X", None) - fit_y_for_sklearn = getattr(self, "_y", None) - - # Apply sklearnex-level validation to ensure proper data format - if fit_x_for_sklearn is not None: - # Use the refactored _validate_data method from KNeighborsDispatchingBase - fit_x_for_sklearn, _ = self._validate_data( - fit_x_for_sklearn, dtype=[np.float64, np.float32], accept_sparse=True - ) - - # CRITICAL FIX: Ensure _fit_X is properly extracted from tuple if needed - if isinstance(fit_x_for_sklearn, tuple): - print("DEBUG radius_neighbors: fit_x_for_sklearn is tuple, extracting first element", file=sys.stderr) - fit_x_for_sklearn = fit_x_for_sklearn[0] - - # Update the main attribute to ensure consistency - self._fit_X = fit_x_for_sklearn + print("DEBUG: Original condition met - calling sklearn fit", file=sys.stderr) + print(f" self._fit_X type before fit: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print(f" self._y type before fit: {type(getattr(self, '_y', 'NOT_SET'))}", file=sys.stderr) - print(f"DEBUG: Calling _sklearn_NearestNeighbors.fit with validated data", file=sys.stderr) - print(f" fit_x_for_sklearn type: {type(fit_x_for_sklearn)}", file=sys.stderr) - print(f" fit_y_for_sklearn type: {type(fit_y_for_sklearn)}", file=sys.stderr) - - _sklearn_NearestNeighbors.fit(self, fit_x_for_sklearn, fit_y_for_sklearn) + _sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, "_y", None)) print("DEBUG: sklearn fit completed", file=sys.stderr) else: - print("DEBUG: Condition NOT met - skipping sklearn fit", file=sys.stderr) + print("DEBUG: Original condition NOT met - skipping sklearn fit", file=sys.stderr) check_is_fitted(self) @@ -239,21 +199,6 @@ def _onedal_fit(self, X, y=None, queue=None): print(f" y type: {type(y)}, y shape: {getattr(y, 'shape', 'NO_SHAPE')}", file=sys.stderr) print(f" queue: {queue}", file=sys.stderr) - # Perform preprocessing at sklearnex level - X, _ = self._validate_data(X, dtype=[np.float64, np.float32], accept_sparse=True) - - print(f"DEBUG _onedal_fit AFTER _validate_data:", file=sys.stderr) - print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - - # Validate n_neighbors - self._validate_n_neighbors(self.n_neighbors) - - # Parse auto method - self._fit_method = self._parse_auto_method(self.algorithm, X.shape[0], X.shape[1]) - - # Set basic attributes for unsupervised - self.classes_ = None - onedal_params = { "n_neighbors": self.n_neighbors, "algorithm": self.algorithm, @@ -265,10 +210,6 @@ def _onedal_fit(self, X, y=None, queue=None): self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ - self._onedal_estimator._fit_method = self._fit_method - - # Set attributes on the onedal estimator - self._onedal_estimator.classes_ = self.classes_ print(f"DEBUG _onedal_fit BEFORE calling onedal_estimator.fit:", file=sys.stderr) print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) From 19fe8ce8a5e927d821e4fac2abe9fa8dae4446a2 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 14 Oct 2025 10:11:57 -0700 Subject: [PATCH 26/87] fix: restore again --- onedal/neighbors/neighbors.py | 71 +++++++++++++- 
sklearnex/neighbors/knn_unsupervised.py | 125 +++++++++++++----------- 2 files changed, 137 insertions(+), 59 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index e43d4b7339..217cf9dbd8 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -443,18 +443,45 @@ def fit(self, X, y, queue=None): @supports_queue def predict(self, X, queue=None): + print(f"DEBUG KNeighborsClassifier.predict START:", file=sys.stderr) + print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + + use_raw_input = _get_config().get("use_raw_input", False) is True + if not use_raw_input: + X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) onedal_model = getattr(self, "_onedal_model", None) + n_features = getattr(self, "n_features_in_", None) + n_samples_fit_ = getattr(self, "n_samples_fit_", None) + shape = getattr(X, "shape", None) + if n_features and shape and len(shape) > 1 and shape[1] != n_features: + raise ValueError( + ( + f"X has {X.shape[1]} features, " + f"but KNNClassifier is expecting " + f"{n_features} features as input" + ) + ) + _check_is_fitted(self) + self._fit_method = self._parse_auto_method( + self.algorithm, n_samples_fit_, n_features + ) + + self._validate_n_classes() + + print(f"DEBUG KNeighborsClassifier.predict BEFORE _get_onedal_params:", file=sys.stderr) params = self._get_onedal_params(X) prediction_result = self._onedal_predict(onedal_model, X, params) responses = from_table(prediction_result.responses) result = self.classes_.take(np.asarray(responses.ravel(), dtype=np.intp)) + print(f"DEBUG KNeighborsClassifier.predict END - result type: {type(result)}", file=sys.stderr) return result @supports_queue def predict_proba(self, X, queue=None): + print(f"DEBUG KNeighborsClassifier.predict_proba START:", file=sys.stderr) neigh_dist, neigh_ind = self.kneighbors(X, queue=queue) classes_ = self.classes_ @@ -465,6 +492,7 @@ def predict_proba(self, X, queue=None): n_queries = _num_samples(X) + print(f"DEBUG KNeighborsClassifier.predict_proba - using uniform weights (original main branch logic)", file=sys.stderr) # Use uniform weights for now - weights calculation should be done at sklearnex level weights = np.ones_like(neigh_ind) @@ -567,20 +595,48 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None) return self._kneighbors(X, n_neighbors, return_distance) def _predict_gpu(self, X): + print(f"DEBUG KNeighborsRegressor._predict_gpu START:", file=sys.stderr) + print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + + use_raw_input = _get_config().get("use_raw_input", False) is True + if not use_raw_input: + X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) onedal_model = getattr(self, "_onedal_model", None) + n_features = getattr(self, "n_features_in_", None) + n_samples_fit_ = getattr(self, "n_samples_fit_", None) + shape = getattr(X, "shape", None) + if n_features and shape and len(shape) > 1 and shape[1] != n_features: + raise ValueError( + ( + f"X has {X.shape[1]} features, " + f"but KNNClassifier is expecting " + f"{n_features} features as input" + ) + ) + _check_is_fitted(self) + self._fit_method = self._parse_auto_method( + self.algorithm, n_samples_fit_, n_features + ) + + print(f"DEBUG KNeighborsRegressor._predict_gpu BEFORE _get_onedal_params:", file=sys.stderr) params = self._get_onedal_params(X) prediction_result = self._onedal_predict(onedal_model, X, params) responses = 
from_table(prediction_result.responses) result = responses.ravel() + print(f"DEBUG KNeighborsRegressor._predict_gpu END - result type: {type(result)}", file=sys.stderr) return result def _predict_skl(self, X): + print(f"DEBUG KNeighborsRegressor._predict_skl START:", file=sys.stderr) + print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + neigh_dist, neigh_ind = self.kneighbors(X) + print(f"DEBUG KNeighborsRegressor._predict_skl - using uniform weights (original main branch logic)", file=sys.stderr) # Use uniform weights for now - weights calculation should be done at sklearnex level weights = None @@ -601,16 +657,27 @@ def _predict_skl(self, X): if self._y.ndim == 1: y_pred = y_pred.ravel() + print(f"DEBUG KNeighborsRegressor._predict_skl END - y_pred type: {type(y_pred)}", file=sys.stderr) return y_pred @supports_queue def predict(self, X, queue=None): + print(f"DEBUG KNeighborsRegressor.predict START:", file=sys.stderr) + print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f" queue: {queue}", file=sys.stderr) + gpu_device = queue is not None and getattr(queue.sycl_device, "is_gpu", False) is_uniform_weights = getattr(self, "weights", "uniform") == "uniform" + + print(f"DEBUG KNeighborsRegressor.predict - gpu_device: {gpu_device}, is_uniform_weights: {is_uniform_weights}", file=sys.stderr) + if gpu_device and is_uniform_weights: - return self._predict_gpu(X) + result = self._predict_gpu(X) else: - return self._predict_skl(X) + result = self._predict_skl(X) + + print(f"DEBUG KNeighborsRegressor.predict END - result type: {type(result)}", file=sys.stderr) + return result class NearestNeighbors(NeighborsBase): diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 2195254a69..bf9a8e3310 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -15,7 +15,6 @@ # =============================================================================== import sys -import numpy as np from sklearn.neighbors._unsupervised import NearestNeighbors as _sklearn_NearestNeighbors from sklearn.utils.validation import _deprecate_positional_args, check_is_fitted @@ -61,7 +60,7 @@ def __init__( ) def fit(self, X, y=None): - print(f"DEBUG fit START - ENTRY PARAMETERS:", file=sys.stderr) + print(f"DEBUG NearestNeighbors.fit START:", file=sys.stderr) print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) print(f" y type: {type(y)}, y shape: {getattr(y, 'shape', 'NO_SHAPE')}", file=sys.stderr) @@ -76,25 +75,27 @@ def fit(self, X, y=None): None, ) - print(f"DEBUG fit AFTER dispatch:", file=sys.stderr) + print(f"DEBUG NearestNeighbors.fit AFTER dispatch:", file=sys.stderr) print(f" self._fit_X type: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) print(f" self._fit_X shape: {getattr(getattr(self, '_fit_X', None), 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f" hasattr _onedal_estimator: {hasattr(self, '_onedal_estimator')}", file=sys.stderr) + print(f" _tree: {getattr(self, '_tree', 'NOT_SET')}", file=sys.stderr) + print(f" _fit_method: {getattr(self, '_fit_method', 'NOT_SET')}", file=sys.stderr) return self @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): - check_is_fitted(self) + print(f"DEBUG NearestNeighbors.kneighbors START:", file=sys.stderr) + print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f" n_neighbors: 
{n_neighbors}, return_distance: {return_distance}", file=sys.stderr) + print(f" self._fit_X type: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) - # CRITICAL FIRST: Ensure _fit_X is always an array before any sklearn operations - if hasattr(self, '_fit_X') and isinstance(self._fit_X, tuple): - print("DEBUG kneighbors: PREVENTIVE FIX - _fit_X is tuple, permanently extracting first element", file=sys.stderr) - self._fit_X = self._fit_X[0] # Fix the attribute permanently - + check_is_fitted(self) if X is not None: check_feature_names(self, X, reset=False) - return dispatch( + result = dispatch( self, "kneighbors", { @@ -105,12 +106,15 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors=n_neighbors, return_distance=return_distance, ) + + print(f"DEBUG NearestNeighbors.kneighbors END - result type: {type(result)}", file=sys.stderr) + return result @wrap_output_data def radius_neighbors( self, X=None, radius=None, return_distance=True, sort_results=False ): - print(f"DEBUG radius_neighbors START - ENTRY PARAMETERS:", file=sys.stderr) + print(f"DEBUG NearestNeighbors.radius_neighbors START:", file=sys.stderr) print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) print(f" radius: {radius}, return_distance: {return_distance}, sort_results: {sort_results}", file=sys.stderr) print(f" self._fit_X type: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) @@ -119,35 +123,31 @@ def radius_neighbors( print(f" _tree: {getattr(self, '_tree', 'NOT_SET')}", file=sys.stderr) print(f" _fit_method: {getattr(self, '_fit_method', 'NOT_SET')}", file=sys.stderr) - # CRITICAL FIRST: Ensure _fit_X is always an array before any sklearn operations - if hasattr(self, '_fit_X') and isinstance(self._fit_X, tuple): - print("DEBUG radius_neighbors: PREVENTIVE FIX - _fit_X is tuple, permanently extracting first element", file=sys.stderr) - self._fit_X = self._fit_X[0] # Fix the attribute permanently - - # Original main branch logic - simple conditional fit + # ORIGINAL MAIN BRANCH LOGIC - EXACTLY AS IT WAS if ( hasattr(self, "_onedal_estimator") or getattr(self, "_tree", 0) is None and self._fit_method == "kd_tree" ): - print("DEBUG: Original condition met - calling sklearn fit", file=sys.stderr) - print(f" self._fit_X type before fit: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) - print(f" self._y type before fit: {type(getattr(self, '_y', 'NOT_SET'))}", file=sys.stderr) + print("DEBUG NearestNeighbors.radius_neighbors - Condition met, calling sklearn fit", file=sys.stderr) + print(f" About to call _sklearn_NearestNeighbors.fit with:", file=sys.stderr) + print(f" self type: {type(self)}", file=sys.stderr) + print(f" self._fit_X type: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print(f" self._y type: {type(getattr(self, '_y', 'NOT_SET'))}", file=sys.stderr) _sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, "_y", None)) - print("DEBUG: sklearn fit completed", file=sys.stderr) + + print("DEBUG NearestNeighbors.radius_neighbors - sklearn fit completed", file=sys.stderr) else: - print("DEBUG: Original condition NOT met - skipping sklearn fit", file=sys.stderr) + print("DEBUG NearestNeighbors.radius_neighbors - Condition NOT met, skipping sklearn fit", file=sys.stderr) check_is_fitted(self) - print(f"DEBUG radius_neighbors BEFORE DISPATCH:", file=sys.stderr) + print(f"DEBUG NearestNeighbors.radius_neighbors BEFORE DISPATCH:", file=sys.stderr) print(f" self._fit_X type: {type(getattr(self, 
'_fit_X', 'NOT_SET'))}", file=sys.stderr) print(f" self._fit_X shape: {getattr(getattr(self, '_fit_X', None), 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f" radius: {radius}, return_distance: {return_distance}, sort_results: {sort_results}", file=sys.stderr) - return dispatch( + result = dispatch( self, "radius_neighbors", { @@ -159,28 +159,18 @@ def radius_neighbors( return_distance=return_distance, sort_results=sort_results, ) + + print(f"DEBUG NearestNeighbors.radius_neighbors END - result type: {type(result)}", file=sys.stderr) + return result def radius_neighbors_graph( self, X=None, radius=None, mode="connectivity", sort_results=False ): - print(f"DEBUG radius_neighbors_graph START - ENTRY PARAMETERS:", file=sys.stderr) + print(f"DEBUG NearestNeighbors.radius_neighbors_graph START:", file=sys.stderr) print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) print(f" radius: {radius}, mode: {mode}, sort_results: {sort_results}", file=sys.stderr) - print(f" self._fit_X type: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) - print(f" self._fit_X shape: {getattr(getattr(self, '_fit_X', None), 'shape', 'NO_SHAPE')}", file=sys.stderr) - # Handle potential tuple in _fit_X before calling dispatch - if hasattr(self, '_fit_X') and isinstance(self._fit_X, tuple): - print("DEBUG radius_neighbors_graph: _fit_X is tuple, permanently extracting first element", file=sys.stderr) - self._fit_X = self._fit_X[0] # Fix the attribute permanently - - print(f"DEBUG radius_neighbors_graph BEFORE DISPATCH:", file=sys.stderr) - print(f" self._fit_X type: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) - print(f" self._fit_X shape: {getattr(getattr(self, '_fit_X', None), 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f" radius: {radius}, mode: {mode}, sort_results: {sort_results}", file=sys.stderr) - - return dispatch( + result = dispatch( self, "radius_neighbors_graph", { @@ -192,9 +182,12 @@ def radius_neighbors_graph( mode=mode, sort_results=sort_results, ) + + print(f"DEBUG NearestNeighbors.radius_neighbors_graph END - result type: {type(result)}", file=sys.stderr) + return result def _onedal_fit(self, X, y=None, queue=None): - print(f"DEBUG _onedal_fit START - ENTRY PARAMETERS:", file=sys.stderr) + print(f"DEBUG NearestNeighbors._onedal_fit START:", file=sys.stderr) print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) print(f" y type: {type(y)}, y shape: {getattr(y, 'shape', 'NO_SHAPE')}", file=sys.stderr) print(f" queue: {queue}", file=sys.stderr) @@ -206,35 +199,51 @@ def _onedal_fit(self, X, y=None, queue=None): "p": self.effective_metric_params_["p"], } + print(f"DEBUG NearestNeighbors._onedal_fit - Creating onedal_NearestNeighbors with params: {onedal_params}", file=sys.stderr) + self._onedal_estimator = onedal_NearestNeighbors(**onedal_params) self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ - print(f"DEBUG _onedal_fit BEFORE calling onedal_estimator.fit:", file=sys.stderr) + print(f"DEBUG NearestNeighbors._onedal_fit BEFORE calling onedal_estimator.fit:", file=sys.stderr) print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) 
print(f" y type: {type(y)}, y shape: {getattr(y, 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f" queue: {queue}", file=sys.stderr) self._onedal_estimator.fit(X, y, queue=queue) - print(f"DEBUG _onedal_fit AFTER calling onedal_estimator.fit:", file=sys.stderr) + print(f"DEBUG NearestNeighbors._onedal_fit AFTER calling onedal_estimator.fit:", file=sys.stderr) print(f" onedal_estimator._fit_X type: {type(getattr(self._onedal_estimator, '_fit_X', 'NOT_SET'))}", file=sys.stderr) self._save_attributes() + + print(f"DEBUG NearestNeighbors._onedal_fit END - _save_attributes completed", file=sys.stderr) def _onedal_predict(self, X, queue=None): - return self._onedal_estimator.predict(X, queue=queue) + print(f"DEBUG NearestNeighbors._onedal_predict START:", file=sys.stderr) + print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + + result = self._onedal_estimator.predict(X, queue=queue) + + print(f"DEBUG NearestNeighbors._onedal_predict END - result type: {type(result)}", file=sys.stderr) + return result def _onedal_kneighbors( self, X=None, n_neighbors=None, return_distance=True, queue=None ): - return self._onedal_estimator.kneighbors( + print(f"DEBUG NearestNeighbors._onedal_kneighbors START:", file=sys.stderr) + print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f" n_neighbors: {n_neighbors}, return_distance: {return_distance}", file=sys.stderr) + + result = self._onedal_estimator.kneighbors( X, n_neighbors, return_distance, queue=queue ) + + print(f"DEBUG NearestNeighbors._onedal_kneighbors END - result type: {type(result)}", file=sys.stderr) + return result def _save_attributes(self): - print(f"DEBUG _save_attributes START:", file=sys.stderr) + print(f"DEBUG NearestNeighbors._save_attributes START:", file=sys.stderr) print(f" onedal_estimator._fit_X type: {type(getattr(self._onedal_estimator, '_fit_X', 'NOT_SET'))}", file=sys.stderr) if hasattr(self._onedal_estimator, '_fit_X'): fit_x_preview = str(self._onedal_estimator._fit_X)[:200] @@ -243,27 +252,29 @@ def _save_attributes(self): self.classes_ = self._onedal_estimator.classes_ self.n_features_in_ = self._onedal_estimator.n_features_in_ self.n_samples_fit_ = self._onedal_estimator.n_samples_fit_ - fit_x = self._onedal_estimator._fit_X - print(f"DEBUG _save_attributes processing _fit_X:", file=sys.stderr) + # CRITICAL: Check if _fit_X is tuple and handle it + fit_x = self._onedal_estimator._fit_X + print(f"DEBUG NearestNeighbors._save_attributes processing _fit_X:", file=sys.stderr) print(f" fit_x type: {type(fit_x)}", file=sys.stderr) print(f" isinstance(fit_x, tuple): {isinstance(fit_x, tuple)}", file=sys.stderr) - # CRITICAL FIX: OneDAL's to_table() can return tuples (array, None) in recursive calls - # We must extract the actual array for sklearn compatibility if isinstance(fit_x, tuple): - print(f"DEBUG _save_attributes: fit_x is tuple, extracting array from: {fit_x}", file=sys.stderr) + print(f"DEBUG NearestNeighbors._save_attributes - fit_x is tuple: {fit_x}", file=sys.stderr) + print(f" Extracting first element: {type(fit_x[0]) if len(fit_x) > 0 else 'EMPTY'}", file=sys.stderr) self._fit_X = fit_x[0] # Extract the array from (array, None) tuple else: self._fit_X = fit_x - print(f"DEBUG _save_attributes AFTER processing:", file=sys.stderr) - print(f" self._fit_X type: {type(self._fit_X)}", file=sys.stderr) - print(f" self._fit_X shape: {getattr(self._fit_X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - self._fit_method = 
self._onedal_estimator._fit_method self._tree = self._onedal_estimator._tree + print(f"DEBUG NearestNeighbors._save_attributes END:", file=sys.stderr) + print(f" self._fit_X type: {type(self._fit_X)}", file=sys.stderr) + print(f" self._fit_X shape: {getattr(self._fit_X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f" self._fit_method: {self._fit_method}", file=sys.stderr) + print(f" self._tree: {self._tree}", file=sys.stderr) + fit.__doc__ = _sklearn_NearestNeighbors.__doc__ kneighbors.__doc__ = _sklearn_NearestNeighbors.kneighbors.__doc__ radius_neighbors.__doc__ = _sklearn_NearestNeighbors.radius_neighbors.__doc__ From 0f37c1b79782029293e5b610c95fbbc956d2591e Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 14 Oct 2025 11:11:50 -0700 Subject: [PATCH 27/87] fix: restpore --- onedal/neighbors/neighbors.py | 62 ++++------- sklearnex/neighbors/knn_unsupervised.py | 136 +++++------------------- 2 files changed, 45 insertions(+), 153 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index 217cf9dbd8..0785a4b754 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -193,10 +193,7 @@ def _validate_n_classes(self): ) def _fit(self, X, y): - print(f"DEBUG oneDAL _fit START - ENTRY PARAMETERS:", file=sys.stderr) - print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f" y type: {type(y)}, y shape: {getattr(y, 'shape', 'NO_SHAPE')}", file=sys.stderr) - + print(f"DEBUG oneDAL _fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", file=sys.stderr) self._onedal_model = None self._tree = None self._shape = None @@ -261,16 +258,11 @@ def _fit(self, X, y): queue = QM.get_global_queue() gpu_device = queue is not None and queue.sycl_device.is_gpu - print(f"DEBUG oneDAL _fit BEFORE calling _onedal_fit:", file=sys.stderr) - print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f" _fit_y type: {type(_fit_y)}, _fit_y shape: {getattr(_fit_y, 'shape', 'NO_SHAPE')}", file=sys.stderr) - + print(f"DEBUG oneDAL _fit: Before _onedal_fit, X type={type(X)}, _fit_y type={type(_fit_y)}", file=sys.stderr) if _is_classifier(self) or (_is_regressor(self) and gpu_device): _fit_y = self._validate_targets(self._y, X.dtype).reshape((-1, 1)) result = self._onedal_fit(X, _fit_y) - - print(f"DEBUG oneDAL _fit AFTER _onedal_fit:", file=sys.stderr) - print(f" self._fit_X type: {type(self._fit_X)}, shape: {getattr(self._fit_X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f"DEBUG oneDAL _fit: After _onedal_fit, self._fit_X type={type(self._fit_X)}, shape={getattr(self._fit_X, 'shape', 'NO_SHAPE')}", file=sys.stderr) if y is not None and _is_regressor(self): self._y = y if self._shape is None else xp.reshape(y, self._shape) @@ -443,9 +435,7 @@ def fit(self, X, y, queue=None): @supports_queue def predict(self, X, queue=None): - print(f"DEBUG KNeighborsClassifier.predict START:", file=sys.stderr) - print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - + print(f"DEBUG KNeighborsClassifier.predict START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) use_raw_input = _get_config().get("use_raw_input", False) is True if not use_raw_input: X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) @@ -470,18 +460,17 @@ def predict(self, X, queue=None): self._validate_n_classes() - print(f"DEBUG KNeighborsClassifier.predict BEFORE _get_onedal_params:", 
file=sys.stderr) params = self._get_onedal_params(X) prediction_result = self._onedal_predict(onedal_model, X, params) responses = from_table(prediction_result.responses) result = self.classes_.take(np.asarray(responses.ravel(), dtype=np.intp)) - print(f"DEBUG KNeighborsClassifier.predict END - result type: {type(result)}", file=sys.stderr) + print(f"DEBUG KNeighborsClassifier.predict END: result type={type(result)}", file=sys.stderr) return result @supports_queue def predict_proba(self, X, queue=None): - print(f"DEBUG KNeighborsClassifier.predict_proba START:", file=sys.stderr) + print(f"DEBUG KNeighborsClassifier.predict_proba START: X type={type(X)}", file=sys.stderr) neigh_dist, neigh_ind = self.kneighbors(X, queue=queue) classes_ = self.classes_ @@ -492,9 +481,13 @@ def predict_proba(self, X, queue=None): n_queries = _num_samples(X) - print(f"DEBUG KNeighborsClassifier.predict_proba - using uniform weights (original main branch logic)", file=sys.stderr) - # Use uniform weights for now - weights calculation should be done at sklearnex level - weights = np.ones_like(neigh_ind) + print(f"DEBUG predict_proba: Calling _get_weights", file=sys.stderr) + weights = self._get_weights(neigh_dist, self.weights) + if weights is None: + print(f"DEBUG predict_proba: weights is None, using ones_like", file=sys.stderr) + weights = np.ones_like(neigh_ind) + else: + print(f"DEBUG predict_proba: weights calculated, type={type(weights)}", file=sys.stderr) all_rows = np.arange(n_queries) probabilities = [] @@ -595,9 +588,6 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None) return self._kneighbors(X, n_neighbors, return_distance) def _predict_gpu(self, X): - print(f"DEBUG KNeighborsRegressor._predict_gpu START:", file=sys.stderr) - print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - use_raw_input = _get_config().get("use_raw_input", False) is True if not use_raw_input: X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) @@ -620,25 +610,21 @@ def _predict_gpu(self, X): self.algorithm, n_samples_fit_, n_features ) - print(f"DEBUG KNeighborsRegressor._predict_gpu BEFORE _get_onedal_params:", file=sys.stderr) params = self._get_onedal_params(X) prediction_result = self._onedal_predict(onedal_model, X, params) responses = from_table(prediction_result.responses) result = responses.ravel() - print(f"DEBUG KNeighborsRegressor._predict_gpu END - result type: {type(result)}", file=sys.stderr) return result def _predict_skl(self, X): - print(f"DEBUG KNeighborsRegressor._predict_skl START:", file=sys.stderr) - print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - + print(f"DEBUG KNeighborsRegressor._predict_skl START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) neigh_dist, neigh_ind = self.kneighbors(X) - print(f"DEBUG KNeighborsRegressor._predict_skl - using uniform weights (original main branch logic)", file=sys.stderr) - # Use uniform weights for now - weights calculation should be done at sklearnex level - weights = None + print(f"DEBUG _predict_skl: Calling _get_weights", file=sys.stderr) + weights = self._get_weights(neigh_dist, self.weights) + print(f"DEBUG _predict_skl: weights result={type(weights) if weights is not None else 'None'}", file=sys.stderr) _y = self._y if _y.ndim == 1: @@ -657,26 +643,20 @@ def _predict_skl(self, X): if self._y.ndim == 1: y_pred = y_pred.ravel() - print(f"DEBUG KNeighborsRegressor._predict_skl END - y_pred type: 
{type(y_pred)}", file=sys.stderr) + print(f"DEBUG KNeighborsRegressor._predict_skl END: y_pred type={type(y_pred)}", file=sys.stderr) return y_pred @supports_queue def predict(self, X, queue=None): - print(f"DEBUG KNeighborsRegressor.predict START:", file=sys.stderr) - print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f" queue: {queue}", file=sys.stderr) - + print(f"DEBUG KNeighborsRegressor.predict START: X type={type(X)}, queue={queue}", file=sys.stderr) gpu_device = queue is not None and getattr(queue.sycl_device, "is_gpu", False) is_uniform_weights = getattr(self, "weights", "uniform") == "uniform" - - print(f"DEBUG KNeighborsRegressor.predict - gpu_device: {gpu_device}, is_uniform_weights: {is_uniform_weights}", file=sys.stderr) - + print(f"DEBUG KNeighborsRegressor.predict: gpu_device={gpu_device}, is_uniform_weights={is_uniform_weights}", file=sys.stderr) if gpu_device and is_uniform_weights: result = self._predict_gpu(X) else: result = self._predict_skl(X) - - print(f"DEBUG KNeighborsRegressor.predict END - result type: {type(result)}", file=sys.stderr) + print(f"DEBUG KNeighborsRegressor.predict END: result type={type(result)}", file=sys.stderr) return result diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index bf9a8e3310..556847fc6e 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -60,10 +60,7 @@ def __init__( ) def fit(self, X, y=None): - print(f"DEBUG NearestNeighbors.fit START:", file=sys.stderr) - print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f" y type: {type(y)}, y shape: {getattr(y, 'shape', 'NO_SHAPE')}", file=sys.stderr) - + print(f"DEBUG NearestNeighbors.fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", file=sys.stderr) dispatch( self, "fit", @@ -74,27 +71,15 @@ def fit(self, X, y=None): X, None, ) - - print(f"DEBUG NearestNeighbors.fit AFTER dispatch:", file=sys.stderr) - print(f" self._fit_X type: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) - print(f" self._fit_X shape: {getattr(getattr(self, '_fit_X', None), 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f" hasattr _onedal_estimator: {hasattr(self, '_onedal_estimator')}", file=sys.stderr) - print(f" _tree: {getattr(self, '_tree', 'NOT_SET')}", file=sys.stderr) - print(f" _fit_method: {getattr(self, '_fit_method', 'NOT_SET')}", file=sys.stderr) - + print(f"DEBUG NearestNeighbors.fit END: _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}, _fit_X shape={getattr(getattr(self, '_fit_X', None), 'shape', 'NO_SHAPE')}", file=sys.stderr) return self @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): - print(f"DEBUG NearestNeighbors.kneighbors START:", file=sys.stderr) - print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f" n_neighbors: {n_neighbors}, return_distance: {return_distance}", file=sys.stderr) - print(f" self._fit_X type: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) - + print(f"DEBUG NearestNeighbors.kneighbors START: X type={type(X)}, _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) check_is_fitted(self) if X is not None: check_feature_names(self, X, reset=False) - result = dispatch( self, "kneighbors", @@ -106,47 +91,24 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors=n_neighbors, 
return_distance=return_distance, ) - - print(f"DEBUG NearestNeighbors.kneighbors END - result type: {type(result)}", file=sys.stderr) + print(f"DEBUG NearestNeighbors.kneighbors END: result type={type(result)}", file=sys.stderr) return result @wrap_output_data def radius_neighbors( self, X=None, radius=None, return_distance=True, sort_results=False ): - print(f"DEBUG NearestNeighbors.radius_neighbors START:", file=sys.stderr) - print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f" radius: {radius}, return_distance: {return_distance}, sort_results: {sort_results}", file=sys.stderr) - print(f" self._fit_X type: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) - print(f" self._fit_X shape: {getattr(getattr(self, '_fit_X', None), 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f" hasattr _onedal_estimator: {hasattr(self, '_onedal_estimator')}", file=sys.stderr) - print(f" _tree: {getattr(self, '_tree', 'NOT_SET')}", file=sys.stderr) - print(f" _fit_method: {getattr(self, '_fit_method', 'NOT_SET')}", file=sys.stderr) - - # ORIGINAL MAIN BRANCH LOGIC - EXACTLY AS IT WAS + print(f"DEBUG NearestNeighbors.radius_neighbors START: X type={type(X)}, _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}, _fit_X shape={getattr(getattr(self, '_fit_X', None), 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f"DEBUG radius_neighbors: hasattr _onedal_estimator={hasattr(self, '_onedal_estimator')}, _tree={getattr(self, '_tree', 'NOT_SET')}, _fit_method={getattr(self, '_fit_method', 'NOT_SET')}", file=sys.stderr) if ( hasattr(self, "_onedal_estimator") or getattr(self, "_tree", 0) is None and self._fit_method == "kd_tree" ): - print("DEBUG NearestNeighbors.radius_neighbors - Condition met, calling sklearn fit", file=sys.stderr) - print(f" About to call _sklearn_NearestNeighbors.fit with:", file=sys.stderr) - print(f" self type: {type(self)}", file=sys.stderr) - print(f" self._fit_X type: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) - print(f" self._y type: {type(getattr(self, '_y', 'NOT_SET'))}", file=sys.stderr) - + print(f"DEBUG radius_neighbors: Calling sklearn fit with _fit_X type={type(self._fit_X)}", file=sys.stderr) _sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, "_y", None)) - - print("DEBUG NearestNeighbors.radius_neighbors - sklearn fit completed", file=sys.stderr) - else: - print("DEBUG NearestNeighbors.radius_neighbors - Condition NOT met, skipping sklearn fit", file=sys.stderr) - + print(f"DEBUG radius_neighbors: sklearn fit completed, _fit_X type now={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) check_is_fitted(self) - - print(f"DEBUG NearestNeighbors.radius_neighbors BEFORE DISPATCH:", file=sys.stderr) - print(f" self._fit_X type: {type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) - print(f" self._fit_X shape: {getattr(getattr(self, '_fit_X', None), 'shape', 'NO_SHAPE')}", file=sys.stderr) - result = dispatch( self, "radius_neighbors", @@ -159,18 +121,13 @@ def radius_neighbors( return_distance=return_distance, sort_results=sort_results, ) - - print(f"DEBUG NearestNeighbors.radius_neighbors END - result type: {type(result)}", file=sys.stderr) + print(f"DEBUG NearestNeighbors.radius_neighbors END: result type={type(result)}", file=sys.stderr) return result def radius_neighbors_graph( self, X=None, radius=None, mode="connectivity", sort_results=False ): - print(f"DEBUG NearestNeighbors.radius_neighbors_graph START:", file=sys.stderr) - print(f" X type: {type(X)}, X shape: 
{getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f" radius: {radius}, mode: {mode}, sort_results: {sort_results}", file=sys.stderr) - - result = dispatch( + return dispatch( self, "radius_neighbors_graph", { @@ -182,16 +139,9 @@ def radius_neighbors_graph( mode=mode, sort_results=sort_results, ) - - print(f"DEBUG NearestNeighbors.radius_neighbors_graph END - result type: {type(result)}", file=sys.stderr) - return result def _onedal_fit(self, X, y=None, queue=None): - print(f"DEBUG NearestNeighbors._onedal_fit START:", file=sys.stderr) - print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f" y type: {type(y)}, y shape: {getattr(y, 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f" queue: {queue}", file=sys.stderr) - + print(f"DEBUG NearestNeighbors._onedal_fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", file=sys.stderr) onedal_params = { "n_neighbors": self.n_neighbors, "algorithm": self.algorithm, @@ -199,81 +149,43 @@ def _onedal_fit(self, X, y=None, queue=None): "p": self.effective_metric_params_["p"], } - print(f"DEBUG NearestNeighbors._onedal_fit - Creating onedal_NearestNeighbors with params: {onedal_params}", file=sys.stderr) - self._onedal_estimator = onedal_NearestNeighbors(**onedal_params) self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ - - print(f"DEBUG NearestNeighbors._onedal_fit BEFORE calling onedal_estimator.fit:", file=sys.stderr) - print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f" y type: {type(y)}, y shape: {getattr(y, 'shape', 'NO_SHAPE')}", file=sys.stderr) - + print(f"DEBUG NearestNeighbors._onedal_fit: Calling onedal_estimator.fit", file=sys.stderr) self._onedal_estimator.fit(X, y, queue=queue) - - print(f"DEBUG NearestNeighbors._onedal_fit AFTER calling onedal_estimator.fit:", file=sys.stderr) - print(f" onedal_estimator._fit_X type: {type(getattr(self._onedal_estimator, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print(f"DEBUG NearestNeighbors._onedal_fit: After fit, onedal_estimator._fit_X type={type(getattr(self._onedal_estimator, '_fit_X', 'NOT_SET'))}", file=sys.stderr) self._save_attributes() - - print(f"DEBUG NearestNeighbors._onedal_fit END - _save_attributes completed", file=sys.stderr) + print(f"DEBUG NearestNeighbors._onedal_fit END: self._fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) def _onedal_predict(self, X, queue=None): - print(f"DEBUG NearestNeighbors._onedal_predict START:", file=sys.stderr) - print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - - result = self._onedal_estimator.predict(X, queue=queue) - - print(f"DEBUG NearestNeighbors._onedal_predict END - result type: {type(result)}", file=sys.stderr) - return result + return self._onedal_estimator.predict(X, queue=queue) def _onedal_kneighbors( self, X=None, n_neighbors=None, return_distance=True, queue=None ): - print(f"DEBUG NearestNeighbors._onedal_kneighbors START:", file=sys.stderr) - print(f" X type: {type(X)}, X shape: {getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f" n_neighbors: {n_neighbors}, return_distance: {return_distance}", file=sys.stderr) - - result = self._onedal_estimator.kneighbors( + return self._onedal_estimator.kneighbors( X, n_neighbors, return_distance, queue=queue ) - - print(f"DEBUG 
NearestNeighbors._onedal_kneighbors END - result type: {type(result)}", file=sys.stderr) - return result def _save_attributes(self): - print(f"DEBUG NearestNeighbors._save_attributes START:", file=sys.stderr) - print(f" onedal_estimator._fit_X type: {type(getattr(self._onedal_estimator, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print(f"DEBUG NearestNeighbors._save_attributes START: onedal_estimator._fit_X type={type(getattr(self._onedal_estimator, '_fit_X', 'NOT_SET'))}", file=sys.stderr) if hasattr(self._onedal_estimator, '_fit_X'): fit_x_preview = str(self._onedal_estimator._fit_X)[:200] - print(f" onedal_estimator._fit_X value preview: {fit_x_preview}", file=sys.stderr) - + print(f"DEBUG _save_attributes: _fit_X value preview={fit_x_preview}", file=sys.stderr) self.classes_ = self._onedal_estimator.classes_ self.n_features_in_ = self._onedal_estimator.n_features_in_ self.n_samples_fit_ = self._onedal_estimator.n_samples_fit_ - - # CRITICAL: Check if _fit_X is tuple and handle it - fit_x = self._onedal_estimator._fit_X - print(f"DEBUG NearestNeighbors._save_attributes processing _fit_X:", file=sys.stderr) - print(f" fit_x type: {type(fit_x)}", file=sys.stderr) - print(f" isinstance(fit_x, tuple): {isinstance(fit_x, tuple)}", file=sys.stderr) - - if isinstance(fit_x, tuple): - print(f"DEBUG NearestNeighbors._save_attributes - fit_x is tuple: {fit_x}", file=sys.stderr) - print(f" Extracting first element: {type(fit_x[0]) if len(fit_x) > 0 else 'EMPTY'}", file=sys.stderr) - self._fit_X = fit_x[0] # Extract the array from (array, None) tuple - else: - self._fit_X = fit_x - + # ORIGINAL MAIN BRANCH: Direct assignment without any tuple extraction + self._fit_X = self._onedal_estimator._fit_X + print(f"DEBUG _save_attributes: AFTER assignment - self._fit_X type={type(self._fit_X)}, has shape attr={hasattr(self._fit_X, 'shape')}", file=sys.stderr) + if hasattr(self._fit_X, 'shape'): + print(f"DEBUG _save_attributes: self._fit_X.shape={self._fit_X.shape}", file=sys.stderr) self._fit_method = self._onedal_estimator._fit_method self._tree = self._onedal_estimator._tree - - print(f"DEBUG NearestNeighbors._save_attributes END:", file=sys.stderr) - print(f" self._fit_X type: {type(self._fit_X)}", file=sys.stderr) - print(f" self._fit_X shape: {getattr(self._fit_X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f" self._fit_method: {self._fit_method}", file=sys.stderr) - print(f" self._tree: {self._tree}", file=sys.stderr) + print(f"DEBUG NearestNeighbors._save_attributes END: _fit_method={self._fit_method}, _tree={self._tree}", file=sys.stderr) fit.__doc__ = _sklearn_NearestNeighbors.__doc__ kneighbors.__doc__ = _sklearn_NearestNeighbors.kneighbors.__doc__ From f984c42777a673f30c90cc5d1649fd542068968d Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 14 Oct 2025 11:58:33 -0700 Subject: [PATCH 28/87] fix: restore ad and add print --- sklearnex/neighbors/_lof.py | 38 ++++--- sklearnex/neighbors/knn_classification.py | 126 +++++++++------------- sklearnex/neighbors/knn_regression.py | 97 ++++++++--------- 3 files changed, 115 insertions(+), 146 deletions(-) diff --git a/sklearnex/neighbors/_lof.py b/sklearnex/neighbors/_lof.py index 7f5f2fe840..dd4525fb9c 100644 --- a/sklearnex/neighbors/_lof.py +++ b/sklearnex/neighbors/_lof.py @@ -112,6 +112,8 @@ def _onedal_fit(self, X, y, queue=None): return self def fit(self, X, y=None): + import sys + print(f"DEBUG LocalOutlierFactor.fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) result = dispatch( self, "fit", @@ 
-122,9 +124,12 @@ def fit(self, X, y=None): X, None, ) + print(f"DEBUG LocalOutlierFactor.fit END: result type={type(result)}", file=sys.stderr) return result def _predict(self, X=None): + import sys + print(f"DEBUG LocalOutlierFactor._predict START: X type={type(X)}", file=sys.stderr) check_is_fitted(self) if X is not None: @@ -136,6 +141,7 @@ def _predict(self, X=None): is_inlier = np.ones(self.n_samples_fit_, dtype=int) is_inlier[self.negative_outlier_factor_ < self.offset_] = -1 + print(f"DEBUG LocalOutlierFactor._predict END: is_inlier type={type(is_inlier)}", file=sys.stderr) return is_inlier # This had to be done because predict loses the queue when no @@ -146,25 +152,19 @@ def _predict(self, X=None): @wraps(_sklearn_LocalOutlierFactor.fit_predict, assigned=["__doc__"]) @wrap_output_data def fit_predict(self, X, y=None): - return self.fit(X)._predict() + import sys + print(f"DEBUG LocalOutlierFactor.fit_predict START: X type={type(X)}", file=sys.stderr) + result = self.fit(X)._predict() + print(f"DEBUG LocalOutlierFactor.fit_predict END: result type={type(result)}", file=sys.stderr) + return result def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): + import sys + print(f"DEBUG LocalOutlierFactor._kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) check_is_fitted(self) if X is not None: check_feature_names(self, X, reset=False) - # Perform preprocessing at sklearnex level - import numpy as np - - from onedal.utils.validation import _check_array - - X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) - self._validate_feature_count(X, "kneighbors") - - # Validate n_neighbors - if n_neighbors is not None: - self._validate_n_neighbors(n_neighbors) - - return dispatch( + result = dispatch( self, "kneighbors", { @@ -175,6 +175,8 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors=n_neighbors, return_distance=return_distance, ) + print(f"DEBUG LocalOutlierFactor._kneighbors END: result type={type(result)}", file=sys.stderr) + return result kneighbors = wrap_output_data(_kneighbors) @@ -182,6 +184,8 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): @wraps(_sklearn_LocalOutlierFactor.score_samples, assigned=["__doc__"]) @wrap_output_data def score_samples(self, X): + import sys + print(f"DEBUG LocalOutlierFactor.score_samples START: X type={type(X)}", file=sys.stderr) check_is_fitted(self) distances_X, neighbors_indices_X = self._kneighbors( @@ -195,7 +199,9 @@ def score_samples(self, X): lrd_ratios_array = self._lrd[neighbors_indices_X] / X_lrd[:, np.newaxis] - return -np.mean(lrd_ratios_array, axis=1) + result = -np.mean(lrd_ratios_array, axis=1) + print(f"DEBUG LocalOutlierFactor.score_samples END: result type={type(result)}", file=sys.stderr) + return result fit.__doc__ = _sklearn_LocalOutlierFactor.fit.__doc__ - kneighbors.__doc__ = _sklearn_LocalOutlierFactor.kneighbors.__doc__ + kneighbors.__doc__ = _sklearn_LocalOutlierFactor.kneighbors.__doc__ \ No newline at end of file diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index 0912c09464..f6a867e234 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -14,7 +14,6 @@ # limitations under the License. 
# =============================================================================== -import numpy as np from sklearn.metrics import accuracy_score from sklearn.neighbors._classification import ( KNeighborsClassifier as _sklearn_KNeighborsClassifier, @@ -65,6 +64,8 @@ def __init__( ) def fit(self, X, y): + import sys + print(f"DEBUG KNeighborsClassifier.fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", file=sys.stderr) dispatch( self, "fit", @@ -75,20 +76,16 @@ def fit(self, X, y): X, y, ) + print(f"DEBUG KNeighborsClassifier.fit END: _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) return self @wrap_output_data def predict(self, X): + import sys + print(f"DEBUG KNeighborsClassifier.predict START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) check_is_fitted(self) check_feature_names(self, X, reset=False) - - # Perform preprocessing at sklearnex level - from onedal.utils.validation import _check_array - - X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) - self._validate_feature_count(X, "KNNClassifier") - - return dispatch( + result = dispatch( self, "predict", { @@ -97,19 +94,16 @@ def predict(self, X): }, X, ) + print(f"DEBUG KNeighborsClassifier.predict END: result type={type(result)}", file=sys.stderr) + return result @wrap_output_data def predict_proba(self, X): + import sys + print(f"DEBUG KNeighborsClassifier.predict_proba START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) check_is_fitted(self) check_feature_names(self, X, reset=False) - - # Perform preprocessing at sklearnex level - from onedal.utils.validation import _check_array - - X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) - self._validate_feature_count(X, "predict_proba") - - return dispatch( + result = dispatch( self, "predict_proba", { @@ -118,21 +112,16 @@ def predict_proba(self, X): }, X, ) + print(f"DEBUG KNeighborsClassifier.predict_proba END: result type={type(result)}", file=sys.stderr) + return result @wrap_output_data def score(self, X, y, sample_weight=None): import sys - print("DEBUG: score called11111!", X, y, file=sys.stderr, flush=True) + print(f"DEBUG KNeighborsClassifier.score START: X type={type(X)}, y type={type(y)}", file=sys.stderr) check_is_fitted(self) check_feature_names(self, X, reset=False) - - # Perform preprocessing at sklearnex level - from onedal.utils.validation import _check_array - - X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) - self._validate_feature_count(X, "score") - - return dispatch( + result = dispatch( self, "score", { @@ -143,25 +132,17 @@ def score(self, X, y, sample_weight=None): y, sample_weight=sample_weight, ) + print(f"DEBUG KNeighborsClassifier.score END: result={result}", file=sys.stderr) + return result @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): import sys - print("DEBUG: kneighbors called11111!", X, file=sys.stderr, flush=True) + print(f"DEBUG KNeighborsClassifier.kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) check_is_fitted(self) if X is not None: check_feature_names(self, X, reset=False) - # Perform preprocessing at sklearnex level - from onedal.utils.validation import _check_array - - X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) - self._validate_feature_count(X, "kneighbors") - - # Validate n_neighbors - if n_neighbors is 
not None: - self._validate_n_neighbors(n_neighbors) - - return dispatch( + result = dispatch( self, "kneighbors", { @@ -172,30 +153,12 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors=n_neighbors, return_distance=return_distance, ) + print(f"DEBUG KNeighborsClassifier.kneighbors END: result type={type(result)}", file=sys.stderr) + return result def _onedal_fit(self, X, y, queue=None): import sys - print("DEBUG: _onedal_fit called11111!", X, y, file=sys.stderr, flush=True) - - # Perform preprocessing at sklearnex level - X, y = self._validate_data( - X, y, dtype=[np.float64, np.float32], accept_sparse="csr" - ) - - # Validate n_neighbors - self._validate_n_neighbors(self.n_neighbors) - - # Parse auto method - self._fit_method = self._parse_auto_method(self.algorithm, X.shape[0], X.shape[1]) - - # Validate classification targets - from onedal.utils.validation import _check_classification_targets - - _check_classification_targets(y) - - # Handle shape and class processing at sklearnex level - y = self._process_classification_targets(y) - + print(f"DEBUG KNeighborsClassifier._onedal_fit START: X type={type(X)}, y type={type(y)}", file=sys.stderr) onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, @@ -208,52 +171,61 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ - self._onedal_estimator._fit_method = self._fit_method - - # Set shape and class attributes on the onedal estimator - self._onedal_estimator._shape = self._shape - self._onedal_estimator.classes_ = self.classes_ - self._onedal_estimator._y = self._y - self._onedal_estimator.outputs_2d_ = self.outputs_2d_ - + print(f"DEBUG KNeighborsClassifier._onedal_fit: Calling onedal_estimator.fit", file=sys.stderr) self._onedal_estimator.fit(X, y, queue=queue) + print(f"DEBUG KNeighborsClassifier._onedal_fit: After fit, calling _save_attributes", file=sys.stderr) self._save_attributes() + print(f"DEBUG KNeighborsClassifier._onedal_fit END: self._fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) def _onedal_predict(self, X, queue=None): - return self._onedal_estimator.predict(X, queue=queue) + import sys + print(f"DEBUG KNeighborsClassifier._onedal_predict START: X type={type(X)}", file=sys.stderr) + result = self._onedal_estimator.predict(X, queue=queue) + print(f"DEBUG KNeighborsClassifier._onedal_predict END: result type={type(result)}", file=sys.stderr) + return result def _onedal_predict_proba(self, X, queue=None): - return self._onedal_estimator.predict_proba(X, queue=queue) + import sys + print(f"DEBUG KNeighborsClassifier._onedal_predict_proba START: X type={type(X)}", file=sys.stderr) + result = self._onedal_estimator.predict_proba(X, queue=queue) + print(f"DEBUG KNeighborsClassifier._onedal_predict_proba END: result type={type(result)}", file=sys.stderr) + return result def _onedal_kneighbors( self, X=None, n_neighbors=None, return_distance=True, queue=None ): - return self._onedal_estimator.kneighbors( + import sys + print(f"DEBUG KNeighborsClassifier._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) + result = self._onedal_estimator.kneighbors( X, n_neighbors, return_distance, queue=queue ) + print(f"DEBUG KNeighborsClassifier._onedal_kneighbors END: result type={type(result)}", 
file=sys.stderr) + return result def _onedal_score(self, X, y, sample_weight=None, queue=None): import sys - print("DEBUG: _onedal_score called11111!", X, y, file=sys.stderr, flush=True) - - return accuracy_score( + print(f"DEBUG KNeighborsClassifier._onedal_score START: X type={type(X)}, y type={type(y)}", file=sys.stderr) + result = accuracy_score( y, self._onedal_predict(X, queue=queue), sample_weight=sample_weight ) + print(f"DEBUG KNeighborsClassifier._onedal_score END: result={result}", file=sys.stderr) + return result def _save_attributes(self): import sys - print("DEBUG: _save_attributes called11111!", self._onedal_estimator, file=sys.stderr, flush=True) - + print(f"DEBUG KNeighborsClassifier._save_attributes START", file=sys.stderr) self.classes_ = self._onedal_estimator.classes_ self.n_features_in_ = self._onedal_estimator.n_features_in_ self.n_samples_fit_ = self._onedal_estimator.n_samples_fit_ - fit_x = self._onedal_estimator._fit_X - self._fit_X = fit_x[0] if isinstance(fit_x, tuple) else fit_x + self._fit_X = self._onedal_estimator._fit_X + print(f"DEBUG KNeighborsClassifier._save_attributes: _fit_X type={type(self._fit_X)}", file=sys.stderr) self._y = self._onedal_estimator._y + print(f"DEBUG KNeighborsClassifier._save_attributes: _y type={type(self._y)}", file=sys.stderr) self._fit_method = self._onedal_estimator._fit_method self.outputs_2d_ = self._onedal_estimator.outputs_2d_ self._tree = self._onedal_estimator._tree + print(f"DEBUG KNeighborsClassifier._save_attributes END", file=sys.stderr) fit.__doc__ = _sklearn_KNeighborsClassifier.fit.__doc__ predict.__doc__ = _sklearn_KNeighborsClassifier.predict.__doc__ diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 93884b41b5..f788ed6618 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -14,7 +14,6 @@ # limitations under the License. 
# ============================================================================== -import numpy as np from sklearn.metrics import r2_score from sklearn.neighbors._regression import ( KNeighborsRegressor as _sklearn_KNeighborsRegressor, @@ -63,6 +62,8 @@ def __init__( ) def fit(self, X, y): + import sys + print(f"DEBUG KNeighborsRegressor.fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", file=sys.stderr) dispatch( self, "fit", @@ -73,20 +74,16 @@ def fit(self, X, y): X, y, ) + print(f"DEBUG KNeighborsRegressor.fit END: _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) return self @wrap_output_data def predict(self, X): + import sys + print(f"DEBUG KNeighborsRegressor.predict START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) check_is_fitted(self) check_feature_names(self, X, reset=False) - - # Perform preprocessing at sklearnex level - from onedal.utils.validation import _check_array - - X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) - self._validate_feature_count(X, "KNNRegressor") - - return dispatch( + result = dispatch( self, "predict", { @@ -95,19 +92,16 @@ def predict(self, X): }, X, ) + print(f"DEBUG KNeighborsRegressor.predict END: result type={type(result)}", file=sys.stderr) + return result @wrap_output_data def score(self, X, y, sample_weight=None): + import sys + print(f"DEBUG KNeighborsRegressor.score START: X type={type(X)}, y type={type(y)}", file=sys.stderr) check_is_fitted(self) check_feature_names(self, X, reset=False) - - # Perform preprocessing at sklearnex level - from onedal.utils.validation import _check_array - - X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) - self._validate_feature_count(X, "score") - - return dispatch( + result = dispatch( self, "score", { @@ -118,23 +112,17 @@ def score(self, X, y, sample_weight=None): y, sample_weight=sample_weight, ) + print(f"DEBUG KNeighborsRegressor.score END: result={result}", file=sys.stderr) + return result @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): + import sys + print(f"DEBUG KNeighborsRegressor.kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) check_is_fitted(self) if X is not None: check_feature_names(self, X, reset=False) - # Perform preprocessing at sklearnex level - from onedal.utils.validation import _check_array - - X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) - self._validate_feature_count(X, "kneighbors") - - # Validate n_neighbors - if n_neighbors is not None: - self._validate_n_neighbors(n_neighbors) - - return dispatch( + result = dispatch( self, "kneighbors", { @@ -145,22 +133,12 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors=n_neighbors, return_distance=return_distance, ) + print(f"DEBUG KNeighborsRegressor.kneighbors END: result type={type(result)}", file=sys.stderr) + return result def _onedal_fit(self, X, y, queue=None): - # Perform preprocessing at sklearnex level - X, y = self._validate_data( - X, y, dtype=[np.float64, np.float32], accept_sparse="csr" - ) - - # Validate n_neighbors - self._validate_n_neighbors(self.n_neighbors) - - # Parse auto method - self._fit_method = self._parse_auto_method(self.algorithm, X.shape[0], X.shape[1]) - - # Handle shape processing at sklearnex level - y = self._process_regression_targets(y) - + import sys + print(f"DEBUG 
KNeighborsRegressor._onedal_fit START: X type={type(X)}, y type={type(y)}", file=sys.stderr) onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, @@ -173,39 +151,52 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ - self._onedal_estimator._fit_method = self._fit_method - - # Set shape attributes on the onedal estimator - self._onedal_estimator._shape = self._shape - self._onedal_estimator._y = self._y - + print(f"DEBUG KNeighborsRegressor._onedal_fit: Calling onedal_estimator.fit", file=sys.stderr) self._onedal_estimator.fit(X, y, queue=queue) + print(f"DEBUG KNeighborsRegressor._onedal_fit: After fit, calling _save_attributes", file=sys.stderr) self._save_attributes() + print(f"DEBUG KNeighborsRegressor._onedal_fit END: self._fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) def _onedal_predict(self, X, queue=None): - return self._onedal_estimator.predict(X, queue=queue) + import sys + print(f"DEBUG KNeighborsRegressor._onedal_predict START: X type={type(X)}", file=sys.stderr) + result = self._onedal_estimator.predict(X, queue=queue) + print(f"DEBUG KNeighborsRegressor._onedal_predict END: result type={type(result)}", file=sys.stderr) + return result def _onedal_kneighbors( self, X=None, n_neighbors=None, return_distance=True, queue=None ): - return self._onedal_estimator.kneighbors( + import sys + print(f"DEBUG KNeighborsRegressor._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) + result = self._onedal_estimator.kneighbors( X, n_neighbors, return_distance, queue=queue ) + print(f"DEBUG KNeighborsRegressor._onedal_kneighbors END: result type={type(result)}", file=sys.stderr) + return result def _onedal_score(self, X, y, sample_weight=None, queue=None): - return r2_score( + import sys + print(f"DEBUG KNeighborsRegressor._onedal_score START: X type={type(X)}, y type={type(y)}", file=sys.stderr) + result = r2_score( y, self._onedal_predict(X, queue=queue), sample_weight=sample_weight ) + print(f"DEBUG KNeighborsRegressor._onedal_score END: result={result}", file=sys.stderr) + return result def _save_attributes(self): + import sys + print(f"DEBUG KNeighborsRegressor._save_attributes START", file=sys.stderr) self.n_features_in_ = self._onedal_estimator.n_features_in_ self.n_samples_fit_ = self._onedal_estimator.n_samples_fit_ - fit_x = self._onedal_estimator._fit_X - self._fit_X = fit_x[0] if isinstance(fit_x, tuple) else fit_x + self._fit_X = self._onedal_estimator._fit_X + print(f"DEBUG KNeighborsRegressor._save_attributes: _fit_X type={type(self._fit_X)}", file=sys.stderr) self._y = self._onedal_estimator._y + print(f"DEBUG KNeighborsRegressor._save_attributes: _y type={type(self._y)}", file=sys.stderr) self._fit_method = self._onedal_estimator._fit_method self._tree = self._onedal_estimator._tree + print(f"DEBUG KNeighborsRegressor._save_attributes END", file=sys.stderr) fit.__doc__ = _sklearn_KNeighborsRegressor.__doc__ predict.__doc__ = _sklearn_KNeighborsRegressor.predict.__doc__ From f372bcbad14fdc6d3a086d9b0cdc6cfba5ea13f4 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 14 Oct 2025 11:59:01 -0700 Subject: [PATCH 29/87] fix: restore ad and add print --- sklearnex/neighbors/_lof.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sklearnex/neighbors/_lof.py 
b/sklearnex/neighbors/_lof.py index dd4525fb9c..e86d0f2b4f 100644 --- a/sklearnex/neighbors/_lof.py +++ b/sklearnex/neighbors/_lof.py @@ -53,9 +53,12 @@ class LocalOutlierFactor(KNeighborsDispatchingBase, _sklearn_LocalOutlierFactor) _onedal_kneighbors = NearestNeighbors._onedal_kneighbors def _onedal_fit(self, X, y, queue=None): + import sys + print(f"DEBUG LocalOutlierFactor._onedal_fit START: X type={type(X)}, y type={type(y)}", file=sys.stderr) if sklearn_check_version("1.2"): self._validate_params() + print(f"DEBUG LocalOutlierFactor._onedal_fit: Calling _onedal_knn_fit", file=sys.stderr) self._onedal_knn_fit(X, y, queue=queue) if self.contamination != "auto": @@ -75,6 +78,7 @@ def _onedal_fit(self, X, y, queue=None): ) self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1)) + print(f"DEBUG LocalOutlierFactor._onedal_fit: Calling _onedal_kneighbors", file=sys.stderr) ( self._distances_fit_X_, _neighbors_indices_fit_X_, @@ -109,6 +113,7 @@ def _onedal_fit(self, X, y, queue=None): "Increase the number of neighbors for more accurate results." ) + print(f"DEBUG LocalOutlierFactor._onedal_fit END: _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) return self def fit(self, X, y=None): From 169df263bc2d032c4508829839310e04a6742aa3 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 14 Oct 2025 12:24:13 -0700 Subject: [PATCH 30/87] fix: fix test as well --- .../tests/test_knn_classification.py | 118 ++++++++++++++---- 1 file changed, 93 insertions(+), 25 deletions(-) diff --git a/onedal/neighbors/tests/test_knn_classification.py b/onedal/neighbors/tests/test_knn_classification.py index c0410d8cb1..c272f7620a 100755 --- a/onedal/neighbors/tests/test_knn_classification.py +++ b/onedal/neighbors/tests/test_knn_classification.py @@ -1,5 +1,5 @@ # =============================================================================== -# Copyright 2022 Intel Corporation +# Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,36 +14,104 @@ # limitations under the License. 
# =============================================================================== -import numpy as np import pytest -from numpy.testing import assert_array_equal -from sklearn import datasets +from numpy.testing import assert_allclose -from sklearnex.neighbors import KNeighborsClassifier -from onedal.tests.utils._device_selection import get_queues +from onedal.tests.utils._dataframes_support import ( + _as_numpy, + _convert_to_dataframe, + get_dataframes_and_queues, +) +from sklearnex.neighbors import ( + KNeighborsClassifier, + KNeighborsRegressor, + LocalOutlierFactor, + NearestNeighbors, +) -@pytest.mark.parametrize("queue", get_queues()) -def test_iris(queue): - iris = datasets.load_iris() - clf = KNeighborsClassifier(2).fit(iris.data, iris.target) - assert clf.score(iris.data, iris.target) > 0.9 - assert_array_equal(clf.classes_, np.sort(clf.classes_)) +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) +def test_sklearnex_import_knn_classifier(dataframe, queue): + import sys + print(f"\n=== DEBUG test_sklearnex_import_knn_classifier START: dataframe={dataframe}, queue={queue} ===", file=sys.stderr) + X = _convert_to_dataframe([[0], [1], [2], [3]], sycl_queue=queue, target_df=dataframe) + print(f"DEBUG test: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + y = _convert_to_dataframe([0, 0, 1, 1], sycl_queue=queue, target_df=dataframe) + print(f"DEBUG test: y type={type(y)}", file=sys.stderr) + print(f"DEBUG test: Creating KNeighborsClassifier and calling fit", file=sys.stderr) + neigh = KNeighborsClassifier(n_neighbors=3).fit(X, y) + print(f"DEBUG test: fit completed, neigh._fit_X type={type(getattr(neigh, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + y_test = _convert_to_dataframe([[1.1]], sycl_queue=queue, target_df=dataframe) + print(f"DEBUG test: Calling predict with y_test type={type(y_test)}", file=sys.stderr) + pred = _as_numpy(neigh.predict(y_test)) + print(f"DEBUG test: predict completed, pred={pred}", file=sys.stderr) + assert "sklearnex" in neigh.__module__ + assert_allclose(pred, [0]) + print(f"=== DEBUG test_sklearnex_import_knn_classifier END ===\n", file=sys.stderr) -@pytest.mark.parametrize("queue", get_queues()) -def test_pickle(queue): - if queue and queue.sycl_device.is_gpu: - pytest.skip("KNN classifier pickling for the GPU sycl_queue is buggy.") - iris = datasets.load_iris() - clf = KNeighborsClassifier(2).fit(iris.data, iris.target) - expected = clf.predict(iris.data) +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) +def test_sklearnex_import_knn_regression(dataframe, queue): + import sys + print(f"\n=== DEBUG test_sklearnex_import_knn_regression START: dataframe={dataframe}, queue={queue} ===", file=sys.stderr) + X = _convert_to_dataframe([[0], [1], [2], [3]], sycl_queue=queue, target_df=dataframe) + print(f"DEBUG test: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + y = _convert_to_dataframe([0, 0, 1, 1], sycl_queue=queue, target_df=dataframe) + print(f"DEBUG test: y type={type(y)}", file=sys.stderr) + print(f"DEBUG test: Creating KNeighborsRegressor and calling fit", file=sys.stderr) + neigh = KNeighborsRegressor(n_neighbors=2).fit(X, y) + print(f"DEBUG test: fit completed, neigh._fit_X type={type(getattr(neigh, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + y_test = _convert_to_dataframe([[1.5]], sycl_queue=queue, target_df=dataframe) + print(f"DEBUG test: Calling predict with y_test type={type(y_test)}", file=sys.stderr) + pred = 
_as_numpy(neigh.predict(y_test)).squeeze() + print(f"DEBUG test: predict completed, pred={pred}", file=sys.stderr) + assert "sklearnex" in neigh.__module__ + assert_allclose(pred, 0.5) + print(f"=== DEBUG test_sklearnex_import_knn_regression END ===\n", file=sys.stderr) - import pickle - dump = pickle.dumps(clf) - clf2 = pickle.loads(dump) +@pytest.mark.parametrize("algorithm", ["auto", "brute"]) +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) +@pytest.mark.parametrize( + "estimator", + [LocalOutlierFactor, NearestNeighbors], +) +def test_sklearnex_kneighbors(algorithm, estimator, dataframe, queue): + import sys + print(f"\n=== DEBUG test_sklearnex_kneighbors START: algorithm={algorithm}, estimator={estimator.__name__}, dataframe={dataframe}, queue={queue} ===", file=sys.stderr) + X = [[0, 0, 2], [1, 0, 0], [0, 0, 1]] + X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + print(f"DEBUG test: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + test = _convert_to_dataframe([[0, 0, 1.3]], sycl_queue=queue, target_df=dataframe) + print(f"DEBUG test: test type={type(test)}", file=sys.stderr) + print(f"DEBUG test: Creating {estimator.__name__} and calling fit", file=sys.stderr) + neigh = estimator(n_neighbors=2, algorithm=algorithm).fit(X) + print(f"DEBUG test: fit completed, neigh._fit_X type={type(getattr(neigh, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print(f"DEBUG test: Calling kneighbors", file=sys.stderr) + result = neigh.kneighbors(test, 2, return_distance=False) + result = _as_numpy(result) + print(f"DEBUG test: kneighbors completed, result={result}", file=sys.stderr) + assert "sklearnex" in neigh.__module__ + assert_allclose(result, [[2, 0]]) + print(f"DEBUG test: Calling kneighbors with no args", file=sys.stderr) + result = neigh.kneighbors() + print(f"=== DEBUG test_sklearnex_kneighbors END ===\n", file=sys.stderr) - assert type(clf2) == clf.__class__ - result = clf2.predict(iris.data) - assert_array_equal(expected, result) + +@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) +def test_sklearnex_import_lof(dataframe, queue): + import sys + print(f"\n=== DEBUG test_sklearnex_import_lof START: dataframe={dataframe}, queue={queue} ===", file=sys.stderr) + X = [[7, 7, 7], [1, 0, 0], [0, 0, 1], [0, 0, 1]] + X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) + print(f"DEBUG test: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f"DEBUG test: Creating LocalOutlierFactor and calling fit_predict", file=sys.stderr) + lof = LocalOutlierFactor(n_neighbors=2) + result = lof.fit_predict(X) + result = _as_numpy(result) + print(f"DEBUG test: fit_predict completed, result={result}", file=sys.stderr) + print(f"DEBUG test: lof._fit_X type={type(getattr(lof, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + assert hasattr(lof, "_onedal_estimator") + assert "sklearnex" in lof.__module__ + assert_allclose(result, [-1, 1, 1, 1]) + print(f"=== DEBUG test_sklearnex_import_lof END ===\n", file=sys.stderr) \ No newline at end of file From 2a2a800ed91645541d8f982eadaeceaf4cbaace5 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 14 Oct 2025 12:26:21 -0700 Subject: [PATCH 31/87] fix: fix test --- .../tests/test_knn_classification.py | 135 ++++++------------ 1 file changed, 45 insertions(+), 90 deletions(-) diff --git a/onedal/neighbors/tests/test_knn_classification.py b/onedal/neighbors/tests/test_knn_classification.py index c272f7620a..0c0fb10edf 100755 --- 
a/onedal/neighbors/tests/test_knn_classification.py +++ b/onedal/neighbors/tests/test_knn_classification.py @@ -1,5 +1,5 @@ # =============================================================================== -# Copyright 2021 Intel Corporation +# Copyright 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,104 +14,59 @@ # limitations under the License. # =============================================================================== +import numpy as np import pytest -from numpy.testing import assert_allclose +from numpy.testing import assert_array_equal +from sklearn import datasets -from onedal.tests.utils._dataframes_support import ( - _as_numpy, - _convert_to_dataframe, - get_dataframes_and_queues, -) -from sklearnex.neighbors import ( - KNeighborsClassifier, - KNeighborsRegressor, - LocalOutlierFactor, - NearestNeighbors, -) +from onedal.neighbors import KNeighborsClassifier +from onedal.tests.utils._device_selection import get_queues -@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) -def test_sklearnex_import_knn_classifier(dataframe, queue): +@pytest.mark.parametrize("queue", get_queues()) +def test_iris(queue): import sys - print(f"\n=== DEBUG test_sklearnex_import_knn_classifier START: dataframe={dataframe}, queue={queue} ===", file=sys.stderr) - X = _convert_to_dataframe([[0], [1], [2], [3]], sycl_queue=queue, target_df=dataframe) - print(f"DEBUG test: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - y = _convert_to_dataframe([0, 0, 1, 1], sycl_queue=queue, target_df=dataframe) - print(f"DEBUG test: y type={type(y)}", file=sys.stderr) + print(f"\n=== DEBUG test_iris START: queue={queue} ===", file=sys.stderr) + iris = datasets.load_iris() + print(f"DEBUG test: iris.data type={type(iris.data)}, shape={iris.data.shape}", file=sys.stderr) + print(f"DEBUG test: iris.target type={type(iris.target)}, shape={iris.target.shape}", file=sys.stderr) print(f"DEBUG test: Creating KNeighborsClassifier and calling fit", file=sys.stderr) - neigh = KNeighborsClassifier(n_neighbors=3).fit(X, y) - print(f"DEBUG test: fit completed, neigh._fit_X type={type(getattr(neigh, '_fit_X', 'NOT_SET'))}", file=sys.stderr) - y_test = _convert_to_dataframe([[1.1]], sycl_queue=queue, target_df=dataframe) - print(f"DEBUG test: Calling predict with y_test type={type(y_test)}", file=sys.stderr) - pred = _as_numpy(neigh.predict(y_test)) - print(f"DEBUG test: predict completed, pred={pred}", file=sys.stderr) - assert "sklearnex" in neigh.__module__ - assert_allclose(pred, [0]) - print(f"=== DEBUG test_sklearnex_import_knn_classifier END ===\n", file=sys.stderr) + clf = KNeighborsClassifier(2).fit(iris.data, iris.target, queue=queue) + print(f"DEBUG test: fit completed, clf._fit_X type={type(getattr(clf, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print(f"DEBUG test: Calling score", file=sys.stderr) + score = clf.score(iris.data, iris.target, queue=queue) + print(f"DEBUG test: score completed, score={score}", file=sys.stderr) + assert score > 0.9 + assert_array_equal(clf.classes_, np.sort(clf.classes_)) + print(f"=== DEBUG test_iris END ===\n", file=sys.stderr) -@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) -def test_sklearnex_import_knn_regression(dataframe, queue): +@pytest.mark.parametrize("queue", get_queues()) +def test_pickle(queue): import sys - print(f"\n=== DEBUG test_sklearnex_import_knn_regression START: 
dataframe={dataframe}, queue={queue} ===", file=sys.stderr) - X = _convert_to_dataframe([[0], [1], [2], [3]], sycl_queue=queue, target_df=dataframe) - print(f"DEBUG test: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - y = _convert_to_dataframe([0, 0, 1, 1], sycl_queue=queue, target_df=dataframe) - print(f"DEBUG test: y type={type(y)}", file=sys.stderr) - print(f"DEBUG test: Creating KNeighborsRegressor and calling fit", file=sys.stderr) - neigh = KNeighborsRegressor(n_neighbors=2).fit(X, y) - print(f"DEBUG test: fit completed, neigh._fit_X type={type(getattr(neigh, '_fit_X', 'NOT_SET'))}", file=sys.stderr) - y_test = _convert_to_dataframe([[1.5]], sycl_queue=queue, target_df=dataframe) - print(f"DEBUG test: Calling predict with y_test type={type(y_test)}", file=sys.stderr) - pred = _as_numpy(neigh.predict(y_test)).squeeze() - print(f"DEBUG test: predict completed, pred={pred}", file=sys.stderr) - assert "sklearnex" in neigh.__module__ - assert_allclose(pred, 0.5) - print(f"=== DEBUG test_sklearnex_import_knn_regression END ===\n", file=sys.stderr) - + print(f"\n=== DEBUG test_pickle START: queue={queue} ===", file=sys.stderr) + if queue and queue.sycl_device.is_gpu: + pytest.skip("KNN classifier pickling for the GPU sycl_queue is buggy.") + iris = datasets.load_iris() + print(f"DEBUG test: iris.data type={type(iris.data)}, shape={iris.data.shape}", file=sys.stderr) + print(f"DEBUG test: iris.target type={type(iris.target)}, shape={iris.target.shape}", file=sys.stderr) + print(f"DEBUG test: Creating KNeighborsClassifier and calling fit", file=sys.stderr) + clf = KNeighborsClassifier(2).fit(iris.data, iris.target, queue=queue) + print(f"DEBUG test: fit completed, clf._fit_X type={type(getattr(clf, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print(f"DEBUG test: Calling predict", file=sys.stderr) + expected = clf.predict(iris.data, queue=queue) + print(f"DEBUG test: predict completed, expected type={type(expected)}, shape={expected.shape}", file=sys.stderr) -@pytest.mark.parametrize("algorithm", ["auto", "brute"]) -@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) -@pytest.mark.parametrize( - "estimator", - [LocalOutlierFactor, NearestNeighbors], -) -def test_sklearnex_kneighbors(algorithm, estimator, dataframe, queue): - import sys - print(f"\n=== DEBUG test_sklearnex_kneighbors START: algorithm={algorithm}, estimator={estimator.__name__}, dataframe={dataframe}, queue={queue} ===", file=sys.stderr) - X = [[0, 0, 2], [1, 0, 0], [0, 0, 1]] - X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) - print(f"DEBUG test: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - test = _convert_to_dataframe([[0, 0, 1.3]], sycl_queue=queue, target_df=dataframe) - print(f"DEBUG test: test type={type(test)}", file=sys.stderr) - print(f"DEBUG test: Creating {estimator.__name__} and calling fit", file=sys.stderr) - neigh = estimator(n_neighbors=2, algorithm=algorithm).fit(X) - print(f"DEBUG test: fit completed, neigh._fit_X type={type(getattr(neigh, '_fit_X', 'NOT_SET'))}", file=sys.stderr) - print(f"DEBUG test: Calling kneighbors", file=sys.stderr) - result = neigh.kneighbors(test, 2, return_distance=False) - result = _as_numpy(result) - print(f"DEBUG test: kneighbors completed, result={result}", file=sys.stderr) - assert "sklearnex" in neigh.__module__ - assert_allclose(result, [[2, 0]]) - print(f"DEBUG test: Calling kneighbors with no args", file=sys.stderr) - result = neigh.kneighbors() - print(f"=== DEBUG 
test_sklearnex_kneighbors END ===\n", file=sys.stderr) + import pickle + print(f"DEBUG test: Pickling classifier", file=sys.stderr) + dump = pickle.dumps(clf) + print(f"DEBUG test: Unpickling classifier", file=sys.stderr) + clf2 = pickle.loads(dump) -@pytest.mark.parametrize("dataframe,queue", get_dataframes_and_queues()) -def test_sklearnex_import_lof(dataframe, queue): - import sys - print(f"\n=== DEBUG test_sklearnex_import_lof START: dataframe={dataframe}, queue={queue} ===", file=sys.stderr) - X = [[7, 7, 7], [1, 0, 0], [0, 0, 1], [0, 0, 1]] - X = _convert_to_dataframe(X, sycl_queue=queue, target_df=dataframe) - print(f"DEBUG test: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f"DEBUG test: Creating LocalOutlierFactor and calling fit_predict", file=sys.stderr) - lof = LocalOutlierFactor(n_neighbors=2) - result = lof.fit_predict(X) - result = _as_numpy(result) - print(f"DEBUG test: fit_predict completed, result={result}", file=sys.stderr) - print(f"DEBUG test: lof._fit_X type={type(getattr(lof, '_fit_X', 'NOT_SET'))}", file=sys.stderr) - assert hasattr(lof, "_onedal_estimator") - assert "sklearnex" in lof.__module__ - assert_allclose(result, [-1, 1, 1, 1]) - print(f"=== DEBUG test_sklearnex_import_lof END ===\n", file=sys.stderr) \ No newline at end of file + assert type(clf2) == clf.__class__ + print(f"DEBUG test: Calling predict on unpickled classifier", file=sys.stderr) + result = clf2.predict(iris.data, queue=queue) + print(f"DEBUG test: predict completed, result type={type(result)}, shape={result.shape}", file=sys.stderr) + assert_array_equal(expected, result) + print(f"=== DEBUG test_pickle END ===\n", file=sys.stderr) \ No newline at end of file From 4377198108014732c91b91ac5afa632415c465c9 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 14 Oct 2025 12:57:38 -0700 Subject: [PATCH 32/87] fix: comment out validate data --- sklearnex/neighbors/common.py | 60 +++++++++++++++++------------------ 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 843952ffb0..d157be005e 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -60,36 +60,36 @@ def _parse_auto_method(self, method, n_samples, n_features): return result_method - def _validate_data( - self, X, y=None, reset=True, validate_separately=None, **check_params - ): - if y is None: - if getattr(self, "requires_y", False): - raise ValueError( - f"This {self.__class__.__name__} estimator " - f"requires y to be passed, but the target y is None." - ) - X = _check_array(X, **check_params) - out = X, y - else: - if validate_separately: - # We need this because some estimators validate X and y - # separately, and in general, separately calling _check_array() - # on X and y isn't equivalent to just calling _check_X_y() - # :( - check_X_params, check_y_params = validate_separately - X = _check_array(X, **check_X_params) - y = _check_array(y, **check_y_params) - else: - X, y = _check_X_y(X, y, **check_params) - out = X, y - - if check_params.get("ensure_2d", True): - from onedal.utils.validation import _check_n_features - - _check_n_features(self, X, reset=reset) - - return out + # def _validate_data( + # self, X, y=None, reset=True, validate_separately=None, **check_params + # ): + # if y is None: + # if getattr(self, "requires_y", False): + # raise ValueError( + # f"This {self.__class__.__name__} estimator " + # f"requires y to be passed, but the target y is None." 
+ # ) + # X = _check_array(X, **check_params) + # out = X, y + # else: + # if validate_separately: + # # We need this because some estimators validate X and y + # # separately, and in general, separately calling _check_array() + # # on X and y isn't equivalent to just calling _check_X_y() + # # :( + # check_X_params, check_y_params = validate_separately + # X = _check_array(X, **check_X_params) + # y = _check_array(y, **check_y_params) + # else: + # X, y = _check_X_y(X, y, **check_params) + # out = X, y + + # if check_params.get("ensure_2d", True): + # from onedal.utils.validation import _check_n_features + + # _check_n_features(self, X, reset=reset) + + # return out From 50f9b9d965df4008fa2806b5e666b3e7289ca4b0 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 14 Oct 2025 13:30:48 -0700 Subject: [PATCH 33/87] fix: refactor classifier preprocessing to sklearnex --- onedal/neighbors/neighbors.py | 51 +++++++++++++---------- sklearnex/neighbors/knn_classification.py | 15 +++++++ 2 files changed, 45 insertions(+), 21 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index 0785a4b754..f7d53a9067 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -197,7 +197,10 @@ def _fit(self, X, y): self._onedal_model = None self._tree = None self._shape = None - self.classes_ = None + # REFACTOR STEP 1: Don't reset classes_ - it may have been set by sklearnex layer + # self.classes_ = None + if not hasattr(self, 'classes_'): + self.classes_ = None self.effective_metric_ = getattr(self, "effective_metric_", self.metric) self.effective_metric_params_ = getattr( self, "effective_metric_params_", self.metric_params @@ -213,26 +216,32 @@ def _fit(self, X, y): ) self._shape = shape if shape is not None else y.shape - if _is_classifier(self): - if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1: - self.outputs_2d_ = False - y = y.reshape((-1, 1)) - else: - self.outputs_2d_ = True - - _check_classification_targets(y) - self.classes_ = [] - self._y = np.empty(y.shape, dtype=int) - for k in range(self._y.shape[1]): - classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True) - self.classes_.append(classes) - - if not self.outputs_2d_: - self.classes_ = self.classes_[0] - self._y = self._y.ravel() - - self._validate_n_classes() - else: + # REFACTOR STEP 1: Classification target processing moved to sklearnex layer + # This code is now commented out - processing happens in sklearnex before calling fit + # if _is_classifier(self): + # if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1: + # self.outputs_2d_ = False + # y = y.reshape((-1, 1)) + # else: + # self.outputs_2d_ = True + + # _check_classification_targets(y) + # self.classes_ = [] + # self._y = np.empty(y.shape, dtype=int) + # for k in range(self._y.shape[1]): + # classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True) + # self.classes_.append(classes) + + # if not self.outputs_2d_: + # self.classes_ = self.classes_[0] + # self._y = self._y.ravel() + + # self._validate_n_classes() + # else: + # self._y = y + + # For now, keep basic _y assignment for compatibility + if not hasattr(self, '_y'): self._y = y elif not use_raw_input: X, _ = super()._validate_data(X, dtype=[np.float64, np.float32]) diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index f6a867e234..82c155c185 100755 --- a/sklearnex/neighbors/knn_classification.py +++ 
b/sklearnex/neighbors/knn_classification.py @@ -159,6 +159,12 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): def _onedal_fit(self, X, y, queue=None): import sys print(f"DEBUG KNeighborsClassifier._onedal_fit START: X type={type(X)}, y type={type(y)}", file=sys.stderr) + + # REFACTOR STEP 1: Process classification targets in sklearnex before passing to onedal + print(f"DEBUG: Processing classification targets in sklearnex", file=sys.stderr) + y_processed = self._process_classification_targets(y) + print(f"DEBUG: After _process_classification_targets, y_processed type={type(y_processed)}", file=sys.stderr) + onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, @@ -171,6 +177,15 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ + + # REFACTOR: Pass both original and processed targets to onedal + # onedal needs the processed classes_ and _y attributes that we just set + self._onedal_estimator.classes_ = self.classes_ + self._onedal_estimator._y = self._y + self._onedal_estimator.outputs_2d_ = self.outputs_2d_ + print(f"DEBUG: Set onedal_estimator.classes_={self._onedal_estimator.classes_}", file=sys.stderr) + print(f"DEBUG: Set onedal_estimator._y shape={self._onedal_estimator._y.shape}", file=sys.stderr) + print(f"DEBUG KNeighborsClassifier._onedal_fit: Calling onedal_estimator.fit", file=sys.stderr) self._onedal_estimator.fit(X, y, queue=queue) print(f"DEBUG KNeighborsClassifier._onedal_fit: After fit, calling _save_attributes", file=sys.stderr) From 833f7aba32adf4a2a9d26ed505eef039bfb636a0 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 14 Oct 2025 14:09:22 -0700 Subject: [PATCH 34/87] fix: add validate_data and see if it fixes AttributeError --- sklearnex/neighbors/common.py | 14 ++++++++++++-- sklearnex/neighbors/knn_classification.py | 9 ++++++++- sklearnex/neighbors/knn_regression.py | 10 +++++++++- sklearnex/neighbors/knn_unsupervised.py | 11 ++++++++++- 4 files changed, 39 insertions(+), 5 deletions(-) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index d157be005e..6db3490840 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -25,7 +25,10 @@ from sklearn.neighbors._kd_tree import KDTree from sklearn.utils.validation import check_is_fitted +from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import sklearn_check_version + +from ..utils.validation import validate_data from onedal._device_offload import _transfer_to_host from onedal.utils.validation import ( _check_array, @@ -167,8 +170,15 @@ def _validate_kneighbors_bounds(self, n_neighbors, query_is_train, X): ) def _process_classification_targets(self, y): - """Process classification targets and set class-related attributes.""" - import numpy as np + """Process classification targets and set class-related attributes. + + Note: y should already be converted to numpy array via validate_data before calling this. 
+ """ + import sys + print(f"DEBUG _process_classification_targets: y type={type(y)}, y shape={getattr(y, 'shape', 'NO_SHAPE')}", file=sys.stderr) + + # y should already be numpy array from validate_data + y = np.asarray(y) # Handle shape processing shape = getattr(y, "shape", None) diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index 82c155c185..57fb511a8a 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -14,6 +14,7 @@ # limitations under the License. # =============================================================================== +import numpy as np from sklearn.metrics import accuracy_score from sklearn.neighbors._classification import ( KNeighborsClassifier as _sklearn_KNeighborsClassifier, @@ -26,7 +27,7 @@ from onedal.neighbors import KNeighborsClassifier as onedal_KNeighborsClassifier from .._device_offload import dispatch, wrap_output_data -from ..utils.validation import check_feature_names +from ..utils.validation import check_feature_names, validate_data from .common import KNeighborsDispatchingBase @@ -160,6 +161,12 @@ def _onedal_fit(self, X, y, queue=None): import sys print(f"DEBUG KNeighborsClassifier._onedal_fit START: X type={type(X)}, y type={type(y)}", file=sys.stderr) + # REFACTOR: Use validate_data from sklearnex.utils.validation to convert pandas to numpy + X, y = validate_data( + self, X, y, dtype=[np.float64, np.float32], accept_sparse="csr" + ) + print(f"DEBUG: After validate_data, X type={type(X)}, y type={type(y)}", file=sys.stderr) + # REFACTOR STEP 1: Process classification targets in sklearnex before passing to onedal print(f"DEBUG: Processing classification targets in sklearnex", file=sys.stderr) y_processed = self._process_classification_targets(y) diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index f788ed6618..b659e478f7 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -14,6 +14,7 @@ # limitations under the License. 
# ============================================================================== +import numpy as np from sklearn.metrics import r2_score from sklearn.neighbors._regression import ( KNeighborsRegressor as _sklearn_KNeighborsRegressor, @@ -26,7 +27,7 @@ from onedal.neighbors import KNeighborsRegressor as onedal_KNeighborsRegressor from .._device_offload import dispatch, wrap_output_data -from ..utils.validation import check_feature_names +from ..utils.validation import check_feature_names, validate_data from .common import KNeighborsDispatchingBase @@ -139,6 +140,13 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): def _onedal_fit(self, X, y, queue=None): import sys print(f"DEBUG KNeighborsRegressor._onedal_fit START: X type={type(X)}, y type={type(y)}", file=sys.stderr) + + # REFACTOR: Use validate_data from sklearnex.utils.validation to convert pandas to numpy + X, y = validate_data( + self, X, y, dtype=[np.float64, np.float32], accept_sparse="csr", y_numeric=True + ) + print(f"DEBUG: After validate_data, X type={type(X)}, y type={type(y)}", file=sys.stderr) + onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 556847fc6e..d5851792ac 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -15,6 +15,8 @@ # =============================================================================== import sys + +import numpy as np from sklearn.neighbors._unsupervised import NearestNeighbors as _sklearn_NearestNeighbors from sklearn.utils.validation import _deprecate_positional_args, check_is_fitted @@ -24,7 +26,7 @@ from onedal.neighbors import NearestNeighbors as onedal_NearestNeighbors from .._device_offload import dispatch, wrap_output_data -from ..utils.validation import check_feature_names +from ..utils.validation import check_feature_names, validate_data from .common import KNeighborsDispatchingBase @@ -142,6 +144,13 @@ def radius_neighbors_graph( def _onedal_fit(self, X, y=None, queue=None): print(f"DEBUG NearestNeighbors._onedal_fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", file=sys.stderr) + + # REFACTOR: Use validate_data from sklearnex.utils.validation to convert pandas to numpy + X = validate_data( + self, X, dtype=[np.float64, np.float32], accept_sparse="csr" + ) + print(f"DEBUG: After validate_data, X type={type(X)}", file=sys.stderr) + onedal_params = { "n_neighbors": self.n_neighbors, "algorithm": self.algorithm, From a2af2ef51e8b99ab7037b9985090185b271c6405 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 14 Oct 2025 15:01:26 -0700 Subject: [PATCH 35/87] fix: fix onedal test --- onedal/neighbors/neighbors.py | 25 ++++++++++++++----- .../tests/test_knn_classification.py | 16 +++++++----- sklearnex/neighbors/knn_classification.py | 3 ++- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index f7d53a9067..1730ded60a 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -216,8 +216,23 @@ def _fit(self, X, y): ) self._shape = shape if shape is not None else y.shape - # REFACTOR STEP 1: Classification target processing moved to sklearnex layer - # This code is now commented out - processing happens in sklearnex before calling fit + # REFACTOR: Classification target processing moved to sklearnex layer + # This code is now commented out - processing 
MUST happen in sklearnex before calling fit + # Assertion: Verify that sklearnex has done the preprocessing + if _is_classifier(self): + if not hasattr(self, 'classes_') or self.classes_ is None: + raise ValueError( + "Classification target processing must be done in sklearnex layer before calling onedal fit. " + "classes_ attribute is not set. This indicates the refactoring is incomplete." + ) + if not hasattr(self, '_y') or self._y is None: + raise ValueError( + "Classification target processing must be done in sklearnex layer before calling onedal fit. " + "_y attribute is not set. This indicates the refactoring is incomplete." + ) + print(f"DEBUG oneDAL: Using pre-processed classification targets from sklearnex (classes_={self.classes_})", file=sys.stderr) + + # Original classification processing code - NOW COMMENTED OUT (moved to sklearnex) # if _is_classifier(self): # if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1: # self.outputs_2d_ = False @@ -238,10 +253,8 @@ def _fit(self, X, y): # self._validate_n_classes() # else: - # self._y = y - - # For now, keep basic _y assignment for compatibility - if not hasattr(self, '_y'): + else: + # For regressors, just store y self._y = y elif not use_raw_input: X, _ = super()._validate_data(X, dtype=[np.float64, np.float32]) diff --git a/onedal/neighbors/tests/test_knn_classification.py b/onedal/neighbors/tests/test_knn_classification.py index 0c0fb10edf..783d9d6e24 100755 --- a/onedal/neighbors/tests/test_knn_classification.py +++ b/onedal/neighbors/tests/test_knn_classification.py @@ -19,7 +19,9 @@ from numpy.testing import assert_array_equal from sklearn import datasets -from onedal.neighbors import KNeighborsClassifier +# REFACTOR: Import from sklearnex instead of onedal +# Classification processing now happens in sklearnex layer +from sklearnex.neighbors import KNeighborsClassifier from onedal.tests.utils._device_selection import get_queues @@ -27,14 +29,15 @@ def test_iris(queue): import sys print(f"\n=== DEBUG test_iris START: queue={queue} ===", file=sys.stderr) + # REFACTOR NOTE: queue parameter not used with sklearnex, but kept for test parametrization iris = datasets.load_iris() print(f"DEBUG test: iris.data type={type(iris.data)}, shape={iris.data.shape}", file=sys.stderr) print(f"DEBUG test: iris.target type={type(iris.target)}, shape={iris.target.shape}", file=sys.stderr) print(f"DEBUG test: Creating KNeighborsClassifier and calling fit", file=sys.stderr) - clf = KNeighborsClassifier(2).fit(iris.data, iris.target, queue=queue) + clf = KNeighborsClassifier(2).fit(iris.data, iris.target) print(f"DEBUG test: fit completed, clf._fit_X type={type(getattr(clf, '_fit_X', 'NOT_SET'))}", file=sys.stderr) print(f"DEBUG test: Calling score", file=sys.stderr) - score = clf.score(iris.data, iris.target, queue=queue) + score = clf.score(iris.data, iris.target) print(f"DEBUG test: score completed, score={score}", file=sys.stderr) assert score > 0.9 assert_array_equal(clf.classes_, np.sort(clf.classes_)) @@ -45,16 +48,17 @@ def test_iris(queue): def test_pickle(queue): import sys print(f"\n=== DEBUG test_pickle START: queue={queue} ===", file=sys.stderr) + # REFACTOR NOTE: queue parameter not used with sklearnex, but kept for test parametrization if queue and queue.sycl_device.is_gpu: pytest.skip("KNN classifier pickling for the GPU sycl_queue is buggy.") iris = datasets.load_iris() print(f"DEBUG test: iris.data type={type(iris.data)}, shape={iris.data.shape}", file=sys.stderr) print(f"DEBUG test: iris.target type={type(iris.target)}, 
shape={iris.target.shape}", file=sys.stderr) print(f"DEBUG test: Creating KNeighborsClassifier and calling fit", file=sys.stderr) - clf = KNeighborsClassifier(2).fit(iris.data, iris.target, queue=queue) + clf = KNeighborsClassifier(2).fit(iris.data, iris.target) print(f"DEBUG test: fit completed, clf._fit_X type={type(getattr(clf, '_fit_X', 'NOT_SET'))}", file=sys.stderr) print(f"DEBUG test: Calling predict", file=sys.stderr) - expected = clf.predict(iris.data, queue=queue) + expected = clf.predict(iris.data) print(f"DEBUG test: predict completed, expected type={type(expected)}, shape={expected.shape}", file=sys.stderr) import pickle @@ -66,7 +70,7 @@ def test_pickle(queue): assert type(clf2) == clf.__class__ print(f"DEBUG test: Calling predict on unpickled classifier", file=sys.stderr) - result = clf2.predict(iris.data, queue=queue) + result = clf2.predict(iris.data) print(f"DEBUG test: predict completed, result type={type(result)}, shape={result.shape}", file=sys.stderr) assert_array_equal(expected, result) print(f"=== DEBUG test_pickle END ===\n", file=sys.stderr) \ No newline at end of file diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index 57fb511a8a..59a2d6f73d 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -193,7 +193,8 @@ def _onedal_fit(self, X, y, queue=None): print(f"DEBUG: Set onedal_estimator.classes_={self._onedal_estimator.classes_}", file=sys.stderr) print(f"DEBUG: Set onedal_estimator._y shape={self._onedal_estimator._y.shape}", file=sys.stderr) - print(f"DEBUG KNeighborsClassifier._onedal_fit: Calling onedal_estimator.fit", file=sys.stderr) + print(f"DEBUG KNeighborsClassifier._onedal_fit: Calling onedal_estimator.fit with X and original y", file=sys.stderr) + # Pass original y to onedal - it will use the pre-set classes_ and _y attributes we just assigned self._onedal_estimator.fit(X, y, queue=queue) print(f"DEBUG KNeighborsClassifier._onedal_fit: After fit, calling _save_attributes", file=sys.stderr) From 0b601f9d8720f34898db3d806f3c22786bd81e3f Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 14 Oct 2025 15:26:26 -0700 Subject: [PATCH 36/87] fix: dpm --- sklearnex/neighbors/knn_regression.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index b659e478f7..27c5ce0e4d 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -141,9 +141,9 @@ def _onedal_fit(self, X, y, queue=None): import sys print(f"DEBUG KNeighborsRegressor._onedal_fit START: X type={type(X)}, y type={type(y)}", file=sys.stderr) - # REFACTOR: Use validate_data from sklearnex.utils.validation to convert pandas to numpy - X, y = validate_data( - self, X, y, dtype=[np.float64, np.float32], accept_sparse="csr", y_numeric=True + # REFACTOR: Use validate_data from sklearnex.utils.validation to convert pandas to numpy for X only + X = validate_data( + self, X, dtype=[np.float64, np.float32], accept_sparse="csr" ) print(f"DEBUG: After validate_data, X type={type(X)}, y type={type(y)}", file=sys.stderr) From 97f9bd1d35273aa84db2a1fc200a3504a7514eb2 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 14 Oct 2025 16:24:47 -0700 Subject: [PATCH 37/87] fix: refacto validate n classes --- onedal/neighbors/neighbors.py | 19 ++++++++++++------- sklearnex/neighbors/common.py | 14 ++++++++++++++ 2 files changed, 26 insertions(+), 7 deletions(-) diff --git 
a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index 1730ded60a..f78c3c9b13 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -185,12 +185,15 @@ def _validate_targets(self, y, dtype): except ValueError: return arr - def _validate_n_classes(self): - length = 0 if self.classes_ is None else len(self.classes_) - if length < 2: - raise ValueError( - f"The number of classes has to be greater than one; got {length}" - ) + # REFACTOR NOTE: _validate_n_classes moved to sklearnex/neighbors/common.py + # This method is no longer used in the onedal layer - all validation happens in sklearnex + # Commented out for reference only + # def _validate_n_classes(self): + # length = 0 if self.classes_ is None else len(self.classes_) + # if length < 2: + # raise ValueError( + # f"The number of classes has to be greater than one; got {length}" + # ) def _fit(self, X, y): print(f"DEBUG oneDAL _fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", file=sys.stderr) @@ -480,7 +483,9 @@ def predict(self, X, queue=None): self.algorithm, n_samples_fit_, n_features ) - self._validate_n_classes() + # REFACTOR NOTE: _validate_n_classes() is now called during fit in sklearnex layer + # No need to validate again during predict + # self._validate_n_classes() params = self._get_onedal_params(X) prediction_result = self._onedal_predict(onedal_model, X, params) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 6db3490840..636f577e85 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -143,6 +143,14 @@ def _validate_n_neighbors(self, n_neighbors): "enter integer value" % type(n_neighbors) ) + def _validate_n_classes(self): + """Validate that the classifier has at least 2 classes.""" + length = 0 if self.classes_ is None else len(self.classes_) + if length < 2: + raise ValueError( + f"The number of classes has to be greater than one; got {length}" + ) + def _validate_feature_count(self, X, method_name=""): n_features = getattr(self, "n_features_in_", None) shape = getattr(X, "shape", None) @@ -190,6 +198,9 @@ def _process_classification_targets(self, y): else: self.outputs_2d_ = True + # Validate classification targets + _check_classification_targets(y) + # Process classes self.classes_ = [] self._y = np.empty(y.shape, dtype=int) @@ -201,6 +212,9 @@ def _process_classification_targets(self, y): self.classes_ = self.classes_[0] self._y = self._y.ravel() + # Validate we have at least 2 classes + self._validate_n_classes() + return y def _process_regression_targets(self, y): From e5300cad3b8458528a617434690f8f39ec5f1ee1 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 14 Oct 2025 17:01:07 -0700 Subject: [PATCH 38/87] fix: refacor kneighbors validation --- onedal/neighbors/neighbors.py | 96 ++++++++++++++--------- sklearnex/neighbors/common.py | 19 +++++ sklearnex/neighbors/knn_classification.py | 4 + sklearnex/neighbors/knn_regression.py | 4 + sklearnex/neighbors/knn_unsupervised.py | 4 + 5 files changed, 91 insertions(+), 36 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index f78c3c9b13..bc68f4a8ab 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -210,13 +210,16 @@ def _fit(self, X, y): ) _, xp, _ = _get_sycl_namespace(X) - use_raw_input = _get_config().get("use_raw_input", False) is True + # REFACTOR: _validate_data call commented out - validation now happens in sklearnex layer + # Original code kept for 
reference: + # use_raw_input = _get_config().get("use_raw_input", False) is True if y is not None or self.requires_y: shape = getattr(y, "shape", None) - if not use_raw_input: - X, y = super()._validate_data( - X, y, dtype=[np.float64, np.float32], accept_sparse="csr" - ) + # REFACTOR: _validate_data call commented out - validation now happens in sklearnex layer + # if not use_raw_input: + # X, y = super()._validate_data( + # X, y, dtype=[np.float64, np.float32], accept_sparse="csr" + # ) self._shape = shape if shape is not None else y.shape # REFACTOR: Classification target processing moved to sklearnex layer @@ -259,21 +262,24 @@ def _fit(self, X, y): else: # For regressors, just store y self._y = y - elif not use_raw_input: - X, _ = super()._validate_data(X, dtype=[np.float64, np.float32]) + # REFACTOR: _validate_data call commented out - validation now happens in sklearnex layer + # elif not use_raw_input: + # X, _ = super()._validate_data(X, dtype=[np.float64, np.float32]) self.n_samples_fit_ = X.shape[0] self.n_features_in_ = X.shape[1] self._fit_X = X - if self.n_neighbors is not None: - if self.n_neighbors <= 0: - raise ValueError("Expected n_neighbors > 0. Got %d" % self.n_neighbors) - if not isinstance(self.n_neighbors, Integral): - raise TypeError( - "n_neighbors does not take %s value, " - "enter integer value" % type(self.n_neighbors) - ) + # REFACTOR: n_neighbors validation commented out - should be done in sklearnex layer + # Original code kept for reference: + # if self.n_neighbors is not None: + # if self.n_neighbors <= 0: + # raise ValueError("Expected n_neighbors > 0. Got %d" % self.n_neighbors) + # if not isinstance(self.n_neighbors, Integral): + # raise TypeError( + # "n_neighbors does not take %s value, " + # "enter integer value" % type(self.n_neighbors) + # ) self._fit_method = super()._parse_auto_method( self.algorithm, self.n_samples_fit_, self.n_features_in_ @@ -298,35 +304,53 @@ def _fit(self, X, y): return result def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): - use_raw_input = _get_config().get("use_raw_input", False) is True + # REFACTOR: Feature count validation commented out - should be done in sklearnex layer + # Original validation code kept for reference: + # use_raw_input = _get_config().get("use_raw_input", False) is True + # n_features = getattr(self, "n_features_in_", None) + # shape = getattr(X, "shape", None) + # if n_features and shape and len(shape) > 1 and shape[1] != n_features: + # raise ValueError( + # ( + # f"X has {X.shape[1]} features, " + # f"but kneighbors is expecting " + # f"{n_features} features as input" + # ) + # ) + + # Still need n_features for _parse_auto_method call later n_features = getattr(self, "n_features_in_", None) - shape = getattr(X, "shape", None) - if n_features and shape and len(shape) > 1 and shape[1] != n_features: - raise ValueError( - ( - f"X has {X.shape[1]} features, " - f"but kneighbors is expecting " - f"{n_features} features as input" - ) - ) _check_is_fitted(self) if n_neighbors is None: n_neighbors = self.n_neighbors - elif n_neighbors <= 0: - raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors) - else: - if not isinstance(n_neighbors, Integral): - raise TypeError( - "n_neighbors does not take %s value, " - "enter integer value" % type(n_neighbors) - ) - + # REFACTOR: n_neighbors validation commented out - should be done in sklearnex layer + # Original validation code kept for reference: + # elif n_neighbors <= 0: + # raise ValueError("Expected n_neighbors > 0. 
Got %d" % n_neighbors) + # else: + # if not isinstance(n_neighbors, Integral): + # raise TypeError( + # "n_neighbors does not take %s value, " + # "enter integer value" % type(n_neighbors) + # ) + + # REFACTOR: X array validation commented out - should be done in sklearnex layer + # Original validation code kept for reference: + # if X is not None: + # query_is_train = False + # if not use_raw_input: + # X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + # else: + # query_is_train = True + # X = self._fit_X + # # Include an extra neighbor to account for the sample itself being + # # returned, which is removed later + # n_neighbors += 1 + if X is not None: query_is_train = False - if not use_raw_input: - X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) else: query_is_train = True X = self._fit_X diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 636f577e85..4a5072fd80 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -177,6 +177,23 @@ def _validate_kneighbors_bounds(self, n_neighbors, query_is_train, X): f"n_samples = {X.shape[0]}" # include n_samples for common tests ) + def _kneighbors_validation(self, X, n_neighbors): + """Shared validation for kneighbors method called from sklearnex layer. + + Validates: + - Feature count matches training data if X is provided + - n_neighbors is within valid bounds if provided + """ + # Validate feature count if X is provided + if X is not None: + self._validate_feature_count(X) + + # Validate n_neighbors bounds if provided + if n_neighbors is not None: + # Determine if query is the training set + query_is_train = X is None or (hasattr(self, '_fit_X') and X is self._fit_X) + self._validate_kneighbors_bounds(n_neighbors, query_is_train, X if X is not None else self._fit_X) + def _process_classification_targets(self, y): """Process classification targets and set class-related attributes. 
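# Illustrative sketch (hypothetical helper name and toy values; the real logic lives in
# _validate_n_neighbors, _validate_feature_count and _kneighbors_validation shown in this
# diff): the checks that move from the onedal layer into sklearnex are a positive-integer
# check on n_neighbors, a bound against the training set size, and a feature-count match
# for the query matrix.
from numbers import Integral

import numpy as np


def check_kneighbors_args(n_neighbors, n_features_in, n_samples_fit, X=None):
    """Raise the same kinds of errors the sklearnex-side validators raise."""
    if n_neighbors is not None:
        if not isinstance(n_neighbors, Integral):
            raise TypeError(
                "n_neighbors does not take %s value, enter integer value" % type(n_neighbors)
            )
        if n_neighbors <= 0:
            raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors)
        if n_neighbors > n_samples_fit:
            raise ValueError(
                f"Expected n_neighbors <= n_samples_fit, but n_neighbors = {n_neighbors}, "
                f"n_samples_fit = {n_samples_fit}"
            )
    if X is not None and X.shape[1] != n_features_in:
        raise ValueError(
            f"X has {X.shape[1]} features, but kneighbors is expecting "
            f"{n_features_in} features as input"
        )


check_kneighbors_args(3, n_features_in=4, n_samples_fit=150, X=np.zeros((5, 4)))  # passes silently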
@@ -229,6 +246,8 @@ def _fit_validation(self, X, y=None): if sklearn_check_version("1.2"): self._validate_params() check_feature_names(self, X, reset=True) + # Validate n_neighbors parameter + self._validate_n_neighbors() if self.metric_params is not None and "p" in self.metric_params: if self.p is not None: warnings.warn( diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index 59a2d6f73d..e84a3d6da3 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -143,6 +143,10 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): check_is_fitted(self) if X is not None: check_feature_names(self, X, reset=False) + + # Validate kneighbors parameters (inherited from KNeighborsDispatchingBase) + self._kneighbors_validation(X, n_neighbors) + result = dispatch( self, "kneighbors", diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 27c5ce0e4d..1591a7a744 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -123,6 +123,10 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): check_is_fitted(self) if X is not None: check_feature_names(self, X, reset=False) + + # Validate kneighbors parameters (inherited from KNeighborsDispatchingBase) + self._kneighbors_validation(X, n_neighbors) + result = dispatch( self, "kneighbors", diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index d5851792ac..9fc43a5043 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -82,6 +82,10 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): check_is_fitted(self) if X is not None: check_feature_names(self, X, reset=False) + + # Validate kneighbors parameters (inherited from KNeighborsDispatchingBase) + self._kneighbors_validation(X, n_neighbors) + result = dispatch( self, "kneighbors", From ae590e989da80e8c01411cc7aed4f83261389af5 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 14 Oct 2025 17:12:31 -0700 Subject: [PATCH 39/87] fix: add vlaidation data to rest of the functions --- sklearnex/neighbors/common.py | 2 +- sklearnex/neighbors/knn_classification.py | 13 +++++++++++++ sklearnex/neighbors/knn_regression.py | 9 +++++++++ sklearnex/neighbors/knn_unsupervised.py | 9 +++++++++ 4 files changed, 32 insertions(+), 1 deletion(-) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 4a5072fd80..bfe2d76f49 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -247,7 +247,7 @@ def _fit_validation(self, X, y=None): self._validate_params() check_feature_names(self, X, reset=True) # Validate n_neighbors parameter - self._validate_n_neighbors() + self._validate_n_neighbors(self.n_neighbors) if self.metric_params is not None and "p" in self.metric_params: if self.p is not None: warnings.warn( diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index e84a3d6da3..e18bf38f96 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -208,6 +208,10 @@ def _onedal_fit(self, X, y, queue=None): def _onedal_predict(self, X, queue=None): import sys print(f"DEBUG KNeighborsClassifier._onedal_predict START: X type={type(X)}", file=sys.stderr) + # Validate and convert X (pandas to numpy if needed) + X = validate_data( + self, X, dtype=[np.float64, np.float32], 
accept_sparse="csr", reset=False + ) result = self._onedal_estimator.predict(X, queue=queue) print(f"DEBUG KNeighborsClassifier._onedal_predict END: result type={type(result)}", file=sys.stderr) return result @@ -215,6 +219,10 @@ def _onedal_predict(self, X, queue=None): def _onedal_predict_proba(self, X, queue=None): import sys print(f"DEBUG KNeighborsClassifier._onedal_predict_proba START: X type={type(X)}", file=sys.stderr) + # Validate and convert X (pandas to numpy if needed) + X = validate_data( + self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False + ) result = self._onedal_estimator.predict_proba(X, queue=queue) print(f"DEBUG KNeighborsClassifier._onedal_predict_proba END: result type={type(result)}", file=sys.stderr) return result @@ -224,6 +232,11 @@ def _onedal_kneighbors( ): import sys print(f"DEBUG KNeighborsClassifier._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) + # Validate and convert X (pandas to numpy if needed) + if X is not None: + X = validate_data( + self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False + ) result = self._onedal_estimator.kneighbors( X, n_neighbors, return_distance, queue=queue ) diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 1591a7a744..0461c78faf 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -173,6 +173,10 @@ def _onedal_fit(self, X, y, queue=None): def _onedal_predict(self, X, queue=None): import sys print(f"DEBUG KNeighborsRegressor._onedal_predict START: X type={type(X)}", file=sys.stderr) + # Validate and convert X (pandas to numpy if needed) + X = validate_data( + self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False + ) result = self._onedal_estimator.predict(X, queue=queue) print(f"DEBUG KNeighborsRegressor._onedal_predict END: result type={type(result)}", file=sys.stderr) return result @@ -182,6 +186,11 @@ def _onedal_kneighbors( ): import sys print(f"DEBUG KNeighborsRegressor._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) + # Validate and convert X (pandas to numpy if needed) + if X is not None: + X = validate_data( + self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False + ) result = self._onedal_estimator.kneighbors( X, n_neighbors, return_distance, queue=queue ) diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 9fc43a5043..5ae891b696 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -174,11 +174,20 @@ def _onedal_fit(self, X, y=None, queue=None): print(f"DEBUG NearestNeighbors._onedal_fit END: self._fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) def _onedal_predict(self, X, queue=None): + # Validate and convert X (pandas to numpy if needed) + X = validate_data( + self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False + ) return self._onedal_estimator.predict(X, queue=queue) def _onedal_kneighbors( self, X=None, n_neighbors=None, return_distance=True, queue=None ): + # Validate and convert X (pandas to numpy if needed) + if X is not None: + X = validate_data( + self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False + ) return self._onedal_estimator.kneighbors( X, n_neighbors, return_distance, queue=queue ) From 
0a2850e7c4d50b7e5f47d7582b62c39faefece15 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 14 Oct 2025 17:43:33 -0700 Subject: [PATCH 40/87] fix: fix check n neighbors validation before check is fitted --- sklearnex/neighbors/knn_classification.py | 5 +++++ sklearnex/neighbors/knn_regression.py | 5 +++++ sklearnex/neighbors/knn_unsupervised.py | 5 +++++ 3 files changed, 15 insertions(+) diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index e18bf38f96..e712255693 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -140,6 +140,11 @@ def score(self, X, y, sample_weight=None): def kneighbors(self, X=None, n_neighbors=None, return_distance=True): import sys print(f"DEBUG KNeighborsClassifier.kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) + + # Validate n_neighbors parameter first (before check_is_fitted) + if n_neighbors is not None: + self._validate_n_neighbors(n_neighbors) + check_is_fitted(self) if X is not None: check_feature_names(self, X, reset=False) diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 0461c78faf..f734a61265 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -120,6 +120,11 @@ def score(self, X, y, sample_weight=None): def kneighbors(self, X=None, n_neighbors=None, return_distance=True): import sys print(f"DEBUG KNeighborsRegressor.kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) + + # Validate n_neighbors parameter first (before check_is_fitted) + if n_neighbors is not None: + self._validate_n_neighbors(n_neighbors) + check_is_fitted(self) if X is not None: check_feature_names(self, X, reset=False) diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 5ae891b696..caeb435ab7 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -79,6 +79,11 @@ def fit(self, X, y=None): @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): print(f"DEBUG NearestNeighbors.kneighbors START: X type={type(X)}, _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + + # Validate n_neighbors parameter first (before check_is_fitted) + if n_neighbors is not None: + self._validate_n_neighbors(n_neighbors) + check_is_fitted(self) if X is not None: check_feature_names(self, X, reset=False) From 24bd02da0161fd57e8088af890850f77d38a194f Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 14 Oct 2025 18:17:42 -0700 Subject: [PATCH 41/87] fix: fix when predict(none) is called by adding x is not none check --- onedal/neighbors/neighbors.py | 24 ++++++++++++----------- sklearnex/neighbors/knn_classification.py | 18 +++++++++-------- sklearnex/neighbors/knn_regression.py | 9 +++++---- sklearnex/neighbors/knn_unsupervised.py | 9 +++++---- 4 files changed, 33 insertions(+), 27 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index bc68f4a8ab..f0a299d134 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -359,17 +359,19 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors += 1 n_samples_fit = self.n_samples_fit_ - if n_neighbors > n_samples_fit: - if query_is_train: - n_neighbors -= 1 # ok to modify inplace because an error is 
raised - inequality_str = "n_neighbors < n_samples_fit" - else: - inequality_str = "n_neighbors <= n_samples_fit" - raise ValueError( - f"Expected {inequality_str}, but " - f"n_neighbors = {n_neighbors}, n_samples_fit = {n_samples_fit}, " - f"n_samples = {X.shape[0]}" # include n_samples for common tests - ) + # REFACTOR: n_neighbors bounds validation commented out - should be done in sklearnex layer + # Original validation code kept for reference: + # if n_neighbors > n_samples_fit: + # if query_is_train: + # n_neighbors -= 1 # ok to modify inplace because an error is raised + # inequality_str = "n_neighbors < n_samples_fit" + # else: + # inequality_str = "n_neighbors <= n_samples_fit" + # raise ValueError( + # f"Expected {inequality_str}, but " + # f"n_neighbors = {n_neighbors}, n_samples_fit = {n_samples_fit}, " + # f"n_samples = {X.shape[0]}" # include n_samples for common tests + # ) chunked_results = None method = self._parse_auto_method( diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index e712255693..aee6211970 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -213,10 +213,11 @@ def _onedal_fit(self, X, y, queue=None): def _onedal_predict(self, X, queue=None): import sys print(f"DEBUG KNeighborsClassifier._onedal_predict START: X type={type(X)}", file=sys.stderr) - # Validate and convert X (pandas to numpy if needed) - X = validate_data( - self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False - ) + # Validate and convert X (pandas to numpy if needed) only if X is not None + if X is not None: + X = validate_data( + self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False + ) result = self._onedal_estimator.predict(X, queue=queue) print(f"DEBUG KNeighborsClassifier._onedal_predict END: result type={type(result)}", file=sys.stderr) return result @@ -224,10 +225,11 @@ def _onedal_predict(self, X, queue=None): def _onedal_predict_proba(self, X, queue=None): import sys print(f"DEBUG KNeighborsClassifier._onedal_predict_proba START: X type={type(X)}", file=sys.stderr) - # Validate and convert X (pandas to numpy if needed) - X = validate_data( - self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False - ) + # Validate and convert X (pandas to numpy if needed) only if X is not None + if X is not None: + X = validate_data( + self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False + ) result = self._onedal_estimator.predict_proba(X, queue=queue) print(f"DEBUG KNeighborsClassifier._onedal_predict_proba END: result type={type(result)}", file=sys.stderr) return result diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index f734a61265..d381bec497 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -178,10 +178,11 @@ def _onedal_fit(self, X, y, queue=None): def _onedal_predict(self, X, queue=None): import sys print(f"DEBUG KNeighborsRegressor._onedal_predict START: X type={type(X)}", file=sys.stderr) - # Validate and convert X (pandas to numpy if needed) - X = validate_data( - self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False - ) + # Validate and convert X (pandas to numpy if needed) only if X is not None + if X is not None: + X = validate_data( + self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False + ) result = self._onedal_estimator.predict(X, queue=queue) print(f"DEBUG 
KNeighborsRegressor._onedal_predict END: result type={type(result)}", file=sys.stderr) return result diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index caeb435ab7..b7c60c0979 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -179,10 +179,11 @@ def _onedal_fit(self, X, y=None, queue=None): print(f"DEBUG NearestNeighbors._onedal_fit END: self._fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) def _onedal_predict(self, X, queue=None): - # Validate and convert X (pandas to numpy if needed) - X = validate_data( - self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False - ) + # Validate and convert X (pandas to numpy if needed) only if X is not None + if X is not None: + X = validate_data( + self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False + ) return self._onedal_estimator.predict(X, queue=queue) def _onedal_kneighbors( From 27023225f00848d0f6ff4bcf552d37d04a645040 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 14 Oct 2025 21:13:34 -0700 Subject: [PATCH 42/87] fix: fix lof --- sklearnex/neighbors/_lof.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/sklearnex/neighbors/_lof.py b/sklearnex/neighbors/_lof.py index e86d0f2b4f..7a47f25ffb 100644 --- a/sklearnex/neighbors/_lof.py +++ b/sklearnex/neighbors/_lof.py @@ -29,7 +29,7 @@ from sklearnex.neighbors.knn_unsupervised import NearestNeighbors from ..utils._array_api import get_namespace -from ..utils.validation import check_feature_names +from ..utils.validation import check_feature_names, validate_data @control_n_jobs(decorated_methods=["fit", "kneighbors", "_kneighbors"]) @@ -58,6 +58,12 @@ def _onedal_fit(self, X, y, queue=None): if sklearn_check_version("1.2"): self._validate_params() + # REFACTOR: Use validate_data from sklearnex.utils.validation to convert pandas to numpy + X = validate_data( + self, X, dtype=[np.float64, np.float32], accept_sparse="csr" + ) + print(f"DEBUG: After validate_data, X type={type(X)}", file=sys.stderr) + print(f"DEBUG LocalOutlierFactor._onedal_fit: Calling _onedal_knn_fit", file=sys.stderr) self._onedal_knn_fit(X, y, queue=queue) @@ -166,9 +172,18 @@ def fit_predict(self, X, y=None): def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): import sys print(f"DEBUG LocalOutlierFactor._kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) + + # Validate n_neighbors parameter first (before check_is_fitted) + if n_neighbors is not None: + self._validate_n_neighbors(n_neighbors) + check_is_fitted(self) if X is not None: check_feature_names(self, X, reset=False) + + # Validate kneighbors parameters (inherited from KNeighborsDispatchingBase) + self._kneighbors_validation(X, n_neighbors) + result = dispatch( self, "kneighbors", @@ -192,6 +207,13 @@ def score_samples(self, X): import sys print(f"DEBUG LocalOutlierFactor.score_samples START: X type={type(X)}", file=sys.stderr) check_is_fitted(self) + + # Validate and convert X (pandas to numpy if needed) + X = validate_data( + self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False + ) + + check_feature_names(self, X, reset=False) distances_X, neighbors_indices_X = self._kneighbors( X, n_neighbors=self.n_neighbors_ From 965389e4aeb1464465afa6898c77bfcc532342bb Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 14 Oct 2025 21:29:22 -0700 Subject: [PATCH 43/87] 
fix: add validation in kneighbors for lof

---
 onedal/neighbors/neighbors.py           |  2 +-
 sklearnex/neighbors/knn_unsupervised.py | 13 +++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py
index f0a299d134..b75ecf7bcb 100755
--- a/onedal/neighbors/neighbors.py
+++ b/onedal/neighbors/neighbors.py
@@ -359,7 +359,7 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True):
             n_neighbors += 1

         n_samples_fit = self.n_samples_fit_
-        # REFACTOR: n_neighbors bounds validation commented out - should be done in sklearnex layer
+        # REFACTOR: n_neighbors bounds validation moved to sklearnex layer (_onedal_kneighbors)
         # Original validation code kept for reference:
         # if n_neighbors > n_samples_fit:
diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py
index b7c60c0979..55456be602 100755
--- a/sklearnex/neighbors/knn_unsupervised.py
+++ b/sklearnex/neighbors/knn_unsupervised.py
@@ -194,6 +194,19 @@ def _onedal_kneighbors(
             X = validate_data(
                 self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False
             )
+
+        # REFACTOR: Validate n_neighbors bounds when X=None (query_is_train case)
+        # When X=None, oneDAL will add +1 to n_neighbors internally to account for the sample itself
+        # We need to check this BEFORE calling oneDAL to provide proper error messages
+        if X is None and n_neighbors is not None:
+            # oneDAL will add +1, so validate n_neighbors + 1 against n_samples_fit
+            if n_neighbors + 1 > self.n_samples_fit_:
+                raise ValueError(
+                    f"Expected n_neighbors < n_samples_fit, but "
+                    f"n_neighbors = {n_neighbors}, n_samples_fit = {self.n_samples_fit_}, "
+                    f"n_samples = {self.n_samples_fit_}"
+                )
+
         return self._onedal_estimator.kneighbors(
             X, n_neighbors, return_distance, queue=queue
         )

From 5b8b091bf71c4b96288801b1f600d17155bb36d9 Mon Sep 17 00:00:00 2001
From: yuejiaointel
Date: Tue, 14 Oct 2025 22:04:24 -0700
Subject: [PATCH 44/87] fix: remove count validation in onedal

---
 onedal/neighbors/neighbors.py | 117 +++++++++++++++++++---------------
 1 file changed, 65 insertions(+), 52 deletions(-)

diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py
index 9bceb589a1..5c2468c7e0 100755
--- a/onedal/neighbors/neighbors.py
+++ b/onedal/neighbors/neighbors.py
@@ -77,34 +77,34 @@ def infer(self, *args, **kwargs): ...
     @abstractmethod
     def _onedal_fit(self, X, y): ...

-    def _validate_data(
-        self, X, y=None, reset=True, validate_separately=None, **check_params
-    ):
-        if y is None:
-            if self.requires_y:
-                raise ValueError(
-                    f"This {self.__class__.__name__} estimator "
-                    f"requires y to be passed, but the target y is None."
- ) - X = _check_array(X, **check_params) - out = X, y - else: - if validate_separately: - # We need this because some estimators validate X and y - # separately, and in general, separately calling _check_array() - # on X and y isn't equivalent to just calling _check_X_y() - # :( - check_X_params, check_y_params = validate_separately - X = _check_array(X, **check_X_params) - y = _check_array(y, **check_y_params) - else: - X, y = _check_X_y(X, y, **check_params) - out = X, y - - if check_params.get("ensure_2d", True): - _check_n_features(self, X, reset=reset) - - return out + # def _validate_data( + # self, X, y=None, reset=True, validate_separately=None, **check_params + # ): + # if y is None: + # if self.requires_y: + # raise ValueError( + # f"This {self.__class__.__name__} estimator " + # f"requires y to be passed, but the target y is None." + # ) + # X = _check_array(X, **check_params) + # out = X, y + # else: + # if validate_separately: + # # We need this because some estimators validate X and y + # # separately, and in general, separately calling _check_array() + # # on X and y isn't equivalent to just calling _check_X_y() + # # :( + # check_X_params, check_y_params = validate_separately + # X = _check_array(X, **check_X_params) + # y = _check_array(y, **check_y_params) + # else: + # X, y = _check_X_y(X, y, **check_params) + # out = X, y + + # if check_params.get("ensure_2d", True): + # _check_n_features(self, X, reset=reset) + + # return out def _get_weights(self, dist, weights): if weights in (None, "uniform"): @@ -487,21 +487,28 @@ def fit(self, X, y, queue=None): @supports_queue def predict(self, X, queue=None): print(f"DEBUG KNeighborsClassifier.predict START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - use_raw_input = _get_config().get("use_raw_input", False) is True - if not use_raw_input: - X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + + # REFACTOR: _check_array validation commented out - should be done in sklearnex layer + # Original validation code kept for reference: + # use_raw_input = _get_config().get("use_raw_input", False) is True + # if not use_raw_input: + # X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + onedal_model = getattr(self, "_onedal_model", None) n_features = getattr(self, "n_features_in_", None) n_samples_fit_ = getattr(self, "n_samples_fit_", None) - shape = getattr(X, "shape", None) - if n_features and shape and len(shape) > 1 and shape[1] != n_features: - raise ValueError( - ( - f"X has {X.shape[1]} features, " - f"but KNNClassifier is expecting " - f"{n_features} features as input" - ) - ) + + # REFACTOR: Feature count validation commented out - should be done in sklearnex layer + # Original validation code kept for reference: + # shape = getattr(X, "shape", None) + # if n_features and shape and len(shape) > 1 and shape[1] != n_features: + # raise ValueError( + # ( + # f"X has {X.shape[1]} features, " + # f"but KNNClassifier is expecting " + # f"{n_features} features as input" + # ) + # ) _check_is_fitted(self) @@ -641,21 +648,27 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None) return self._kneighbors(X, n_neighbors, return_distance) def _predict_gpu(self, X): - use_raw_input = _get_config().get("use_raw_input", False) is True - if not use_raw_input: - X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + # REFACTOR: _check_array validation commented out - should be done in sklearnex layer + # Original 
validation code kept for reference: + # use_raw_input = _get_config().get("use_raw_input", False) is True + # if not use_raw_input: + # X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + onedal_model = getattr(self, "_onedal_model", None) n_features = getattr(self, "n_features_in_", None) n_samples_fit_ = getattr(self, "n_samples_fit_", None) - shape = getattr(X, "shape", None) - if n_features and shape and len(shape) > 1 and shape[1] != n_features: - raise ValueError( - ( - f"X has {X.shape[1]} features, " - f"but KNNClassifier is expecting " - f"{n_features} features as input" - ) - ) + + # REFACTOR: Feature count validation commented out - should be done in sklearnex layer + # Original validation code kept for reference: + # shape = getattr(X, "shape", None) + # if n_features and shape and len(shape) > 1 and shape[1] != n_features: + # raise ValueError( + # ( + # f"X has {X.shape[1]} features, " + # f"but KNNClassifier is expecting " + # f"{n_features} features as input" + # ) + # ) _check_is_fitted(self) From 5e54b86520068b6b68f41d397fb5a9c3435e639b Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 14 Oct 2025 22:52:31 -0700 Subject: [PATCH 45/87] fix: refactor shape --- onedal/neighbors/neighbors.py | 24 +++++++++++++++----- sklearnex/neighbors/common.py | 16 ++++++++++++-- sklearnex/neighbors/knn_classification.py | 2 ++ sklearnex/neighbors/knn_regression.py | 27 +++++++++++++++++++++++ 4 files changed, 62 insertions(+), 7 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index 9bceb589a1..5c2468c7e0 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -106,7 +106,13 @@ def _onedal_fit(self, X, y): ... # return out + # TODO FUTURE REFACTORING: This method should not be in onedal layer + # The entire predict_proba and _predict_skl implementations should be moved to sklearnex layer + # Then _get_weights can be removed from onedal entirely (it already exists in sklearnex/neighbors/common.py) + # For now keeping it here to avoid circular dependency issues def _get_weights(self, dist, weights): + # REFACTOR NOTE: Weight parameter validation (raise ValueError) should be in sklearnex + # But keeping entire method here temporarily until predict_proba/predict_skl are moved to sklearnex if weights in (None, "uniform"): return None if weights == "distance": @@ -199,7 +205,11 @@ def _fit(self, X, y): print(f"DEBUG oneDAL _fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", file=sys.stderr) self._onedal_model = None self._tree = None - self._shape = None + # REFACTOR: Shape processing moved to sklearnex layer + # _shape should be set by _process_classification_targets or _process_regression_targets in sklearnex + # self._shape = None + if not hasattr(self, '_shape'): + self._shape = None # REFACTOR STEP 1: Don't reset classes_ - it may have been set by sklearnex layer # self.classes_ = None if not hasattr(self, 'classes_'): @@ -214,13 +224,15 @@ def _fit(self, X, y): # Original code kept for reference: # use_raw_input = _get_config().get("use_raw_input", False) is True if y is not None or self.requires_y: - shape = getattr(y, "shape", None) + # REFACTOR: Shape processing commented out - should be done in sklearnex layer + # Original code kept for reference: + # shape = getattr(y, "shape", None) # REFACTOR: _validate_data call commented out - validation now happens in sklearnex layer # if not use_raw_input: # X, y = super()._validate_data( # X, y, 
dtype=[np.float64, np.float32], accept_sparse="csr" # ) - self._shape = shape if shape is not None else y.shape + # self._shape = shape if shape is not None else y.shape # REFACTOR: Classification target processing moved to sklearnex layer # This code is now commented out - processing MUST happen in sklearnex before calling fit @@ -295,8 +307,10 @@ def _fit(self, X, y): result = self._onedal_fit(X, _fit_y) print(f"DEBUG oneDAL _fit: After _onedal_fit, self._fit_X type={type(self._fit_X)}, shape={getattr(self._fit_X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - if y is not None and _is_regressor(self): - self._y = y if self._shape is None else xp.reshape(y, self._shape) + # REFACTOR: Shape-based y reshaping commented out - y should already be properly shaped by sklearnex + # Original code kept for reference: + # if y is not None and _is_regressor(self): + # self._y = y if self._shape is None else xp.reshape(y, self._shape) self._onedal_model = result result = self diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index bfe2d76f49..a3ac9e573c 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -235,11 +235,23 @@ def _process_classification_targets(self, y): return y def _process_regression_targets(self, y): - """Process regression targets and set shape-related attributes.""" - # Handle shape processing for regression + """Process regression targets and set shape-related attributes. + + REFACTOR: This replicates the EXACT shape processing that was in onedal _fit. + Original onedal code: + shape = getattr(y, "shape", None) + self._shape = shape if shape is not None else y.shape + # (later, after fit) + self._y = y if self._shape is None else xp.reshape(y, self._shape) + + For now, just store _shape and _y as-is. The reshape happens after onedal fit is complete. 
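        Toy illustration (made-up shapes, standalone numpy only) of the round trip this
        bookkeeping enables:

            import numpy as np

            y = np.arange(6).reshape(3, 2)        # multi-output regression targets
            shape = getattr(y, "shape", None)     # remembered as _shape -> (3, 2)
            y_flat = np.asarray(y, dtype=np.float64).ravel()
            restored = np.reshape(y_flat, shape)  # what the post-fit reshape recovers
            assert restored.shape == (3, 2)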
+ """ + import sys + # EXACT replication of original onedal shape processing shape = getattr(y, "shape", None) self._shape = shape if shape is not None else y.shape self._y = y + print(f"DEBUG _process_regression_targets: _y type={type(self._y)}, _shape={self._shape}", file=sys.stderr) return y def _fit_validation(self, X, y=None): diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index aee6211970..c4bd18668b 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -199,8 +199,10 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator.classes_ = self.classes_ self._onedal_estimator._y = self._y self._onedal_estimator.outputs_2d_ = self.outputs_2d_ + self._onedal_estimator._shape = self._shape # Pass shape from sklearnex print(f"DEBUG: Set onedal_estimator.classes_={self._onedal_estimator.classes_}", file=sys.stderr) print(f"DEBUG: Set onedal_estimator._y shape={self._onedal_estimator._y.shape}", file=sys.stderr) + print(f"DEBUG: Set onedal_estimator._shape={self._onedal_estimator._shape}", file=sys.stderr) print(f"DEBUG KNeighborsClassifier._onedal_fit: Calling onedal_estimator.fit with X and original y", file=sys.stderr) # Pass original y to onedal - it will use the pre-set classes_ and _y attributes we just assigned diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index d381bec497..97ade06caa 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -156,6 +156,12 @@ def _onedal_fit(self, X, y, queue=None): ) print(f"DEBUG: After validate_data, X type={type(X)}, y type={type(y)}", file=sys.stderr) + # REFACTOR: Process regression targets in sklearnex before passing to onedal + # This sets _shape and _y attributes + print(f"DEBUG: Processing regression targets in sklearnex", file=sys.stderr) + y_processed = self._process_regression_targets(y) + print(f"DEBUG: After _process_regression_targets, _shape={self._shape}, _y type={type(self._y)}", file=sys.stderr) + onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, @@ -168,11 +174,32 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ + + # REFACTOR: Pass pre-processed shape and _y to onedal + self._onedal_estimator._shape = self._shape + self._onedal_estimator._y = self._y + print(f"DEBUG: Set onedal_estimator._shape={self._onedal_estimator._shape}", file=sys.stderr) + print(f"DEBUG KNeighborsRegressor._onedal_fit: Calling onedal_estimator.fit", file=sys.stderr) self._onedal_estimator.fit(X, y, queue=queue) print(f"DEBUG KNeighborsRegressor._onedal_fit: After fit, calling _save_attributes", file=sys.stderr) self._save_attributes() + + # REFACTOR: Replicate the EXACT post-fit reshaping from original onedal code + # Original onedal code (after fit): + # if y is not None and _is_regressor(self): + # _, xp, _ = _get_sycl_namespace(X) + # self._y = y if self._shape is None else xp.reshape(y, self._shape) + # Now doing this in sklearnex layer + from ..utils._array_api import get_namespace + if y is not None: + xp, _ = get_namespace(y) + self._y = y if self._shape is None else xp.reshape(y, self._shape) + # Also update the onedal estimator's _y since that's what gets used in predict + self._onedal_estimator._y = self._y + print(f"DEBUG: After 
reshape, self._y type={type(self._y)}, shape={getattr(self._y, 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f"DEBUG KNeighborsRegressor._onedal_fit END: self._fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) def _onedal_predict(self, X, queue=None): From b16ecc8d740ec7556993aa59a633f70a06132a65 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Wed, 15 Oct 2025 15:56:59 -0700 Subject: [PATCH 46/87] refactor: neighbors processing logic to skleranex --- onedal/neighbors/neighbors.py | 116 +++++++++++++--------- sklearnex/neighbors/common.py | 95 ++++++++++++++++++ sklearnex/neighbors/knn_classification.py | 15 ++- sklearnex/neighbors/knn_regression.py | 15 ++- sklearnex/neighbors/knn_unsupervised.py | 30 +++--- 5 files changed, 196 insertions(+), 75 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index 5c2468c7e0..c6519860e3 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -363,16 +363,24 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): # # returned, which is removed later # n_neighbors += 1 - if X is not None: - query_is_train = False - else: - query_is_train = True + # REFACTOR: query_is_train handling moved to sklearnex layer + # All post-processing now happens in sklearnex._kneighbors_post_processing() + # Original code kept for reference: + # if X is not None: + # query_is_train = False + # else: + # query_is_train = True + # X = self._fit_X + # # Include an extra neighbor to account for the sample itself being + # # returned, which is removed later + # n_neighbors += 1 + + # REFACTOR: onedal now just returns raw results, sklearnex does all processing + # Following PCA pattern: simple onedal layer + if X is None: X = self._fit_X - # Include an extra neighbor to account for the sample itself being - # returned, which is removed later - n_neighbors += 1 - n_samples_fit = self.n_samples_fit_ + # n_samples_fit = self.n_samples_fit_ # REFACTOR: n_neighbors bounds validation moved to sklearnex layer (_onedal_kneighbors) # Original validation code kept for reference: # if n_neighbors > n_samples_fit: @@ -387,62 +395,74 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): # f"n_samples = {X.shape[0]}" # include n_samples for common tests # ) - chunked_results = None + # chunked_results = None method = self._parse_auto_method( self._fit_method, self.n_samples_fit_, n_features ) + # REFACTOR: Following PCA pattern - onedal just calls backend and returns raw results + # All post-processing (kd_tree sorting, removing self, etc.) 
moved to sklearnex params = super()._get_onedal_params(X, n_neighbors=n_neighbors) prediction_results = self._onedal_predict(self._onedal_model, X, params) distances = from_table(prediction_results.distances) indices = from_table(prediction_results.indices) - if method == "kd_tree": - for i in range(distances.shape[0]): - seq = distances[i].argsort() - indices[i] = indices[i][seq] - distances[i] = distances[i][seq] + # REFACTOR: kd_tree sorting moved to sklearnex._kneighbors_post_processing() + # Original code kept for reference: + # if method == "kd_tree": + # for i in range(distances.shape[0]): + # seq = distances[i].argsort() + # indices[i] = indices[i][seq] + # distances[i] = distances[i][seq] if return_distance: results = distances, indices else: results = indices - if chunked_results is not None: - if return_distance: - neigh_dist, neigh_ind = zip(*chunked_results) - results = np.vstack(neigh_dist), np.vstack(neigh_ind) - else: - results = np.vstack(chunked_results) - - if not query_is_train: - return results - - # If the query data is the same as the indexed data, we would like - # to ignore the first nearest neighbor of every sample, i.e - # the sample itself. - if return_distance: - neigh_dist, neigh_ind = results - else: - neigh_ind = results - - n_queries, _ = X.shape - sample_range = np.arange(n_queries)[:, None] - sample_mask = neigh_ind != sample_range - - # Corner case: When the number of duplicates are more - # than the number of neighbors, the first NN will not - # be the sample, but a duplicate. - # In that case mask the first duplicate. - dup_gr_nbrs = np.all(sample_mask, axis=1) - sample_mask[:, 0][dup_gr_nbrs] = False - - neigh_ind = np.reshape(neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) + # REFACTOR: chunked_results vstack moved to sklearnex (was dead code anyway) + # Original code kept for reference: + # if chunked_results is not None: + # if return_distance: + # neigh_dist, neigh_ind = zip(*chunked_results) + # results = np.vstack(neigh_dist), np.vstack(neigh_ind) + # else: + # results = np.vstack(chunked_results) - if return_distance: - neigh_dist = np.reshape(neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) - return neigh_dist, neigh_ind - return neigh_ind + # REFACTOR: Removing self from results moved to sklearnex._kneighbors_post_processing() + # All query_is_train post-processing now in sklearnex layer + # Original code kept for reference: + # if not query_is_train: + # return results + # + # # If the query data is the same as the indexed data, we would like + # # to ignore the first nearest neighbor of every sample, i.e + # # the sample itself. + # if return_distance: + # neigh_dist, neigh_ind = results + # else: + # neigh_ind = results + # + # n_queries, _ = X.shape + # sample_range = np.arange(n_queries)[:, None] + # sample_mask = neigh_ind != sample_range + # + # # Corner case: When the number of duplicates are more + # # than the number of neighbors, the first NN will not + # # be the sample, but a duplicate. + # # In that case mask the first duplicate. 
+ # dup_gr_nbrs = np.all(sample_mask, axis=1) + # sample_mask[:, 0][dup_gr_nbrs] = False + # + # neigh_ind = np.reshape(neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) + # + # if return_distance: + # neigh_dist = np.reshape(neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) + # return neigh_dist, neigh_ind + # return neigh_ind + + # Return raw results - sklearnex will do all post-processing + return results class KNeighborsClassifier(NeighborsBase, ClassifierMixin): diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index a3ac9e573c..d11e1831a3 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -194,6 +194,101 @@ def _kneighbors_validation(self, X, n_neighbors): query_is_train = X is None or (hasattr(self, '_fit_X') and X is self._fit_X) self._validate_kneighbors_bounds(n_neighbors, query_is_train, X if X is not None else self._fit_X) + def _prepare_kneighbors_inputs(self, X, n_neighbors): + """Prepare inputs for kneighbors call to onedal backend. + + Handles query_is_train case: when X=None, sets X to training data and adds +1 to n_neighbors. + + Args: + X: Query data or None + n_neighbors: Number of neighbors or None + + Returns: + Tuple of (X, n_neighbors, query_is_train) + - X: Processed query data (self._fit_X if original X was None) + - n_neighbors: Adjusted n_neighbors (includes +1 if query_is_train) + - query_is_train: Boolean flag indicating if original X was None + """ + query_is_train = X is None + + if X is not None: + # Validate and convert X (pandas to numpy if needed) + X = validate_data( + self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False + ) + else: + X = self._fit_X + # Include an extra neighbor to account for the sample itself being + # returned, which is removed later + if n_neighbors is None: + n_neighbors = self.n_neighbors + n_neighbors += 1 + + return X, n_neighbors, query_is_train + + def _kneighbors_post_processing(self, X, n_neighbors, return_distance, result, query_is_train): + """Shared post-processing for kneighbors results. + + Following PCA pattern: all post-processing in sklearnex, onedal returns raw results. 
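        A toy sketch (made-up distances, plain numpy) of the kd_tree re-sorting this
        method applies to each row of the raw backend output:

            import numpy as np

            distances = np.array([[0.9, 0.1, 0.5]])
            indices = np.array([[7, 3, 5]])
            order = distances[0].argsort()        # [1, 2, 0]
            indices[0] = indices[0][order]        # [3, 5, 7]
            distances[0] = distances[0][order]    # [0.1, 0.5, 0.9]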
+ + Handles: + - query_is_train case (X=None): removes self from results + - kd_tree sorting: sorts results by distance + + Args: + X: Query data (self._fit_X if query_is_train) + n_neighbors: Number of neighbors (already includes +1 if query_is_train) + return_distance: Whether distances are included in result + result: Raw result from onedal backend (distances, indices) or just indices + query_is_train: Boolean indicating if original X was None + + Returns: + Post-processed result in same format as input result + """ + # POST-PROCESSING: kd_tree sorting (moved from onedal) + if self._fit_method == "kd_tree": + if return_distance: + distances, indices = result + for i in range(distances.shape[0]): + seq = distances[i].argsort() + indices[i] = indices[i][seq] + distances[i] = distances[i][seq] + result = distances, indices + else: + indices = result + # For indices-only, we still need to sort but we don't have distances + # In this case, indices should already be sorted by onedal + pass + + # POST-PROCESSING: Remove self from results when query_is_train (moved from onedal) + if query_is_train: + if return_distance: + neigh_dist, neigh_ind = result + else: + neigh_ind = result + + # X is self._fit_X in query_is_train case (set by caller) + n_queries, _ = X.shape + sample_range = np.arange(n_queries)[:, None] + sample_mask = neigh_ind != sample_range + + # Corner case: When the number of duplicates are more + # than the number of neighbors, the first NN will not + # be the sample, but a duplicate. + # In that case mask the first duplicate. + dup_gr_nbrs = np.all(sample_mask, axis=1) + sample_mask[:, 0][dup_gr_nbrs] = False + + neigh_ind = np.reshape(neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) + + if return_distance: + neigh_dist = np.reshape(neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) + result = neigh_dist, neigh_ind + else: + result = neigh_ind + + return result + def _process_classification_targets(self, y): """Process classification targets and set class-related attributes. 
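# Illustrative toy run (plain numpy, hypothetical values) of the self-removal step that
# _kneighbors_post_processing performs when the query set is the training set: each row's
# own index is masked out and the extra (+1) neighbor column is dropped.
import numpy as np

neigh_ind = np.array([[0, 2, 1],
                      [1, 0, 2],
                      [2, 1, 0]])
neigh_dist = np.array([[0.0, 0.5, 0.7],
                       [0.0, 0.4, 0.9],
                       [0.0, 0.3, 0.8]])

n_queries = neigh_ind.shape[0]
sample_range = np.arange(n_queries)[:, None]
sample_mask = neigh_ind != sample_range      # True where the neighbor is not the query itself
dup_gr_nbrs = np.all(sample_mask, axis=1)    # rows where the query never appears (all duplicates)
sample_mask[:, 0][dup_gr_nbrs] = False       # then mask the first column instead

k = neigh_ind.shape[1] - 1                   # remove the extra neighbor added for "self"
print(neigh_ind[sample_mask].reshape(n_queries, k))   # [[2 1] [0 2] [1 0]]
print(neigh_dist[sample_mask].reshape(n_queries, k))  # [[0.5 0.7] [0.4 0.9] [0.3 0.8]]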
diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index c4bd18668b..5868ca5d45 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -241,14 +241,19 @@ def _onedal_kneighbors( ): import sys print(f"DEBUG KNeighborsClassifier._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) - # Validate and convert X (pandas to numpy if needed) - if X is not None: - X = validate_data( - self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False - ) + + # REFACTOR: All post-processing now in sklearnex following PCA pattern + # Prepare inputs and handle query_is_train case + X, n_neighbors, query_is_train = self._prepare_kneighbors_inputs(X, n_neighbors) + + # Get raw results from onedal backend result = self._onedal_estimator.kneighbors( X, n_neighbors, return_distance, queue=queue ) + + # Apply post-processing (kd_tree sorting, removing self from results) + result = self._kneighbors_post_processing(X, n_neighbors, return_distance, result, query_is_train) + print(f"DEBUG KNeighborsClassifier._onedal_kneighbors END: result type={type(result)}", file=sys.stderr) return result diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 97ade06caa..146cc817f0 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -219,14 +219,19 @@ def _onedal_kneighbors( ): import sys print(f"DEBUG KNeighborsRegressor._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) - # Validate and convert X (pandas to numpy if needed) - if X is not None: - X = validate_data( - self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False - ) + + # REFACTOR: All post-processing now in sklearnex following PCA pattern + # Prepare inputs and handle query_is_train case + X, n_neighbors, query_is_train = self._prepare_kneighbors_inputs(X, n_neighbors) + + # Get raw results from onedal backend result = self._onedal_estimator.kneighbors( X, n_neighbors, return_distance, queue=queue ) + + # Apply post-processing (kd_tree sorting, removing self from results) + result = self._kneighbors_post_processing(X, n_neighbors, return_distance, result, query_is_train) + print(f"DEBUG KNeighborsRegressor._onedal_kneighbors END: result type={type(result)}", file=sys.stderr) return result diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 55456be602..17c1604a9c 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -189,27 +189,23 @@ def _onedal_predict(self, X, queue=None): def _onedal_kneighbors( self, X=None, n_neighbors=None, return_distance=True, queue=None ): - # Validate and convert X (pandas to numpy if needed) - if X is not None: - X = validate_data( - self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False - ) + import sys + print(f"DEBUG NearestNeighbors._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) - # REFACTOR: Validate n_neighbors bounds when X=None (query_is_train case) - # When X=None, oneDAL will add +1 to n_neighbors internally to account for the sample itself - # We need to check this BEFORE calling oneDAL to provide proper error messages - if X is None and n_neighbors is not None: - # oneDAL will add +1, 
so validate n_neighbors + 1 against n_samples_fit - if n_neighbors + 1 > self.n_samples_fit_: - raise ValueError( - f"Expected n_neighbors < n_samples_fit, but " - f"n_neighbors = {n_neighbors}, n_samples_fit = {self.n_samples_fit_}, " - f"n_samples = {self.n_samples_fit_}" - ) + # REFACTOR: All post-processing now in sklearnex following PCA pattern + # Prepare inputs and handle query_is_train case + X, n_neighbors, query_is_train = self._prepare_kneighbors_inputs(X, n_neighbors) - return self._onedal_estimator.kneighbors( + # Get raw results from onedal backend + result = self._onedal_estimator.kneighbors( X, n_neighbors, return_distance, queue=queue ) + + # Apply post-processing (kd_tree sorting, removing self from results) + result = self._kneighbors_post_processing(X, n_neighbors, return_distance, result, query_is_train) + + print(f"DEBUG NearestNeighbors._onedal_kneighbors END: result type={type(result)}", file=sys.stderr) + return result def _save_attributes(self): print(f"DEBUG NearestNeighbors._save_attributes START: onedal_estimator._fit_X type={type(getattr(self._onedal_estimator, '_fit_X', 'NOT_SET'))}", file=sys.stderr) From 8c89422eef98898d70b807b3d7ad449438b0e57c Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Wed, 15 Oct 2025 16:50:14 -0700 Subject: [PATCH 47/87] fix: validationeighbors < samples after +1 --- sklearnex/neighbors/common.py | 12 ++++++++++++ sklearnex/neighbors/knn_unsupervised.py | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index d11e1831a3..bae321d2ce 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -198,6 +198,7 @@ def _prepare_kneighbors_inputs(self, X, n_neighbors): """Prepare inputs for kneighbors call to onedal backend. Handles query_is_train case: when X=None, sets X to training data and adds +1 to n_neighbors. + Validates n_neighbors bounds AFTER adding +1 (replicates original onedal behavior). 
Args: X: Query data or None @@ -223,6 +224,17 @@ def _prepare_kneighbors_inputs(self, X, n_neighbors): if n_neighbors is None: n_neighbors = self.n_neighbors n_neighbors += 1 + + # Validate bounds AFTER adding +1 (replicates original onedal behavior) + # Original code in onedal had validation after n_neighbors += 1 + n_samples_fit = self.n_samples_fit_ + if n_neighbors > n_samples_fit: + n_neighbors_for_msg = n_neighbors - 1 # for error message, show original value + raise ValueError( + f"Expected n_neighbors < n_samples_fit, but " + f"n_neighbors = {n_neighbors_for_msg}, n_samples_fit = {n_samples_fit}, " + f"n_samples = {X.shape[0]}" + ) return X, n_neighbors, query_is_train diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 17c1604a9c..7d28ff0bb0 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -193,7 +193,7 @@ def _onedal_kneighbors( print(f"DEBUG NearestNeighbors._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) # REFACTOR: All post-processing now in sklearnex following PCA pattern - # Prepare inputs and handle query_is_train case + # Prepare inputs and handle query_is_train case (includes validation AFTER +=1) X, n_neighbors, query_is_train = self._prepare_kneighbors_inputs(X, n_neighbors) # Get raw results from onedal backend From 273a0844cd45cd0deb3d66bcc1fbb0e06cc53a3d Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Wed, 15 Oct 2025 18:07:43 -0700 Subject: [PATCH 48/87] fix: fix assertion error --- onedal/neighbors/neighbors.py | 16 ++++-- sklearnex/neighbors/common.py | 100 ++++++++++++++++++---------------- 2 files changed, 65 insertions(+), 51 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index c6519860e3..59a5c0bbac 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -401,7 +401,7 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): ) # REFACTOR: Following PCA pattern - onedal just calls backend and returns raw results - # All post-processing (kd_tree sorting, removing self, etc.) 
moved to sklearnex + # All post-processing (kd_tree sorting, removing self, return_distance decision) moved to sklearnex params = super()._get_onedal_params(X, n_neighbors=n_neighbors) prediction_results = self._onedal_predict(self._onedal_model, X, params) distances = from_table(prediction_results.distances) @@ -415,10 +415,16 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): # indices[i] = indices[i][seq] # distances[i] = distances[i][seq] - if return_distance: - results = distances, indices - else: - results = indices + # REFACTOR: return_distance decision moved to sklearnex._kneighbors_post_processing() + # onedal always returns both distances and indices (backend always computes both) + # Original code kept for reference: + # if return_distance: + # results = distances, indices + # else: + # results = indices + + # Always return both - sklearnex will decide what to return to user + results = distances, indices # REFACTOR: chunked_results vstack moved to sklearnex (was dead code anyway) # Original code kept for reference: diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index bae321d2ce..fc56141001 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -242,64 +242,72 @@ def _kneighbors_post_processing(self, X, n_neighbors, return_distance, result, q """Shared post-processing for kneighbors results. Following PCA pattern: all post-processing in sklearnex, onedal returns raw results. + Replicates exact logic from main branch onedal._kneighbors() method. - Handles: - - query_is_train case (X=None): removes self from results - - kd_tree sorting: sorts results by distance + Handles (in order, matching main branch): + 1. kd_tree sorting: sorts results by distance (BEFORE deciding what to return) + 2. query_is_train case (X=None): removes self from results + 3. 
return_distance decision: return distances+indices or just indices Args: X: Query data (self._fit_X if query_is_train) n_neighbors: Number of neighbors (already includes +1 if query_is_train) - return_distance: Whether distances are included in result - result: Raw result from onedal backend (distances, indices) or just indices + return_distance: Whether to return distances to user + result: Raw result from onedal backend - always (distances, indices) query_is_train: Boolean indicating if original X was None Returns: - Post-processed result in same format as input result + Post-processed result: (distances, indices) if return_distance else indices """ - # POST-PROCESSING: kd_tree sorting (moved from onedal) + # onedal always returns both distances and indices (backend computes both) + distances, indices = result + + # POST-PROCESSING STEP 1: kd_tree sorting (moved from onedal) + # This happens BEFORE deciding what to return, using distances that are always available + # Matches main branch: sorting uses distances even when return_distance=False if self._fit_method == "kd_tree": - if return_distance: - distances, indices = result - for i in range(distances.shape[0]): - seq = distances[i].argsort() - indices[i] = indices[i][seq] - distances[i] = distances[i][seq] - result = distances, indices - else: - indices = result - # For indices-only, we still need to sort but we don't have distances - # In this case, indices should already be sorted by onedal - pass + for i in range(distances.shape[0]): + seq = distances[i].argsort() + indices[i] = indices[i][seq] + distances[i] = distances[i][seq] - # POST-PROCESSING: Remove self from results when query_is_train (moved from onedal) - if query_is_train: - if return_distance: - neigh_dist, neigh_ind = result - else: - neigh_ind = result - - # X is self._fit_X in query_is_train case (set by caller) - n_queries, _ = X.shape - sample_range = np.arange(n_queries)[:, None] - sample_mask = neigh_ind != sample_range - - # Corner case: When the number of duplicates are more - # than the number of neighbors, the first NN will not - # be the sample, but a duplicate. - # In that case mask the first duplicate. - dup_gr_nbrs = np.all(sample_mask, axis=1) - sample_mask[:, 0][dup_gr_nbrs] = False - - neigh_ind = np.reshape(neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) - - if return_distance: - neigh_dist = np.reshape(neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) - result = neigh_dist, neigh_ind - else: - result = neigh_ind + # POST-PROCESSING STEP 2: Decide what to return (moved from onedal) + # This happens AFTER kd_tree sorting + if return_distance: + results = distances, indices + else: + results = indices + + # POST-PROCESSING STEP 3: Remove self from results when query_is_train (moved from onedal) + # This happens LAST, after sorting and after deciding format + if not query_is_train: + return results + + # If the query data is the same as the indexed data, we would like + # to ignore the first nearest neighbor of every sample, i.e the sample itself. + if return_distance: + neigh_dist, neigh_ind = results + else: + neigh_ind = results + + # X is self._fit_X in query_is_train case (set by caller) + n_queries, _ = X.shape + sample_range = np.arange(n_queries)[:, None] + sample_mask = neigh_ind != sample_range + + # Corner case: When the number of duplicates are more + # than the number of neighbors, the first NN will not + # be the sample, but a duplicate. + # In that case mask the first duplicate. 
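+ # For example, fitting on four identical points with n_neighbors=3 here
+ # (2 requested + 1 for the query itself), the backend may legally return
+ # row [1, 2, 3] for query 0; index 0 is then absent from the row, so the
+ # first zero-distance duplicate is the column that gets dropped instead.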
+ dup_gr_nbrs = np.all(sample_mask, axis=1) + sample_mask[:, 0][dup_gr_nbrs] = False + + neigh_ind = np.reshape(neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) - return result + if return_distance: + neigh_dist = np.reshape(neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) + return neigh_dist, neigh_ind + return neigh_ind def _process_classification_targets(self, y): """Process classification targets and set class-related attributes. From 35afada9a31615cfd260071adbfdbde10d86d921 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Wed, 15 Oct 2025 23:19:16 -0700 Subject: [PATCH 49/87] fix: fix asswertion error by dispatch gpu/skl in sklearnex --- sklearnex/neighbors/common.py | 83 +++++++++++++++++++++++ sklearnex/neighbors/knn_classification.py | 16 +++-- sklearnex/neighbors/knn_regression.py | 40 ++++++++++- 3 files changed, 131 insertions(+), 8 deletions(-) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index fc56141001..d95a4cec1f 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -124,6 +124,89 @@ def _get_weights(self, dist, weights): "weights not recognized: should be 'uniform', " "'distance', or a callable function" ) + + def _compute_weighted_prediction(self, neigh_dist, neigh_ind, weights_param, y_train): + """Compute weighted prediction for regression. + + Args: + neigh_dist: Distances to neighbors + neigh_ind: Indices of neighbors + weights_param: Weight parameter ('uniform', 'distance', or callable) + y_train: Training target values + + Returns: + Predicted values + """ + weights = self._get_weights(neigh_dist, weights_param) + + _y = y_train + if _y.ndim == 1: + _y = _y.reshape((-1, 1)) + + if weights is None: + y_pred = np.mean(_y[neigh_ind], axis=1) + else: + y_pred = np.empty((neigh_ind.shape[0], _y.shape[1]), dtype=np.float64) + denom = np.sum(weights, axis=1) + + for j in range(_y.shape[1]): + num = np.sum(_y[neigh_ind, j] * weights, axis=1) + y_pred[:, j] = num / denom + + if y_train.ndim == 1: + y_pred = y_pred.ravel() + + return y_pred + + def _compute_class_probabilities(self, neigh_dist, neigh_ind, weights_param, y_train, classes, outputs_2d): + """Compute class probabilities for classification. 
+ + Args: + neigh_dist: Distances to neighbors + neigh_ind: Indices of neighbors + weights_param: Weight parameter ('uniform', 'distance', or callable) + y_train: Encoded training labels + classes: Class labels + outputs_2d: Whether output is 2D (multi-output) + + Returns: + Class probabilities + """ + from ..utils.validation import _num_samples + + _y = y_train + classes_ = classes + if not outputs_2d: + _y = y_train.reshape((-1, 1)) + classes_ = [classes] + + n_queries = neigh_ind.shape[0] + + weights = self._get_weights(neigh_dist, weights_param) + if weights is None: + weights = np.ones_like(neigh_ind) + + all_rows = np.arange(n_queries) + probabilities = [] + for k, classes_k in enumerate(classes_): + pred_labels = _y[:, k][neigh_ind] + proba_k = np.zeros((n_queries, classes_k.size)) + + # a simple ':' index doesn't work right + for i, idx in enumerate(pred_labels.T): # loop is O(n_neighbors) + proba_k[all_rows, idx] += weights[:, i] + + # normalize 'votes' into real [0,1] probabilities + normalizer = proba_k.sum(axis=1)[:, np.newaxis] + normalizer[normalizer == 0.0] = 1.0 + proba_k /= normalizer + + probabilities.append(proba_k) + + if not outputs_2d: + probabilities = probabilities[0] + + return probabilities def _validate_targets(self, y, dtype): arr = _column_or_1d(y, warn=True) diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index 5868ca5d45..2cbac6923f 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -227,12 +227,16 @@ def _onedal_predict(self, X, queue=None): def _onedal_predict_proba(self, X, queue=None): import sys print(f"DEBUG KNeighborsClassifier._onedal_predict_proba START: X type={type(X)}", file=sys.stderr) - # Validate and convert X (pandas to numpy if needed) only if X is not None - if X is not None: - X = validate_data( - self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False - ) - result = self._onedal_estimator.predict_proba(X, queue=queue) + + # Call kneighbors through sklearnex (self.kneighbors is the sklearnex method) + # This properly handles X=None case (LOOCV) with query_is_train logic + neigh_dist, neigh_ind = self.kneighbors(X) + + # Use the helper method to compute class probabilities + result = self._compute_class_probabilities( + neigh_dist, neigh_ind, self.weights, self._y, self.classes_, self.outputs_2d_ + ) + print(f"DEBUG KNeighborsClassifier._onedal_predict_proba END: result type={type(result)}", file=sys.stderr) return result diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 146cc817f0..59122fac7f 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -205,13 +205,49 @@ def _onedal_fit(self, X, y, queue=None): def _onedal_predict(self, X, queue=None): import sys print(f"DEBUG KNeighborsRegressor._onedal_predict START: X type={type(X)}", file=sys.stderr) + + # Dispatch between GPU and SKL prediction methods + # This logic matches onedal regressor predict() method but computation happens in sklearnex + gpu_device = queue is not None and getattr(queue.sycl_device, "is_gpu", False) + is_uniform_weights = getattr(self, "weights", "uniform") == "uniform" + + if gpu_device and is_uniform_weights: + # GPU path: call onedal backend directly + result = self._predict_gpu(X, queue=queue) + else: + # SKL path: call kneighbors (through sklearnex) then compute in sklearnex + result = self._predict_skl(X, queue=queue) + + print(f"DEBUG 
KNeighborsRegressor._onedal_predict END: result type={type(result)}", file=sys.stderr) + return result + + def _predict_gpu(self, X, queue=None): + """GPU prediction path - validates X and calls onedal backend.""" + import sys + print(f"DEBUG KNeighborsRegressor._predict_gpu START: X type={type(X)}", file=sys.stderr) # Validate and convert X (pandas to numpy if needed) only if X is not None if X is not None: X = validate_data( self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False ) - result = self._onedal_estimator.predict(X, queue=queue) - print(f"DEBUG KNeighborsRegressor._onedal_predict END: result type={type(result)}", file=sys.stderr) + # Call onedal backend for GPU prediction + result = self._onedal_estimator._predict_gpu(X) + print(f"DEBUG KNeighborsRegressor._predict_gpu END: result type={type(result)}", file=sys.stderr) + return result + + def _predict_skl(self, X, queue=None): + """SKL prediction path - calls kneighbors through sklearnex, computes prediction here.""" + import sys + print(f"DEBUG KNeighborsRegressor._predict_skl START: X type={type(X)}", file=sys.stderr) + + # Call kneighbors through sklearnex (self.kneighbors is the sklearnex method) + # This properly handles X=None case (LOOCV) with query_is_train logic + neigh_dist, neigh_ind = self.kneighbors(X) + + # Use the helper method to compute weighted prediction + result = self._compute_weighted_prediction(neigh_dist, neigh_ind, self.weights, self._y) + + print(f"DEBUG KNeighborsRegressor._predict_skl END: result type={type(result)}", file=sys.stderr) return result def _onedal_kneighbors( From 8cccb1dab64229b9b3ac2a638a66cae8afa6c955 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Thu, 16 Oct 2025 15:25:40 -0700 Subject: [PATCH 50/87] refacor: onedal prediciton entirely to sklearnex --- onedal/neighbors/neighbors.py | 320 +++++++++------------- sklearnex/neighbors/common.py | 43 ++- sklearnex/neighbors/knn_classification.py | 11 +- sklearnex/neighbors/knn_regression.py | 8 +- 4 files changed, 184 insertions(+), 198 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index 59a5c0bbac..fa7259fea2 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -106,42 +106,39 @@ def _onedal_fit(self, X, y): ... 
# return out - # TODO FUTURE REFACTORING: This method should not be in onedal layer - # The entire predict_proba and _predict_skl implementations should be moved to sklearnex layer - # Then _get_weights can be removed from onedal entirely (it already exists in sklearnex/neighbors/common.py) - # For now keeping it here to avoid circular dependency issues - def _get_weights(self, dist, weights): - # REFACTOR NOTE: Weight parameter validation (raise ValueError) should be in sklearnex - # But keeping entire method here temporarily until predict_proba/predict_skl are moved to sklearnex - if weights in (None, "uniform"): - return None - if weights == "distance": - # if user attempts to classify a point that was zero distance from one - # or more training points, those training points are weighted as 1.0 - # and the other points as 0.0 - if dist.dtype is np.dtype(object): - for point_dist_i, point_dist in enumerate(dist): - # check if point_dist is iterable - # (ex: RadiusNeighborClassifier.predict may set an element of - # dist to 1e-6 to represent an 'outlier') - if hasattr(point_dist, "__contains__") and 0.0 in point_dist: - dist[point_dist_i] = point_dist == 0.0 - else: - dist[point_dist_i] = 1.0 / point_dist - else: - with np.errstate(divide="ignore"): - dist = 1.0 / dist - inf_mask = np.isinf(dist) - inf_row = np.any(inf_mask, axis=1) - dist[inf_row] = inf_mask[inf_row] - return dist - elif callable(weights): - return weights(dist) - else: - raise ValueError( - "weights not recognized: should be 'uniform', " - "'distance', or a callable function" - ) + # REFACTOR: _get_weights moved to sklearnex/neighbors/common.py + # All prediction logic now in sklearnex layer, so this method is no longer needed in onedal + # Original code kept for reference only + # def _get_weights(self, dist, weights): + # if weights in (None, "uniform"): + # return None + # if weights == "distance": + # # if user attempts to classify a point that was zero distance from one + # # or more training points, those training points are weighted as 1.0 + # # and the other points as 0.0 + # if dist.dtype is np.dtype(object): + # for point_dist_i, point_dist in enumerate(dist): + # # check if point_dist is iterable + # # (ex: RadiusNeighborClassifier.predict may set an element of + # # dist to 1e-6 to represent an 'outlier') + # if hasattr(point_dist, "__contains__") and 0.0 in point_dist: + # dist[point_dist_i] = point_dist == 0.0 + # else: + # dist[point_dist_i] = 1.0 / point_dist + # else: + # with np.errstate(divide="ignore"): + # dist = 1.0 / dist + # inf_mask = np.isinf(dist) + # inf_row = np.any(inf_mask, axis=1) + # dist[inf_row] = inf_mask[inf_row] + # return dist + # elif callable(weights): + # return weights(dist) + # else: + # raise ValueError( + # "weights not recognized: should be 'uniform', " + # "'distance', or a callable function" + # ) def _get_onedal_params(self, X, y=None, n_neighbors=None): class_count = 0 if self.classes_ is None else len(self.classes_) @@ -333,7 +330,7 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): # ) # Still need n_features for _parse_auto_method call later - n_features = getattr(self, "n_features_in_", None) + # n_features = getattr(self, "n_features_in_", None) _check_is_fitted(self) @@ -396,9 +393,9 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): # ) # chunked_results = None - method = self._parse_auto_method( - self._fit_method, self.n_samples_fit_, n_features - ) + # method = self._parse_auto_method( + # self._fit_method, 
self.n_samples_fit_, n_features + # ) # REFACTOR: Following PCA pattern - onedal just calls backend and returns raw results # All post-processing (kd_tree sorting, removing self, return_distance decision) moved to sklearnex @@ -524,92 +521,100 @@ def _onedal_predict(self, model, X, params): def fit(self, X, y, queue=None): return self._fit(X, y) - @supports_queue - def predict(self, X, queue=None): - print(f"DEBUG KNeighborsClassifier.predict START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - - # REFACTOR: _check_array validation commented out - should be done in sklearnex layer - # Original validation code kept for reference: - # use_raw_input = _get_config().get("use_raw_input", False) is True - # if not use_raw_input: - # X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) - - onedal_model = getattr(self, "_onedal_model", None) - n_features = getattr(self, "n_features_in_", None) - n_samples_fit_ = getattr(self, "n_samples_fit_", None) - - # REFACTOR: Feature count validation commented out - should be done in sklearnex layer - # Original validation code kept for reference: - # shape = getattr(X, "shape", None) - # if n_features and shape and len(shape) > 1 and shape[1] != n_features: - # raise ValueError( - # ( - # f"X has {X.shape[1]} features, " - # f"but KNNClassifier is expecting " - # f"{n_features} features as input" - # ) - # ) - - _check_is_fitted(self) - - self._fit_method = self._parse_auto_method( - self.algorithm, n_samples_fit_, n_features - ) - - # REFACTOR NOTE: _validate_n_classes() is now called during fit in sklearnex layer - # No need to validate again during predict - # self._validate_n_classes() - - params = self._get_onedal_params(X) - prediction_result = self._onedal_predict(onedal_model, X, params) - responses = from_table(prediction_result.responses) - - result = self.classes_.take(np.asarray(responses.ravel(), dtype=np.intp)) - print(f"DEBUG KNeighborsClassifier.predict END: result type={type(result)}", file=sys.stderr) - return result - - @supports_queue - def predict_proba(self, X, queue=None): - print(f"DEBUG KNeighborsClassifier.predict_proba START: X type={type(X)}", file=sys.stderr) - neigh_dist, neigh_ind = self.kneighbors(X, queue=queue) - - classes_ = self.classes_ - _y = self._y - if not self.outputs_2d_: - _y = self._y.reshape((-1, 1)) - classes_ = [self.classes_] - - n_queries = _num_samples(X) - - print(f"DEBUG predict_proba: Calling _get_weights", file=sys.stderr) - weights = self._get_weights(neigh_dist, self.weights) - if weights is None: - print(f"DEBUG predict_proba: weights is None, using ones_like", file=sys.stderr) - weights = np.ones_like(neigh_ind) - else: - print(f"DEBUG predict_proba: weights calculated, type={type(weights)}", file=sys.stderr) - - all_rows = np.arange(n_queries) - probabilities = [] - for k, classes_k in enumerate(classes_): - pred_labels = _y[:, k][neigh_ind] - proba_k = np.zeros((n_queries, classes_k.size)) - - # a simple ':' index doesn't work right - for i, idx in enumerate(pred_labels.T): # loop is O(n_neighbors) - proba_k[all_rows, idx] += weights[:, i] - - # normalize 'votes' into real [0,1] probabilities - normalizer = proba_k.sum(axis=1)[:, np.newaxis] - normalizer[normalizer == 0.0] = 1.0 - proba_k /= normalizer - - probabilities.append(proba_k) - - if not self.outputs_2d_: - probabilities = probabilities[0] - - return probabilities + # REFACTOR: All prediction logic moved to sklearnex layer + # predict() and predict_proba() are no longer used - 
sklearnex calls kneighbors() and computes predictions + # Original code kept for reference only + # @supports_queue + # def predict(self, X, queue=None): + # print(f"DEBUG KNeighborsClassifier.predict START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + # + # # REFACTOR: _check_array validation commented out - should be done in sklearnex layer + # # Original validation code kept for reference: + # # use_raw_input = _get_config().get("use_raw_input", False) is True + # # if not use_raw_input: + # # X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + # + # onedal_model = getattr(self, "_onedal_model", None) + # n_features = getattr(self, "n_features_in_", None) + # n_samples_fit_ = getattr(self, "n_samples_fit_", None) + # + # # REFACTOR: Feature count validation commented out - should be done in sklearnex layer + # # Original validation code kept for reference: + # # shape = getattr(X, "shape", None) + # # if n_features and shape and len(shape) > 1 and shape[1] != n_features: + # # raise ValueError( + # # ( + # # f"X has {X.shape[1]} features, " + # # f"but KNNClassifier is expecting " + # # f"{n_features} features as input" + # # ) + # # ) + # + # _check_is_fitted(self) + # + # self._fit_method = self._parse_auto_method( + # self.algorithm, n_samples_fit_, n_features + # ) + # + # # REFACTOR NOTE: _validate_n_classes() is now called during fit in sklearnex layer + # # No need to validate again during predict + # # self._validate_n_classes() + # + # # Handle X=None case (LOOCV pattern) - use training data + # # This is needed because _get_onedal_params expects X to have .dtype attribute + # if X is None: + # X = self._fit_X + # + # params = self._get_onedal_params(X) + # prediction_result = self._onedal_predict(onedal_model, X, params) + # responses = from_table(prediction_result.responses) + # + # result = self.classes_.take(np.asarray(responses.ravel(), dtype=np.intp)) + # print(f"DEBUG KNeighborsClassifier.predict END: result type={type(result)}", file=sys.stderr) + # return result + # + # @supports_queue + # def predict_proba(self, X, queue=None): + # print(f"DEBUG KNeighborsClassifier.predict_proba START: X type={type(X)}", file=sys.stderr) + # neigh_dist, neigh_ind = self.kneighbors(X, queue=queue) + # + # classes_ = self.classes_ + # _y = self._y + # if not self.outputs_2d_: + # _y = self._y.reshape((-1, 1)) + # classes_ = [self.classes_] + # + # n_queries = _num_samples(X) + # + # print(f"DEBUG predict_proba: Calling _get_weights", file=sys.stderr) + # weights = self._get_weights(neigh_dist, self.weights) + # if weights is None: + # print(f"DEBUG predict_proba: weights is None, using ones_like", file=sys.stderr) + # weights = np.ones_like(neigh_ind) + # else: + # print(f"DEBUG predict_proba: weights calculated, type={type(weights)}", file=sys.stderr) + # + # all_rows = np.arange(n_queries) + # probabilities = [] + # for k, classes_k in enumerate(classes_): + # pred_labels = _y[:, k][neigh_ind] + # proba_k = np.zeros((n_queries, classes_k.size)) + # + # # a simple ':' index doesn't work right + # for i, idx in enumerate(pred_labels.T): # loop is O(n_neighbors) + # proba_k[all_rows, idx] += weights[:, i] + # + # # normalize 'votes' into real [0,1] probabilities + # normalizer = proba_k.sum(axis=1)[:, np.newaxis] + # normalizer[normalizer == 0.0] = 1.0 + # proba_k /= normalizer + # + # probabilities.append(proba_k) + # + # if not self.outputs_2d_: + # probabilities = probabilities[0] + # + # return probabilities @supports_queue 
def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): @@ -687,28 +692,14 @@ def fit(self, X, y, queue=None): def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): return self._kneighbors(X, n_neighbors, return_distance) + # REFACTOR: Keep _predict_gpu for GPU backend support (called by sklearnex) + # This is the ONLY prediction method needed in onedal - it calls the backend directly + # All computation logic (weights, averaging, etc.) is in sklearnex def _predict_gpu(self, X): - # REFACTOR: _check_array validation commented out - should be done in sklearnex layer - # Original validation code kept for reference: - # use_raw_input = _get_config().get("use_raw_input", False) is True - # if not use_raw_input: - # X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) - + # REFACTOR: Validation commented out - should be done in sklearnex layer before calling this onedal_model = getattr(self, "_onedal_model", None) n_features = getattr(self, "n_features_in_", None) n_samples_fit_ = getattr(self, "n_samples_fit_", None) - - # REFACTOR: Feature count validation commented out - should be done in sklearnex layer - # Original validation code kept for reference: - # shape = getattr(X, "shape", None) - # if n_features and shape and len(shape) > 1 and shape[1] != n_features: - # raise ValueError( - # ( - # f"X has {X.shape[1]} features, " - # f"but KNNClassifier is expecting " - # f"{n_features} features as input" - # ) - # ) _check_is_fitted(self) @@ -724,47 +715,6 @@ def _predict_gpu(self, X): return result - def _predict_skl(self, X): - print(f"DEBUG KNeighborsRegressor._predict_skl START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - neigh_dist, neigh_ind = self.kneighbors(X) - - print(f"DEBUG _predict_skl: Calling _get_weights", file=sys.stderr) - weights = self._get_weights(neigh_dist, self.weights) - print(f"DEBUG _predict_skl: weights result={type(weights) if weights is not None else 'None'}", file=sys.stderr) - - _y = self._y - if _y.ndim == 1: - _y = _y.reshape((-1, 1)) - - if weights is None: - y_pred = np.mean(_y[neigh_ind], axis=1) - else: - y_pred = np.empty((X.shape[0], _y.shape[1]), dtype=np.float64) - denom = np.sum(weights, axis=1) - - for j in range(_y.shape[1]): - num = np.sum(_y[neigh_ind, j] * weights, axis=1) - y_pred[:, j] = num / denom - - if self._y.ndim == 1: - y_pred = y_pred.ravel() - - print(f"DEBUG KNeighborsRegressor._predict_skl END: y_pred type={type(y_pred)}", file=sys.stderr) - return y_pred - - @supports_queue - def predict(self, X, queue=None): - print(f"DEBUG KNeighborsRegressor.predict START: X type={type(X)}, queue={queue}", file=sys.stderr) - gpu_device = queue is not None and getattr(queue.sycl_device, "is_gpu", False) - is_uniform_weights = getattr(self, "weights", "uniform") == "uniform" - print(f"DEBUG KNeighborsRegressor.predict: gpu_device={gpu_device}, is_uniform_weights={is_uniform_weights}", file=sys.stderr) - if gpu_device and is_uniform_weights: - result = self._predict_gpu(X) - else: - result = self._predict_skl(X) - print(f"DEBUG KNeighborsRegressor.predict END: result type={type(result)}", file=sys.stderr) - return result - class NearestNeighbors(NeighborsBase): def __init__( diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index d95a4cec1f..6799858738 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -207,6 +207,45 @@ def _compute_class_probabilities(self, neigh_dist, neigh_ind, 
weights_param, y_t probabilities = probabilities[0] return probabilities + + def _predict_skl_regression(self, X): + """SKL prediction path for regression - calls kneighbors, computes predictions. + + This method handles X=None (LOOCV) properly by calling self.kneighbors which + has the query_is_train logic. + + Args: + X: Query samples (or None for LOOCV) + Returns: + Predicted regression values + """ + neigh_dist, neigh_ind = self.kneighbors(X) + return self._compute_weighted_prediction( + neigh_dist, neigh_ind, self.weights, self._y + ) + + def _predict_skl_classification(self, X): + """SKL prediction path for classification - calls kneighbors, computes predictions. + + This method handles X=None (LOOCV) properly by calling self.kneighbors which + has the query_is_train logic. + + Args: + X: Query samples (or None for LOOCV) + Returns: + Predicted class labels + """ + neigh_dist, neigh_ind = self.kneighbors(X) + proba = self._compute_class_probabilities( + neigh_dist, neigh_ind, self.weights, self._y, self.classes_, self.outputs_2d_ + ) + if not self.outputs_2d_: + result = self.classes_[np.argmax(proba, axis=1)] + else: + result = [classes_k[np.argmax(proba_k, axis=1)] + for classes_k, proba_k in zip(self.classes_, proba.T)] + result = np.array(result).T + return result def _validate_targets(self, y, dtype): arr = _column_or_1d(y, warn=True) @@ -486,8 +525,10 @@ def _fit_validation(self, X, y=None): self.effective_metric_ = "chebyshev" if not isinstance(X, (KDTree, BallTree, _sklearn_NeighborsBase)): + # Don't validate for finite values here - this is just for shape/algorithm determination + # Actual validation happens in _onedal_fit (via validate_data) if onedal is used self._fit_X = _check_array( - X, dtype=[np.float64, np.float32], accept_sparse=True + X, dtype=[np.float64, np.float32], accept_sparse=True, force_all_finite=False ) self.n_samples_fit_ = _num_samples(self._fit_X) self.n_features_in_ = _num_features(self._fit_X) diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index 2cbac6923f..ca12afd8fe 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -215,12 +215,11 @@ def _onedal_fit(self, X, y, queue=None): def _onedal_predict(self, X, queue=None): import sys print(f"DEBUG KNeighborsClassifier._onedal_predict START: X type={type(X)}", file=sys.stderr) - # Validate and convert X (pandas to numpy if needed) only if X is not None - if X is not None: - X = validate_data( - self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False - ) - result = self._onedal_estimator.predict(X, queue=queue) + + # Use the unified helper from common.py (calls kneighbors + computes prediction) + # This properly handles X=None (LOOCV) case + result = self._predict_skl_classification(X) + print(f"DEBUG KNeighborsClassifier._onedal_predict END: result type={type(result)}", file=sys.stderr) return result diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 59122fac7f..01457234b0 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -240,12 +240,8 @@ def _predict_skl(self, X, queue=None): import sys print(f"DEBUG KNeighborsRegressor._predict_skl START: X type={type(X)}", file=sys.stderr) - # Call kneighbors through sklearnex (self.kneighbors is the sklearnex method) - # This properly handles X=None case (LOOCV) with query_is_train logic - neigh_dist, neigh_ind = self.kneighbors(X) - - # Use the helper 
method to compute weighted prediction - result = self._compute_weighted_prediction(neigh_dist, neigh_ind, self.weights, self._y) + # Use the unified helper from common.py (calls kneighbors + computes prediction) + result = self._predict_skl_regression(X) print(f"DEBUG KNeighborsRegressor._predict_skl END: result type={type(result)}", file=sys.stderr) return result From 5e01257f40a9a917b53ab46a278dd6a39eef3812 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Thu, 16 Oct 2025 16:58:55 -0700 Subject: [PATCH 51/87] feature: array api in common.py --- onedal/neighbors/neighbors.py | 2 +- sklearnex/neighbors/common.py | 100 ++++++++++++++++++++++------------ 2 files changed, 66 insertions(+), 36 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index fa7259fea2..e77586a029 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -216,7 +216,7 @@ def _fit(self, X, y): self, "effective_metric_params_", self.metric_params ) - _, xp, _ = _get_sycl_namespace(X) + # _, xp, _ = _get_sycl_namespace(X) # REFACTOR: _validate_data call commented out - validation now happens in sklearnex layer # Original code kept for reference: # use_raw_input = _get_config().get("use_raw_input", False) is True diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 6799858738..fe86b0798b 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -98,10 +98,12 @@ def _get_weights(self, dist, weights): if weights in (None, "uniform"): return None if weights == "distance": + # Array API support: get namespace from dist array + xp, _ = get_namespace(dist) # if user attempts to classify a point that was zero distance from one # or more training points, those training points are weighted as 1.0 # and the other points as 0.0 - if dist.dtype is np.dtype(object): + if dist.dtype is xp.asarray(object).dtype: for point_dist_i, point_dist in enumerate(dist): # check if point_dist is iterable # (ex: RadiusNeighborClassifier.predict may set an element of @@ -111,10 +113,10 @@ def _get_weights(self, dist, weights): else: dist[point_dist_i] = 1.0 / point_dist else: - with np.errstate(divide="ignore"): + with xp.errstate(divide="ignore") if hasattr(xp, 'errstate') else np.errstate(divide="ignore"): dist = 1.0 / dist - inf_mask = np.isinf(dist) - inf_row = np.any(inf_mask, axis=1) + inf_mask = xp.isinf(dist) + inf_row = xp.any(inf_mask, axis=1) dist[inf_row] = inf_mask[inf_row] return dist elif callable(weights): @@ -137,24 +139,27 @@ def _compute_weighted_prediction(self, neigh_dist, neigh_ind, weights_param, y_t Returns: Predicted values """ + # Array API support: get namespace from input arrays + xp, _ = get_namespace(neigh_dist, neigh_ind, y_train) + weights = self._get_weights(neigh_dist, weights_param) _y = y_train if _y.ndim == 1: - _y = _y.reshape((-1, 1)) + _y = xp.reshape(_y, (-1, 1)) if weights is None: - y_pred = np.mean(_y[neigh_ind], axis=1) + y_pred = xp.mean(_y[neigh_ind], axis=1) else: - y_pred = np.empty((neigh_ind.shape[0], _y.shape[1]), dtype=np.float64) - denom = np.sum(weights, axis=1) + y_pred = xp.empty((neigh_ind.shape[0], _y.shape[1]), dtype=xp.float64) + denom = xp.sum(weights, axis=1) for j in range(_y.shape[1]): - num = np.sum(_y[neigh_ind, j] * weights, axis=1) + num = xp.sum(_y[neigh_ind, j] * weights, axis=1) y_pred[:, j] = num / denom if y_train.ndim == 1: - y_pred = y_pred.ravel() + y_pred = xp.reshape(y_pred, (-1,)) return y_pred @@ -174,30 +179,33 @@ def _compute_class_probabilities(self, 
neigh_dist, neigh_ind, weights_param, y_t """ from ..utils.validation import _num_samples + # Array API support: get namespace from input arrays + xp, _ = get_namespace(neigh_dist, neigh_ind, y_train) + _y = y_train classes_ = classes if not outputs_2d: - _y = y_train.reshape((-1, 1)) + _y = xp.reshape(y_train, (-1, 1)) classes_ = [classes] n_queries = neigh_ind.shape[0] weights = self._get_weights(neigh_dist, weights_param) if weights is None: - weights = np.ones_like(neigh_ind) + weights = xp.ones_like(neigh_ind) - all_rows = np.arange(n_queries) + all_rows = xp.arange(n_queries) probabilities = [] for k, classes_k in enumerate(classes_): pred_labels = _y[:, k][neigh_ind] - proba_k = np.zeros((n_queries, classes_k.size)) + proba_k = xp.zeros((n_queries, classes_k.size)) # a simple ':' index doesn't work right for i, idx in enumerate(pred_labels.T): # loop is O(n_neighbors) proba_k[all_rows, idx] += weights[:, i] # normalize 'votes' into real [0,1] probabilities - normalizer = proba_k.sum(axis=1)[:, np.newaxis] + normalizer = xp.sum(proba_k, axis=1)[:, xp.newaxis] normalizer[normalizer == 0.0] = 1.0 proba_k /= normalizer @@ -239,12 +247,17 @@ def _predict_skl_classification(self, X): proba = self._compute_class_probabilities( neigh_dist, neigh_ind, self.weights, self._y, self.classes_, self.outputs_2d_ ) + # Array API support: get namespace from probability array + xp, _ = get_namespace(proba) + if not self.outputs_2d_: - result = self.classes_[np.argmax(proba, axis=1)] + # Single output: classes_[argmax(proba, axis=1)] + result = self.classes_[xp.argmax(proba, axis=1)] else: - result = [classes_k[np.argmax(proba_k, axis=1)] + # Multi-output: apply argmax separately for each output + result = [classes_k[xp.argmax(proba_k, axis=1)] for classes_k, proba_k in zip(self.classes_, proba.T)] - result = np.array(result).T + result = xp.asarray(result).T return result def _validate_targets(self, y, dtype): @@ -381,15 +394,17 @@ def _kneighbors_post_processing(self, X, n_neighbors, return_distance, result, q Returns: Post-processed result: (distances, indices) if return_distance else indices """ + # Array API support: get namespace from result arrays # onedal always returns both distances and indices (backend computes both) distances, indices = result + xp, _ = get_namespace(distances, indices) # POST-PROCESSING STEP 1: kd_tree sorting (moved from onedal) # This happens BEFORE deciding what to return, using distances that are always available # Matches main branch: sorting uses distances even when return_distance=False if self._fit_method == "kd_tree": for i in range(distances.shape[0]): - seq = distances[i].argsort() + seq = xp.argsort(distances[i]) indices[i] = indices[i][seq] distances[i] = distances[i][seq] @@ -414,20 +429,20 @@ def _kneighbors_post_processing(self, X, n_neighbors, return_distance, result, q # X is self._fit_X in query_is_train case (set by caller) n_queries, _ = X.shape - sample_range = np.arange(n_queries)[:, None] + sample_range = xp.arange(n_queries)[:, xp.newaxis] sample_mask = neigh_ind != sample_range # Corner case: When the number of duplicates are more # than the number of neighbors, the first NN will not # be the sample, but a duplicate. # In that case mask the first duplicate. 
- dup_gr_nbrs = np.all(sample_mask, axis=1) + dup_gr_nbrs = xp.all(sample_mask, axis=1) sample_mask[:, 0][dup_gr_nbrs] = False - neigh_ind = np.reshape(neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) + neigh_ind = xp.reshape(neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) if return_distance: - neigh_dist = np.reshape(neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) + neigh_dist = xp.reshape(neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) return neigh_dist, neigh_ind return neigh_ind @@ -439,8 +454,11 @@ def _process_classification_targets(self, y): import sys print(f"DEBUG _process_classification_targets: y type={type(y)}, y shape={getattr(y, 'shape', 'NO_SHAPE')}", file=sys.stderr) + # Array API support: get namespace from y + xp, _ = get_namespace(y) + # y should already be numpy array from validate_data - y = np.asarray(y) + y = xp.asarray(y) # Handle shape processing shape = getattr(y, "shape", None) @@ -448,23 +466,27 @@ def _process_classification_targets(self, y): if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1: self.outputs_2d_ = False - y = y.reshape((-1, 1)) + y = xp.reshape(y, (-1, 1)) else: self.outputs_2d_ = True # Validate classification targets _check_classification_targets(y) - # Process classes + # Process classes - note: np.unique is used for class extraction + # This is acceptable as classes are typically numpy arrays in sklearn self.classes_ = [] - self._y = np.empty(y.shape, dtype=int) + self._y = xp.empty(y.shape, dtype=xp.int32) for k in range(self._y.shape[1]): - classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True) + # Use numpy unique for class extraction (standard sklearn pattern) + y_k = np.asarray(y[:, k]) + classes, indices = np.unique(y_k, return_inverse=True) self.classes_.append(classes) + self._y[:, k] = xp.asarray(indices) if not self.outputs_2d_: self.classes_ = self.classes_[0] - self._y = self._y.ravel() + self._y = xp.reshape(self._y, (-1,)) # Validate we have at least 2 classes self._validate_n_classes() @@ -659,9 +681,13 @@ def _onedal_supported(self, device, method_name, *data): y = None # To check multioutput, might be overhead if len(data) > 1: - y = np.asarray(data[1]) + # Array API support: get namespace from y + y_input = data[1] + xp, _ = get_namespace(y_input) + y = xp.asarray(y_input) if is_classifier: - class_count = len(np.unique(y)) + # Use numpy for unique (standard sklearn pattern) + class_count = len(np.unique(np.asarray(y))) if hasattr(self, "_onedal_estimator"): y = self._onedal_estimator._y if y is not None and hasattr(y, "ndim") and hasattr(y, "shape"): @@ -744,14 +770,18 @@ def kneighbors_graph(self, X=None, n_neighbors=None, mode="connectivity"): # requires moving data to host to construct the csr_matrix if mode == "connectivity": A_ind = self.kneighbors(X, n_neighbors, return_distance=False) + # Array API support: get namespace from A_ind + xp, _ = get_namespace(A_ind) _, (A_ind,) = _transfer_to_host(A_ind) n_queries = A_ind.shape[0] - A_data = np.ones(n_queries * n_neighbors) + A_data = xp.ones((n_queries * n_neighbors,), dtype=xp.float64) elif mode == "distance": A_data, A_ind = self.kneighbors(X, n_neighbors, return_distance=True) + # Array API support: get namespace from A_data + xp, _ = get_namespace(A_data, A_ind) _, (A_data, A_ind) = _transfer_to_host(A_data, A_ind) - A_data = np.reshape(A_data, (-1,)) + A_data = xp.reshape(A_data, (-1,)) else: raise ValueError( @@ -762,10 +792,10 @@ def kneighbors_graph(self, X=None, n_neighbors=None, mode="connectivity"): n_queries = A_ind.shape[0] 
n_samples_fit = self.n_samples_fit_ n_nonzero = n_queries * n_neighbors - A_indptr = np.arange(0, n_nonzero + 1, n_neighbors) + A_indptr = xp.arange(0, n_nonzero + 1, n_neighbors) kneighbors_graph = sp.csr_matrix( - (A_data, np.reshape(A_ind, (-1,)), A_indptr), shape=(n_queries, n_samples_fit) + (A_data, xp.reshape(A_ind, (-1,)), A_indptr), shape=(n_queries, n_samples_fit) ) return kneighbors_graph From 8bec3dc4d6c7607980bf2f97db5a365117756c58 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Fri, 17 Oct 2025 12:20:57 -0700 Subject: [PATCH 52/87] fix: assertion error --- sklearnex/neighbors/common.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index fe86b0798b..1f0ae3574e 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -770,18 +770,18 @@ def kneighbors_graph(self, X=None, n_neighbors=None, mode="connectivity"): # requires moving data to host to construct the csr_matrix if mode == "connectivity": A_ind = self.kneighbors(X, n_neighbors, return_distance=False) - # Array API support: get namespace from A_ind - xp, _ = get_namespace(A_ind) + # Transfer to host - after this, arrays are numpy _, (A_ind,) = _transfer_to_host(A_ind) n_queries = A_ind.shape[0] - A_data = xp.ones((n_queries * n_neighbors,), dtype=xp.float64) + # Use numpy after transfer to host + A_data = np.ones(n_queries * n_neighbors) elif mode == "distance": A_data, A_ind = self.kneighbors(X, n_neighbors, return_distance=True) - # Array API support: get namespace from A_data - xp, _ = get_namespace(A_data, A_ind) + # Transfer to host - after this, arrays are numpy _, (A_data, A_ind) = _transfer_to_host(A_data, A_ind) - A_data = xp.reshape(A_data, (-1,)) + # Use numpy after transfer to host + A_data = np.reshape(A_data, (-1,)) else: raise ValueError( @@ -792,10 +792,11 @@ def kneighbors_graph(self, X=None, n_neighbors=None, mode="connectivity"): n_queries = A_ind.shape[0] n_samples_fit = self.n_samples_fit_ n_nonzero = n_queries * n_neighbors - A_indptr = xp.arange(0, n_nonzero + 1, n_neighbors) + # Use numpy after transfer to host + A_indptr = np.arange(0, n_nonzero + 1, n_neighbors) kneighbors_graph = sp.csr_matrix( - (A_data, xp.reshape(A_ind, (-1,)), A_indptr), shape=(n_queries, n_samples_fit) + (A_data, np.reshape(A_ind, (-1,)), A_indptr), shape=(n_queries, n_samples_fit) ) return kneighbors_graph From bbab97ac74dd91bf43936702e479685b1d76c23b Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Fri, 17 Oct 2025 17:41:46 -0700 Subject: [PATCH 53/87] feature: add array api support to knn skleranex files --- sklearnex/neighbors/knn_classification.py | 9 +++++++-- sklearnex/neighbors/knn_regression.py | 12 +++++++++--- sklearnex/neighbors/knn_unsupervised.py | 11 +++++++++-- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index ca12afd8fe..39bd21551c 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -27,10 +27,11 @@ from onedal.neighbors import KNeighborsClassifier as onedal_KNeighborsClassifier from .._device_offload import dispatch, wrap_output_data +from ..utils._array_api import enable_array_api, get_namespace from ..utils.validation import check_feature_names, validate_data from .common import KNeighborsDispatchingBase - +@enable_array_api @control_n_jobs( decorated_methods=["fit", "predict", "predict_proba", "kneighbors", "score"] ) @@ 
-170,9 +171,13 @@ def _onedal_fit(self, X, y, queue=None): import sys print(f"DEBUG KNeighborsClassifier._onedal_fit START: X type={type(X)}, y type={type(y)}", file=sys.stderr) + # Get array namespace for array API support + xp, _ = get_namespace(X) + print(f"DEBUG: Array namespace: {xp}", file=sys.stderr) + # REFACTOR: Use validate_data from sklearnex.utils.validation to convert pandas to numpy X, y = validate_data( - self, X, y, dtype=[np.float64, np.float32], accept_sparse="csr" + self, X, y, dtype=[xp.float64, xp.float32], accept_sparse="csr" ) print(f"DEBUG: After validate_data, X type={type(X)}, y type={type(y)}", file=sys.stderr) diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 01457234b0..254bce38f8 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -27,10 +27,12 @@ from onedal.neighbors import KNeighborsRegressor as onedal_KNeighborsRegressor from .._device_offload import dispatch, wrap_output_data +from ..utils._array_api import enable_array_api, get_namespace from ..utils.validation import check_feature_names, validate_data from .common import KNeighborsDispatchingBase +@enable_array_api @control_n_jobs(decorated_methods=["fit", "predict", "kneighbors", "score"]) class KNeighborsRegressor(KNeighborsDispatchingBase, _sklearn_KNeighborsRegressor): __doc__ = _sklearn_KNeighborsRegressor.__doc__ @@ -150,9 +152,13 @@ def _onedal_fit(self, X, y, queue=None): import sys print(f"DEBUG KNeighborsRegressor._onedal_fit START: X type={type(X)}, y type={type(y)}", file=sys.stderr) + # Get array namespace for array API support + xp, _ = get_namespace(X) + print(f"DEBUG: Array namespace: {xp}", file=sys.stderr) + # REFACTOR: Use validate_data from sklearnex.utils.validation to convert pandas to numpy for X only X = validate_data( - self, X, dtype=[np.float64, np.float32], accept_sparse="csr" + self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr" ) print(f"DEBUG: After validate_data, X type={type(X)}, y type={type(y)}", file=sys.stderr) @@ -192,7 +198,6 @@ def _onedal_fit(self, X, y, queue=None): # _, xp, _ = _get_sycl_namespace(X) # self._y = y if self._shape is None else xp.reshape(y, self._shape) # Now doing this in sklearnex layer - from ..utils._array_api import get_namespace if y is not None: xp, _ = get_namespace(y) self._y = y if self._shape is None else xp.reshape(y, self._shape) @@ -227,8 +232,9 @@ def _predict_gpu(self, X, queue=None): print(f"DEBUG KNeighborsRegressor._predict_gpu START: X type={type(X)}", file=sys.stderr) # Validate and convert X (pandas to numpy if needed) only if X is not None if X is not None: + xp, _ = get_namespace(X) X = validate_data( - self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False + self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False ) # Call onedal backend for GPU prediction result = self._onedal_estimator._predict_gpu(X) diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 7d28ff0bb0..202dda775e 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -26,10 +26,12 @@ from onedal.neighbors import NearestNeighbors as onedal_NearestNeighbors from .._device_offload import dispatch, wrap_output_data +from ..utils._array_api import enable_array_api, get_namespace from ..utils.validation import check_feature_names, validate_data from .common import KNeighborsDispatchingBase +@enable_array_api 
@control_n_jobs(decorated_methods=["fit", "kneighbors", "radius_neighbors"]) class NearestNeighbors(KNeighborsDispatchingBase, _sklearn_NearestNeighbors): __doc__ = _sklearn_NearestNeighbors.__doc__ @@ -154,9 +156,13 @@ def radius_neighbors_graph( def _onedal_fit(self, X, y=None, queue=None): print(f"DEBUG NearestNeighbors._onedal_fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", file=sys.stderr) + # Get array namespace for array API support + xp, _ = get_namespace(X) + print(f"DEBUG: Array namespace: {xp}", file=sys.stderr) + # REFACTOR: Use validate_data from sklearnex.utils.validation to convert pandas to numpy X = validate_data( - self, X, dtype=[np.float64, np.float32], accept_sparse="csr" + self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr" ) print(f"DEBUG: After validate_data, X type={type(X)}", file=sys.stderr) @@ -181,8 +187,9 @@ def _onedal_fit(self, X, y=None, queue=None): def _onedal_predict(self, X, queue=None): # Validate and convert X (pandas to numpy if needed) only if X is not None if X is not None: + xp, _ = get_namespace(X) X = validate_data( - self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False + self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False ) return self._onedal_estimator.predict(X, queue=queue) From aab0100745acc33dc3493aec9b714caabd9343a4 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Sun, 19 Oct 2025 23:50:47 -0700 Subject: [PATCH 54/87] fix: compatiibilty for array api --- onedal/neighbors/neighbors.py | 39 ++++++++--- sklearnex/neighbors/common.py | 81 ++++++++++++++++++----- sklearnex/neighbors/knn_classification.py | 15 ++++- sklearnex/neighbors/knn_regression.py | 18 ++++- sklearnex/neighbors/knn_unsupervised.py | 6 +- 5 files changed, 126 insertions(+), 33 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index e77586a029..32989289be 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -180,13 +180,17 @@ def __init__( self.p = p self.metric_params = metric_params - def _validate_targets(self, y, dtype): - arr = _column_or_1d(y, warn=True) - - try: - return arr.astype(dtype, copy=False) - except ValueError: - return arr + # REFACTOR: _validate_targets commented out - all data conversion/validation moved to sklearnex layer + # Following PCA pattern: onedal should not do any data type conversion + # The sklearnex layer prepares data in the correct format before calling onedal + # Original code kept for reference: + # def _validate_targets(self, y, dtype): + # arr = _column_or_1d(y, warn=True) + # + # try: + # return arr.astype(dtype, copy=False) + # except ValueError: + # return arr # REFACTOR NOTE: _validate_n_classes moved to sklearnex/neighbors/common.py # This method is no longer used in the onedal layer - all validation happens in sklearnex @@ -299,8 +303,18 @@ def _fit(self, X, y): gpu_device = queue is not None and queue.sycl_device.is_gpu print(f"DEBUG oneDAL _fit: Before _onedal_fit, X type={type(X)}, _fit_y type={type(_fit_y)}", file=sys.stderr) + # REFACTOR: All data preparation including reshaping moved to sklearnex layer + # Following PCA pattern: onedal is a thin wrapper, no data manipulation + # sklearnex prepares self._y in the correct shape before calling fit() + # Original code kept for reference: + # if _is_classifier(self) or (_is_regressor(self) and gpu_device): + # _fit_y = self._validate_targets(self._y, X.dtype).reshape((-1, 1)) + # OR for refactor without 
_validate_targets: + # _fit_y = self._y.reshape((-1, 1)) + + # REFACTOR: Just pass self._y as-is - sklearnex should have already reshaped it if _is_classifier(self) or (_is_regressor(self) and gpu_device): - _fit_y = self._validate_targets(self._y, X.dtype).reshape((-1, 1)) + _fit_y = self._y result = self._onedal_fit(X, _fit_y) print(f"DEBUG oneDAL _fit: After _onedal_fit, self._fit_X type={type(self._fit_X)}, shape={getattr(self._fit_X, 'shape', 'NO_SHAPE')}", file=sys.stderr) @@ -504,8 +518,10 @@ def infer(self, *args, **kwargs): ... def _onedal_fit(self, X, y): # global queue is set as per user configuration (`target_offload`) or from data prior to calling this internal function queue = QM.get_global_queue() - params = self._get_onedal_params(X, y) + # REFACTOR: Convert to table FIRST, then get params from table (following PCA pattern) + # This ensures dtype is normalized (array API dtype -> numpy dtype) X_table, y_table = to_table(X, y, queue=queue) + params = self._get_onedal_params(X_table, y) return self.train(params, X_table, y_table).model def _onedal_predict(self, model, X, params): @@ -746,8 +762,11 @@ def infer(self, *arg, **kwargs): ... def _onedal_fit(self, X, y): # global queue is set as per user configuration (`target_offload`) or from data prior to calling this internal function queue = QM.get_global_queue() + # REFACTOR: Convert to table FIRST, then get params from table (following PCA pattern) + # This ensures dtype is normalized (array API dtype -> numpy dtype) + # Note: NearestNeighbors has no y, so only convert X to avoid y becoming a table + X = to_table(X, queue=queue) params = self._get_onedal_params(X, y) - X, y = to_table(X, y, queue=queue) return self.train(params, X).model def _onedal_predict(self, model, X, params): diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 1f0ae3574e..a2e64a1baa 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -149,14 +149,33 @@ def _compute_weighted_prediction(self, neigh_dist, neigh_ind, weights_param, y_t _y = xp.reshape(_y, (-1, 1)) if weights is None: - y_pred = xp.mean(_y[neigh_ind], axis=1) + # Array API: Use take() per row since array API take() only supports 1-D indices + # Build result by gathering rows one at a time + gathered_list = [] + for i in range(neigh_ind.shape[0]): + # Get indices for this sample's neighbors + sample_indices = neigh_ind[i, ...] # Shape: (n_neighbors,) + # Gather those rows from _y + sample_neighbors = xp.take(_y, sample_indices, axis=0) # Shape: (n_neighbors, n_outputs) + gathered_list.append(sample_neighbors) + # Stack and compute mean + gathered = xp.stack(gathered_list, axis=0) # Shape: (n_samples, n_neighbors, n_outputs) + y_pred = xp.mean(gathered, axis=1) else: y_pred = xp.empty((neigh_ind.shape[0], _y.shape[1]), dtype=xp.float64) denom = xp.sum(weights, axis=1) for j in range(_y.shape[1]): - num = xp.sum(_y[neigh_ind, j] * weights, axis=1) - y_pred[:, j] = num / denom + # Array API: Iterate over samples to gather values + y_col_j = _y[:, j, ...] # Shape: (n_train_samples,) + gathered_vals = [] + for i in range(neigh_ind.shape[0]): + sample_indices = neigh_ind[i, ...] # Shape: (n_neighbors,) + sample_vals = xp.take(y_col_j, sample_indices, axis=0) # Shape: (n_neighbors,) + gathered_vals.append(sample_vals) + gathered_j = xp.stack(gathered_vals, axis=0) # Shape: (n_samples, n_neighbors) + num = xp.sum(gathered_j * weights, axis=1) + y_pred[:, j, ...] 
= num / denom if y_train.ndim == 1: y_pred = xp.reshape(y_pred, (-1,)) @@ -192,17 +211,42 @@ def _compute_class_probabilities(self, neigh_dist, neigh_ind, weights_param, y_t weights = self._get_weights(neigh_dist, weights_param) if weights is None: - weights = xp.ones_like(neigh_ind) + # REFACTOR: Ensure weights is float for array API type promotion + # neigh_ind is int, so ones_like would give int, but we need float + weights = xp.ones_like(neigh_ind, dtype=xp.float64) - all_rows = xp.arange(n_queries) probabilities = [] for k, classes_k in enumerate(classes_): - pred_labels = _y[:, k][neigh_ind] - proba_k = xp.zeros((n_queries, classes_k.size)) + # Get predicted labels for each neighbor: shape (n_samples, n_neighbors) + # _y[:, k] gives training labels for output k, then gather using neigh_ind + y_col_k = _y[:, k, ...] - # a simple ':' index doesn't work right - for i, idx in enumerate(pred_labels.T): # loop is O(n_neighbors) - proba_k[all_rows, idx] += weights[:, i] + # Array API: Use take() with iteration since take() only supports 1-D indices + pred_labels_list = [] + for i in range(neigh_ind.shape[0]): + sample_indices = neigh_ind[i, ...] + sample_labels = xp.take(y_col_k, sample_indices, axis=0) + pred_labels_list.append(sample_labels) + pred_labels = xp.stack(pred_labels_list, axis=0) # Shape: (n_queries, n_neighbors) + + proba_k = xp.zeros((n_queries, classes_k.size), dtype=xp.float64) + + # Array API: Cannot use fancy indexing __setitem__ like proba_k[all_rows, idx] = ... + # Instead, build probabilities sample by sample + proba_list = [] + for sample_idx in range(n_queries): + sample_proba = xp.zeros((classes_k.size,), dtype=xp.float64) + # For this sample, accumulate weights for each neighbor's predicted class + for neighbor_idx in range(pred_labels.shape[1]): + class_label = int(pred_labels[sample_idx, neighbor_idx]) + weight = weights[sample_idx, neighbor_idx] + # Update probability for this class + sample_proba = xp.asarray([ + sample_proba[i] + weight if i == class_label else sample_proba[i] + for i in range(classes_k.size) + ]) + proba_list.append(sample_proba) + proba_k = xp.stack(proba_list, axis=0) # Shape: (n_queries, n_classes) # normalize 'votes' into real [0,1] probabilities normalizer = xp.sum(proba_k, axis=1)[:, xp.newaxis] @@ -258,6 +302,7 @@ def _predict_skl_classification(self, X): result = [classes_k[xp.argmax(proba_k, axis=1)] for classes_k, proba_k in zip(self.classes_, proba.T)] result = xp.asarray(result).T + return result def _validate_targets(self, y, dtype): @@ -348,9 +393,11 @@ def _prepare_kneighbors_inputs(self, X, n_neighbors): query_is_train = X is None if X is not None: - # Validate and convert X (pandas to numpy if needed) - X = validate_data( - self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False + # Get the array namespace to use correct dtypes + xp, _ = get_namespace(X) + # Use _check_array like main branch, with array API dtype support + X = _check_array( + X, dtype=[xp.float64, xp.float32], accept_sparse="csr" ) else: X = self._fit_X @@ -547,10 +594,12 @@ def _fit_validation(self, X, y=None): self.effective_metric_ = "chebyshev" if not isinstance(X, (KDTree, BallTree, _sklearn_NeighborsBase)): - # Don't validate for finite values here - this is just for shape/algorithm determination - # Actual validation happens in _onedal_fit (via validate_data) if onedal is used + # Use _check_array like main branch, but with array API dtype support + # Get array namespace for array API support + # Don't check for NaN - let oneDAL 
handle it (will fallback to sklearn if needed) + xp, _ = get_namespace(X) self._fit_X = _check_array( - X, dtype=[np.float64, np.float32], accept_sparse=True, force_all_finite=False + X, dtype=[xp.float64, xp.float32], accept_sparse=True, force_all_finite=False ) self.n_samples_fit_ = _num_samples(self._fit_X) self.n_features_in_ = _num_features(self._fit_X) diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index 39bd21551c..050957d9e2 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -175,7 +175,8 @@ def _onedal_fit(self, X, y, queue=None): xp, _ = get_namespace(X) print(f"DEBUG: Array namespace: {xp}", file=sys.stderr) - # REFACTOR: Use validate_data from sklearnex.utils.validation to convert pandas to numpy + # REFACTOR: Use validate_data to convert pandas to numpy and validate types + # force_all_finite=False to allow nan_euclidean metric to work (will fallback to sklearn) X, y = validate_data( self, X, y, dtype=[xp.float64, xp.float32], accept_sparse="csr" ) @@ -221,6 +222,13 @@ def _onedal_predict(self, X, queue=None): import sys print(f"DEBUG KNeighborsClassifier._onedal_predict START: X type={type(X)}", file=sys.stderr) + # Validate X to convert array API to numpy + if X is not None: + xp, _ = get_namespace(X) + X = validate_data( + self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False + ) + # Use the unified helper from common.py (calls kneighbors + computes prediction) # This properly handles X=None (LOOCV) case result = self._predict_skl_classification(X) @@ -268,6 +276,11 @@ def _onedal_kneighbors( def _onedal_score(self, X, y, sample_weight=None, queue=None): import sys print(f"DEBUG KNeighborsClassifier._onedal_score START: X type={type(X)}, y type={type(y)}", file=sys.stderr) + # Convert array API to numpy for sklearn's accuracy_score + # Note: validate_data does NOT convert array API to numpy, so we do it explicitly + y = np.asarray(y) + if sample_weight is not None: + sample_weight = np.asarray(sample_weight) result = accuracy_score( y, self._onedal_predict(X, queue=queue), sample_weight=sample_weight ) diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 254bce38f8..665e22c87f 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -156,7 +156,8 @@ def _onedal_fit(self, X, y, queue=None): xp, _ = get_namespace(X) print(f"DEBUG: Array namespace: {xp}", file=sys.stderr) - # REFACTOR: Use validate_data from sklearnex.utils.validation to convert pandas to numpy for X only + # REFACTOR: Use validate_data to convert pandas to numpy and validate types for X only + # force_all_finite=False to allow nan_euclidean metric to work (will fallback to sklearn) X = validate_data( self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr" ) @@ -182,9 +183,20 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ # REFACTOR: Pass pre-processed shape and _y to onedal + # For GPU backend, reshape _y to (-1, 1) before passing to onedal + from onedal.utils import _sycl_queue_manager as QM + queue_instance = QM.get_global_queue() + gpu_device = queue_instance is not None and queue_instance.sycl_device.is_gpu + self._onedal_estimator._shape = self._shape - self._onedal_estimator._y = self._y + # REFACTOR: Reshape _y for GPU backend (needs column vector) + # Following PCA pattern: all data preparation in sklearnex + if 
gpu_device: + self._onedal_estimator._y = xp.reshape(self._y, (-1, 1)) + else: + self._onedal_estimator._y = self._y print(f"DEBUG: Set onedal_estimator._shape={self._onedal_estimator._shape}", file=sys.stderr) + print(f"DEBUG: GPU device={gpu_device}, _y shape={self._onedal_estimator._y.shape}", file=sys.stderr) print(f"DEBUG KNeighborsRegressor._onedal_fit: Calling onedal_estimator.fit", file=sys.stderr) self._onedal_estimator.fit(X, y, queue=queue) @@ -234,7 +246,7 @@ def _predict_gpu(self, X, queue=None): if X is not None: xp, _ = get_namespace(X) X = validate_data( - self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False + self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False, force_all_finite=False ) # Call onedal backend for GPU prediction result = self._onedal_estimator._predict_gpu(X) diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 202dda775e..e8f6e46840 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -158,9 +158,9 @@ def _onedal_fit(self, X, y=None, queue=None): # Get array namespace for array API support xp, _ = get_namespace(X) - print(f"DEBUG: Array namespace: {xp}", file=sys.stderr) - # REFACTOR: Use validate_data from sklearnex.utils.validation to convert pandas to numpy + # REFACTOR: Use validate_data to convert pandas to numpy and validate types + # force_all_finite=False to allow nan_euclidean metric to work (will fallback to sklearn) X = validate_data( self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr" ) @@ -189,7 +189,7 @@ def _onedal_predict(self, X, queue=None): if X is not None: xp, _ = get_namespace(X) X = validate_data( - self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False + self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False, force_all_finite=False ) return self._onedal_estimator.predict(X, queue=queue) From 7574ef53cc9e947f96f96f2b28b16d4edebcbab9 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 20 Oct 2025 11:45:09 -0700 Subject: [PATCH 55/87] fix: remove validate data tests from deseleted tests --- sklearnex/neighbors/_lof.py | 14 +++------ sklearnex/neighbors/common.py | 4 ++- sklearnex/neighbors/knn_classification.py | 26 ++++++++--------- sklearnex/neighbors/knn_regression.py | 29 +++++++++---------- sklearnex/neighbors/knn_unsupervised.py | 13 ++++++--- sklearnex/tests/test_common.py | 35 ----------------------- 6 files changed, 42 insertions(+), 79 deletions(-) diff --git a/sklearnex/neighbors/_lof.py b/sklearnex/neighbors/_lof.py index 7a47f25ffb..6b05c181fe 100644 --- a/sklearnex/neighbors/_lof.py +++ b/sklearnex/neighbors/_lof.py @@ -58,12 +58,7 @@ def _onedal_fit(self, X, y, queue=None): if sklearn_check_version("1.2"): self._validate_params() - # REFACTOR: Use validate_data from sklearnex.utils.validation to convert pandas to numpy - X = validate_data( - self, X, dtype=[np.float64, np.float32], accept_sparse="csr" - ) - print(f"DEBUG: After validate_data, X type={type(X)}", file=sys.stderr) - + # Let _onedal_knn_fit (NearestNeighbors._onedal_fit) handle validation print(f"DEBUG LocalOutlierFactor._onedal_fit: Calling _onedal_knn_fit", file=sys.stderr) self._onedal_knn_fit(X, y, queue=queue) @@ -178,8 +173,6 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): self._validate_n_neighbors(n_neighbors) check_is_fitted(self) - if X is not None: - check_feature_names(self, X, reset=False) # Validate kneighbors parameters (inherited 
from KNeighborsDispatchingBase) self._kneighbors_validation(X, n_neighbors) @@ -209,11 +202,12 @@ def score_samples(self, X): check_is_fitted(self) # Validate and convert X (pandas to numpy if needed) + xp, _ = get_namespace(X) X = validate_data( - self, X, dtype=[np.float64, np.float32], accept_sparse="csr", reset=False + self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False, ensure_all_finite=False ) - check_feature_names(self, X, reset=False) + # check_feature_names(self, X, reset=False) distances_X, neighbors_indices_X = self._kneighbors( X, n_neighbors=self.n_neighbors_ diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index a2e64a1baa..8184d5979a 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -14,6 +14,7 @@ # limitations under the License. # ============================================================================== +import sys import warnings from numbers import Integral @@ -561,9 +562,10 @@ def _process_regression_targets(self, y): return y def _fit_validation(self, X, y=None): + print(f"DEBUG _fit_validation CALLED: X type={type(X)}, y type={type(y)}", file=sys.stderr) if sklearn_check_version("1.2"): self._validate_params() - check_feature_names(self, X, reset=True) + # check_feature_names(self, X, reset=True) # Validate n_neighbors parameter self._validate_n_neighbors(self.n_neighbors) if self.metric_params is not None and "p" in self.metric_params: diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index 050957d9e2..8d4caa086a 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -86,7 +86,7 @@ def predict(self, X): import sys print(f"DEBUG KNeighborsClassifier.predict START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) check_is_fitted(self) - check_feature_names(self, X, reset=False) + result = dispatch( self, "predict", @@ -104,7 +104,7 @@ def predict_proba(self, X): import sys print(f"DEBUG KNeighborsClassifier.predict_proba START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) check_is_fitted(self) - check_feature_names(self, X, reset=False) + result = dispatch( self, "predict_proba", @@ -122,7 +122,7 @@ def score(self, X, y, sample_weight=None): import sys print(f"DEBUG KNeighborsClassifier.score START: X type={type(X)}, y type={type(y)}", file=sys.stderr) check_is_fitted(self) - check_feature_names(self, X, reset=False) + result = dispatch( self, "score", @@ -147,8 +147,6 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): self._validate_n_neighbors(n_neighbors) check_is_fitted(self) - if X is not None: - check_feature_names(self, X, reset=False) # Validate kneighbors parameters (inherited from KNeighborsDispatchingBase) self._kneighbors_validation(X, n_neighbors) @@ -176,9 +174,9 @@ def _onedal_fit(self, X, y, queue=None): print(f"DEBUG: Array namespace: {xp}", file=sys.stderr) # REFACTOR: Use validate_data to convert pandas to numpy and validate types - # force_all_finite=False to allow nan_euclidean metric to work (will fallback to sklearn) + # ensure_all_finite=False to allow nan_euclidean metric to work (will fallback to sklearn) X, y = validate_data( - self, X, y, dtype=[xp.float64, xp.float32], accept_sparse="csr" + self, X, y, dtype=[xp.float64, xp.float32], accept_sparse="csr", ensure_all_finite=False ) print(f"DEBUG: After validate_data, X type={type(X)}, y type={type(y)}", file=sys.stderr) @@ 
-222,13 +220,6 @@ def _onedal_predict(self, X, queue=None): import sys print(f"DEBUG KNeighborsClassifier._onedal_predict START: X type={type(X)}", file=sys.stderr) - # Validate X to convert array API to numpy - if X is not None: - xp, _ = get_namespace(X) - X = validate_data( - self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False - ) - # Use the unified helper from common.py (calls kneighbors + computes prediction) # This properly handles X=None (LOOCV) case result = self._predict_skl_classification(X) @@ -258,6 +249,13 @@ def _onedal_kneighbors( import sys print(f"DEBUG KNeighborsClassifier._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) + # Validate X to convert array API/pandas to numpy and check feature names (only if X is not None) + if X is not None: + xp, _ = get_namespace(X) + X = validate_data( + self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False, ensure_all_finite=False + ) + # REFACTOR: All post-processing now in sklearnex following PCA pattern # Prepare inputs and handle query_is_train case X, n_neighbors, query_is_train = self._prepare_kneighbors_inputs(X, n_neighbors) diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 665e22c87f..28551460d4 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -83,9 +83,9 @@ def fit(self, X, y): @wrap_output_data def predict(self, X): import sys - print(f"DEBUG KNeighborsRegressor.predict START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + print(f"DEBUG KNeighborsRegressor.predict START: X type={type(X)}", file=sys.stderr) check_is_fitted(self) - check_feature_names(self, X, reset=False) + result = dispatch( self, "predict", @@ -103,7 +103,7 @@ def score(self, X, y, sample_weight=None): import sys print(f"DEBUG KNeighborsRegressor.score START: X type={type(X)}, y type={type(y)}", file=sys.stderr) check_is_fitted(self) - check_feature_names(self, X, reset=False) + result = dispatch( self, "score", @@ -128,8 +128,6 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): self._validate_n_neighbors(n_neighbors) check_is_fitted(self) - if X is not None: - check_feature_names(self, X, reset=False) # Validate kneighbors parameters (inherited from KNeighborsDispatchingBase) self._kneighbors_validation(X, n_neighbors) @@ -157,9 +155,9 @@ def _onedal_fit(self, X, y, queue=None): print(f"DEBUG: Array namespace: {xp}", file=sys.stderr) # REFACTOR: Use validate_data to convert pandas to numpy and validate types for X only - # force_all_finite=False to allow nan_euclidean metric to work (will fallback to sklearn) + # ensure_all_finite=False to allow nan_euclidean metric to work (will fallback to sklearn) X = validate_data( - self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr" + self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", ensure_all_finite=False ) print(f"DEBUG: After validate_data, X type={type(X)}, y type={type(y)}", file=sys.stderr) @@ -239,16 +237,10 @@ def _onedal_predict(self, X, queue=None): return result def _predict_gpu(self, X, queue=None): - """GPU prediction path - validates X and calls onedal backend.""" + """GPU prediction path - calls onedal backend.""" import sys print(f"DEBUG KNeighborsRegressor._predict_gpu START: X type={type(X)}", file=sys.stderr) - # Validate and convert X (pandas to numpy if needed) only if X is not None - if X is not None: - xp, _ = 
get_namespace(X) - X = validate_data( - self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False, force_all_finite=False - ) - # Call onedal backend for GPU prediction + # Call onedal backend for GPU prediction (X is already validated by predict()) result = self._onedal_estimator._predict_gpu(X) print(f"DEBUG KNeighborsRegressor._predict_gpu END: result type={type(result)}", file=sys.stderr) return result @@ -270,6 +262,13 @@ def _onedal_kneighbors( import sys print(f"DEBUG KNeighborsRegressor._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) + # Validate X to convert array API/pandas to numpy and check feature names (only if X is not None) + if X is not None: + xp, _ = get_namespace(X) + X = validate_data( + self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False, ensure_all_finite=False + ) + # REFACTOR: All post-processing now in sklearnex following PCA pattern # Prepare inputs and handle query_is_train case X, n_neighbors, query_is_train = self._prepare_kneighbors_inputs(X, n_neighbors) diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index e8f6e46840..8c9421843b 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -87,8 +87,6 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): self._validate_n_neighbors(n_neighbors) check_is_fitted(self) - if X is not None: - check_feature_names(self, X, reset=False) # Validate kneighbors parameters (inherited from KNeighborsDispatchingBase) self._kneighbors_validation(X, n_neighbors) @@ -160,9 +158,9 @@ def _onedal_fit(self, X, y=None, queue=None): xp, _ = get_namespace(X) # REFACTOR: Use validate_data to convert pandas to numpy and validate types - # force_all_finite=False to allow nan_euclidean metric to work (will fallback to sklearn) + # ensure_all_finite=False to allow nan_euclidean metric to work (will fallback to sklearn) X = validate_data( - self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr" + self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", ensure_all_finite=False ) print(f"DEBUG: After validate_data, X type={type(X)}", file=sys.stderr) @@ -199,6 +197,13 @@ def _onedal_kneighbors( import sys print(f"DEBUG NearestNeighbors._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) + # Validate X to convert array API/pandas to numpy and check feature names (only if X is not None) + if X is not None: + xp, _ = get_namespace(X) + X = validate_data( + self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False, ensure_all_finite=False + ) + # REFACTOR: All post-processing now in sklearnex following PCA pattern # Prepare inputs and handle query_is_train case (includes validation AFTER +=1) X, n_neighbors, query_is_train = self._prepare_kneighbors_inputs(X, n_neighbors) diff --git a/sklearnex/tests/test_common.py b/sklearnex/tests/test_common.py index a0b1d90476..cbde5190b4 100644 --- a/sklearnex/tests/test_common.py +++ b/sklearnex/tests/test_common.py @@ -103,41 +103,6 @@ "LogisticRegression(solver='newton-cg')-predict-n_jobs_check": "uses daal4py for cpu in sklearnex", "LogisticRegression(solver='newton-cg')-predict_log_proba-n_jobs_check": "uses daal4py for cpu in sklearnex", "LogisticRegression(solver='newton-cg')-predict_proba-n_jobs_check": "uses daal4py for cpu in sklearnex", - # KNeighborsClassifier validate_data 
issues - will be fixed later - "KNeighborsClassifier-fit-call_validate_data": "validate_data implementation needs fixing", - "KNeighborsClassifier-predict_proba-call_validate_data": "validate_data implementation needs fixing", - "KNeighborsClassifier-score-call_validate_data": "validate_data implementation needs fixing", - "KNeighborsClassifier-kneighbors-call_validate_data": "validate_data implementation needs fixing", - "KNeighborsClassifier-kneighbors_graph-call_validate_data": "validate_data implementation needs fixing", - "KNeighborsClassifier-predict-call_validate_data": "validate_data implementation needs fixing", - "KNeighborsRegressor-fit-call_validate_data": "validate_data implementation needs fixing", - "KNeighborsRegressor-score-call_validate_data": "validate_data implementation needs fixing", - "KNeighborsRegressor-kneighbors-call_validate_data": "validate_data implementation needs fixing", - "KNeighborsRegressor-kneighbors_graph-call_validate_data": "validate_data implementation needs fixing", - "KNeighborsRegressor-predict-call_validate_data": "validate_data implementation needs fixing", - "NearestNeighbors-fit-call_validate_data": "validate_data implementation needs fixing", - "NearestNeighbors-kneighbors-call_validate_data": "validate_data implementation needs fixing", - "NearestNeighbors-kneighbors_graph-call_validate_data": "validate_data implementation needs fixing", - "LocalOutlierFactor-fit-call_validate_data": "validate_data implementation needs fixing", - "LocalOutlierFactor-kneighbors-call_validate_data": "validate_data implementation needs fixing", - "LocalOutlierFactor-kneighbors_graph-call_validate_data": "validate_data implementation needs fixing", - "LocalOutlierFactor(novelty=True)-fit-call_validate_data": "validate_data implementation needs fixing", - "LocalOutlierFactor(novelty=True)-kneighbors-call_validate_data": "validate_data implementation needs fixing", - "LocalOutlierFactor(novelty=True)-kneighbors_graph-call_validate_data": "validate_data implementation needs fixing", - "KNeighborsClassifier(algorithm='brute')-fit-call_validate_data": "validate_data implementation needs fixing", - "KNeighborsClassifier(algorithm='brute')-predict_proba-call_validate_data": "validate_data implementation needs fixing", - "KNeighborsClassifier(algorithm='brute')-score-call_validate_data": "validate_data implementation needs fixing", - "KNeighborsClassifier(algorithm='brute')-kneighbors-call_validate_data": "validate_data implementation needs fixing", - "KNeighborsClassifier(algorithm='brute')-kneighbors_graph-call_validate_data": "validate_data implementation needs fixing", - "KNeighborsClassifier(algorithm='brute')-predict-call_validate_data": "validate_data implementation needs fixing", - "KNeighborsRegressor(algorithm='brute')-fit-call_validate_data": "validate_data implementation needs fixing", - "KNeighborsRegressor(algorithm='brute')-score-call_validate_data": "validate_data implementation needs fixing", - "KNeighborsRegressor(algorithm='brute')-kneighbors-call_validate_data": "validate_data implementation needs fixing", - "KNeighborsRegressor(algorithm='brute')-kneighbors_graph-call_validate_data": "validate_data implementation needs fixing", - "KNeighborsRegressor(algorithm='brute')-predict-call_validate_data": "validate_data implementation needs fixing", - "NearestNeighbors(algorithm='brute')-fit-call_validate_data": "validate_data implementation needs fixing", - "NearestNeighbors(algorithm='brute')-kneighbors-call_validate_data": "validate_data implementation 
needs fixing", - "NearestNeighbors(algorithm='brute')-kneighbors_graph-call_validate_data": "validate_data implementation needs fixing", } From 591eb563a01caed78fae45af8d213a63bcd99636 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 20 Oct 2025 11:49:06 -0700 Subject: [PATCH 56/87] fix: format --- onedal/neighbors/neighbors.py | 50 +++-- sklearnex/neighbors/_lof.py | 96 +++++++--- sklearnex/neighbors/common.py | 210 ++++++++++++-------- sklearnex/neighbors/knn_classification.py | 218 +++++++++++++++------ sklearnex/neighbors/knn_regression.py | 224 ++++++++++++++++------ sklearnex/neighbors/knn_unsupervised.py | 155 +++++++++++---- 6 files changed, 674 insertions(+), 279 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index 32989289be..281caf6d63 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -14,11 +14,11 @@ # limitations under the License. # ============================================================================== +import sys from abc import ABCMeta, abstractmethod from numbers import Integral import numpy as np -import sys from onedal._device_offload import supports_queue from onedal.common._backend import bind_default_backend @@ -203,17 +203,20 @@ def __init__( # ) def _fit(self, X, y): - print(f"DEBUG oneDAL _fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", file=sys.stderr) + print( + f"DEBUG oneDAL _fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", + file=sys.stderr, + ) self._onedal_model = None self._tree = None # REFACTOR: Shape processing moved to sklearnex layer # _shape should be set by _process_classification_targets or _process_regression_targets in sklearnex # self._shape = None - if not hasattr(self, '_shape'): + if not hasattr(self, "_shape"): self._shape = None # REFACTOR STEP 1: Don't reset classes_ - it may have been set by sklearnex layer # self.classes_ = None - if not hasattr(self, 'classes_'): + if not hasattr(self, "classes_"): self.classes_ = None self.effective_metric_ = getattr(self, "effective_metric_", self.metric) self.effective_metric_params_ = getattr( @@ -239,18 +242,21 @@ def _fit(self, X, y): # This code is now commented out - processing MUST happen in sklearnex before calling fit # Assertion: Verify that sklearnex has done the preprocessing if _is_classifier(self): - if not hasattr(self, 'classes_') or self.classes_ is None: + if not hasattr(self, "classes_") or self.classes_ is None: raise ValueError( "Classification target processing must be done in sklearnex layer before calling onedal fit. " "classes_ attribute is not set. This indicates the refactoring is incomplete." ) - if not hasattr(self, '_y') or self._y is None: + if not hasattr(self, "_y") or self._y is None: raise ValueError( "Classification target processing must be done in sklearnex layer before calling onedal fit. " "_y attribute is not set. This indicates the refactoring is incomplete." 
) - print(f"DEBUG oneDAL: Using pre-processed classification targets from sklearnex (classes_={self.classes_})", file=sys.stderr) - + print( + f"DEBUG oneDAL: Using pre-processed classification targets from sklearnex (classes_={self.classes_})", + file=sys.stderr, + ) + # Original classification processing code - NOW COMMENTED OUT (moved to sklearnex) # if _is_classifier(self): # if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1: @@ -302,7 +308,10 @@ def _fit(self, X, y): queue = QM.get_global_queue() gpu_device = queue is not None and queue.sycl_device.is_gpu - print(f"DEBUG oneDAL _fit: Before _onedal_fit, X type={type(X)}, _fit_y type={type(_fit_y)}", file=sys.stderr) + print( + f"DEBUG oneDAL _fit: Before _onedal_fit, X type={type(X)}, _fit_y type={type(_fit_y)}", + file=sys.stderr, + ) # REFACTOR: All data preparation including reshaping moved to sklearnex layer # Following PCA pattern: onedal is a thin wrapper, no data manipulation # sklearnex prepares self._y in the correct shape before calling fit() @@ -311,12 +320,15 @@ def _fit(self, X, y): # _fit_y = self._validate_targets(self._y, X.dtype).reshape((-1, 1)) # OR for refactor without _validate_targets: # _fit_y = self._y.reshape((-1, 1)) - + # REFACTOR: Just pass self._y as-is - sklearnex should have already reshaped it if _is_classifier(self) or (_is_regressor(self) and gpu_device): _fit_y = self._y result = self._onedal_fit(X, _fit_y) - print(f"DEBUG oneDAL _fit: After _onedal_fit, self._fit_X type={type(self._fit_X)}, shape={getattr(self._fit_X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + print( + f"DEBUG oneDAL _fit: After _onedal_fit, self._fit_X type={type(self._fit_X)}, shape={getattr(self._fit_X, 'shape', 'NO_SHAPE')}", + file=sys.stderr, + ) # REFACTOR: Shape-based y reshaping commented out - y should already be properly shaped by sklearnex # Original code kept for reference: @@ -342,7 +354,7 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): # f"{n_features} features as input" # ) # ) - + # Still need n_features for _parse_auto_method call later # n_features = getattr(self, "n_features_in_", None) @@ -373,7 +385,7 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): # # Include an extra neighbor to account for the sample itself being # # returned, which is removed later # n_neighbors += 1 - + # REFACTOR: query_is_train handling moved to sklearnex layer # All post-processing now happens in sklearnex._kneighbors_post_processing() # Original code kept for reference: @@ -433,7 +445,7 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): # results = distances, indices # else: # results = indices - + # Always return both - sklearnex will decide what to return to user results = distances, indices @@ -477,7 +489,7 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): # neigh_dist = np.reshape(neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) # return neigh_dist, neigh_ind # return neigh_ind - + # Return raw results - sklearnex will do all post-processing return results @@ -543,17 +555,17 @@ def fit(self, X, y, queue=None): # @supports_queue # def predict(self, X, queue=None): # print(f"DEBUG KNeighborsClassifier.predict START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - # + # # # REFACTOR: _check_array validation commented out - should be done in sklearnex layer # # Original validation code kept for reference: # # use_raw_input = _get_config().get("use_raw_input", False) is True # # if not use_raw_input: 
# # X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) - # + # # onedal_model = getattr(self, "_onedal_model", None) # n_features = getattr(self, "n_features_in_", None) # n_samples_fit_ = getattr(self, "n_samples_fit_", None) - # + # # # REFACTOR: Feature count validation commented out - should be done in sklearnex layer # # Original validation code kept for reference: # # shape = getattr(X, "shape", None) @@ -781,4 +793,4 @@ def fit(self, X, y=None, queue=None): @supports_queue def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): - return self._kneighbors(X, n_neighbors, return_distance) \ No newline at end of file + return self._kneighbors(X, n_neighbors, return_distance) diff --git a/sklearnex/neighbors/_lof.py b/sklearnex/neighbors/_lof.py index 6b05c181fe..0676b6988f 100644 --- a/sklearnex/neighbors/_lof.py +++ b/sklearnex/neighbors/_lof.py @@ -54,12 +54,19 @@ class LocalOutlierFactor(KNeighborsDispatchingBase, _sklearn_LocalOutlierFactor) def _onedal_fit(self, X, y, queue=None): import sys - print(f"DEBUG LocalOutlierFactor._onedal_fit START: X type={type(X)}, y type={type(y)}", file=sys.stderr) + + print( + f"DEBUG LocalOutlierFactor._onedal_fit START: X type={type(X)}, y type={type(y)}", + file=sys.stderr, + ) if sklearn_check_version("1.2"): self._validate_params() # Let _onedal_knn_fit (NearestNeighbors._onedal_fit) handle validation - print(f"DEBUG LocalOutlierFactor._onedal_fit: Calling _onedal_knn_fit", file=sys.stderr) + print( + f"DEBUG LocalOutlierFactor._onedal_fit: Calling _onedal_knn_fit", + file=sys.stderr, + ) self._onedal_knn_fit(X, y, queue=queue) if self.contamination != "auto": @@ -79,7 +86,10 @@ def _onedal_fit(self, X, y, queue=None): ) self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1)) - print(f"DEBUG LocalOutlierFactor._onedal_fit: Calling _onedal_kneighbors", file=sys.stderr) + print( + f"DEBUG LocalOutlierFactor._onedal_fit: Calling _onedal_kneighbors", + file=sys.stderr, + ) ( self._distances_fit_X_, _neighbors_indices_fit_X_, @@ -114,12 +124,19 @@ def _onedal_fit(self, X, y, queue=None): "Increase the number of neighbors for more accurate results." 
) - print(f"DEBUG LocalOutlierFactor._onedal_fit END: _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print( + f"DEBUG LocalOutlierFactor._onedal_fit END: _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", + file=sys.stderr, + ) return self def fit(self, X, y=None): import sys - print(f"DEBUG LocalOutlierFactor.fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + + print( + f"DEBUG LocalOutlierFactor.fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", + file=sys.stderr, + ) result = dispatch( self, "fit", @@ -130,12 +147,18 @@ def fit(self, X, y=None): X, None, ) - print(f"DEBUG LocalOutlierFactor.fit END: result type={type(result)}", file=sys.stderr) + print( + f"DEBUG LocalOutlierFactor.fit END: result type={type(result)}", + file=sys.stderr, + ) return result def _predict(self, X=None): import sys - print(f"DEBUG LocalOutlierFactor._predict START: X type={type(X)}", file=sys.stderr) + + print( + f"DEBUG LocalOutlierFactor._predict START: X type={type(X)}", file=sys.stderr + ) check_is_fitted(self) if X is not None: @@ -147,7 +170,10 @@ def _predict(self, X=None): is_inlier = np.ones(self.n_samples_fit_, dtype=int) is_inlier[self.negative_outlier_factor_ < self.offset_] = -1 - print(f"DEBUG LocalOutlierFactor._predict END: is_inlier type={type(is_inlier)}", file=sys.stderr) + print( + f"DEBUG LocalOutlierFactor._predict END: is_inlier type={type(is_inlier)}", + file=sys.stderr, + ) return is_inlier # This had to be done because predict loses the queue when no @@ -159,24 +185,35 @@ def _predict(self, X=None): @wrap_output_data def fit_predict(self, X, y=None): import sys - print(f"DEBUG LocalOutlierFactor.fit_predict START: X type={type(X)}", file=sys.stderr) + + print( + f"DEBUG LocalOutlierFactor.fit_predict START: X type={type(X)}", + file=sys.stderr, + ) result = self.fit(X)._predict() - print(f"DEBUG LocalOutlierFactor.fit_predict END: result type={type(result)}", file=sys.stderr) + print( + f"DEBUG LocalOutlierFactor.fit_predict END: result type={type(result)}", + file=sys.stderr, + ) return result def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): import sys - print(f"DEBUG LocalOutlierFactor._kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) - + + print( + f"DEBUG LocalOutlierFactor._kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", + file=sys.stderr, + ) + # Validate n_neighbors parameter first (before check_is_fitted) if n_neighbors is not None: self._validate_n_neighbors(n_neighbors) - + check_is_fitted(self) - + # Validate kneighbors parameters (inherited from KNeighborsDispatchingBase) self._kneighbors_validation(X, n_neighbors) - + result = dispatch( self, "kneighbors", @@ -188,7 +225,10 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors=n_neighbors, return_distance=return_distance, ) - print(f"DEBUG LocalOutlierFactor._kneighbors END: result type={type(result)}", file=sys.stderr) + print( + f"DEBUG LocalOutlierFactor._kneighbors END: result type={type(result)}", + file=sys.stderr, + ) return result kneighbors = wrap_output_data(_kneighbors) @@ -198,15 +238,24 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): @wrap_output_data def score_samples(self, X): import sys - print(f"DEBUG LocalOutlierFactor.score_samples START: X type={type(X)}", file=sys.stderr) + + print( + f"DEBUG 
LocalOutlierFactor.score_samples START: X type={type(X)}", + file=sys.stderr, + ) check_is_fitted(self) - + # Validate and convert X (pandas to numpy if needed) xp, _ = get_namespace(X) X = validate_data( - self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False, ensure_all_finite=False + self, + X, + dtype=[xp.float64, xp.float32], + accept_sparse="csr", + reset=False, + ensure_all_finite=False, ) - + # check_feature_names(self, X, reset=False) distances_X, neighbors_indices_X = self._kneighbors( @@ -221,8 +270,11 @@ def score_samples(self, X): lrd_ratios_array = self._lrd[neighbors_indices_X] / X_lrd[:, np.newaxis] result = -np.mean(lrd_ratios_array, axis=1) - print(f"DEBUG LocalOutlierFactor.score_samples END: result type={type(result)}", file=sys.stderr) + print( + f"DEBUG LocalOutlierFactor.score_samples END: result type={type(result)}", + file=sys.stderr, + ) return result fit.__doc__ = _sklearn_LocalOutlierFactor.fit.__doc__ - kneighbors.__doc__ = _sklearn_LocalOutlierFactor.kneighbors.__doc__ \ No newline at end of file + kneighbors.__doc__ = _sklearn_LocalOutlierFactor.kneighbors.__doc__ diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 8184d5979a..3948a32121 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -28,8 +28,6 @@ from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import sklearn_check_version - -from ..utils.validation import validate_data from onedal._device_offload import _transfer_to_host from onedal.utils.validation import ( _check_array, @@ -43,7 +41,7 @@ from .._utils import PatchingConditionsChain from ..base import oneDALEstimator from ..utils._array_api import get_namespace -from ..utils.validation import check_feature_names +from ..utils.validation import check_feature_names, validate_data class KNeighborsDispatchingBase(oneDALEstimator): @@ -114,7 +112,11 @@ def _get_weights(self, dist, weights): else: dist[point_dist_i] = 1.0 / point_dist else: - with xp.errstate(divide="ignore") if hasattr(xp, 'errstate') else np.errstate(divide="ignore"): + with ( + xp.errstate(divide="ignore") + if hasattr(xp, "errstate") + else np.errstate(divide="ignore") + ): dist = 1.0 / dist inf_mask = xp.isinf(dist) inf_row = xp.any(inf_mask, axis=1) @@ -127,28 +129,28 @@ def _get_weights(self, dist, weights): "weights not recognized: should be 'uniform', " "'distance', or a callable function" ) - + def _compute_weighted_prediction(self, neigh_dist, neigh_ind, weights_param, y_train): """Compute weighted prediction for regression. - + Args: neigh_dist: Distances to neighbors neigh_ind: Indices of neighbors weights_param: Weight parameter ('uniform', 'distance', or callable) y_train: Training target values - + Returns: Predicted values """ # Array API support: get namespace from input arrays xp, _ = get_namespace(neigh_dist, neigh_ind, y_train) - + weights = self._get_weights(neigh_dist, weights_param) - + _y = y_train if _y.ndim == 1: _y = xp.reshape(_y, (-1, 1)) - + if weights is None: # Array API: Use take() per row since array API take() only supports 1-D indices # Build result by gathering rows one at a time @@ -157,35 +159,45 @@ def _compute_weighted_prediction(self, neigh_dist, neigh_ind, weights_param, y_t # Get indices for this sample's neighbors sample_indices = neigh_ind[i, ...] 
# Shape: (n_neighbors,) # Gather those rows from _y - sample_neighbors = xp.take(_y, sample_indices, axis=0) # Shape: (n_neighbors, n_outputs) + sample_neighbors = xp.take( + _y, sample_indices, axis=0 + ) # Shape: (n_neighbors, n_outputs) gathered_list.append(sample_neighbors) # Stack and compute mean - gathered = xp.stack(gathered_list, axis=0) # Shape: (n_samples, n_neighbors, n_outputs) + gathered = xp.stack( + gathered_list, axis=0 + ) # Shape: (n_samples, n_neighbors, n_outputs) y_pred = xp.mean(gathered, axis=1) else: y_pred = xp.empty((neigh_ind.shape[0], _y.shape[1]), dtype=xp.float64) denom = xp.sum(weights, axis=1) - + for j in range(_y.shape[1]): # Array API: Iterate over samples to gather values y_col_j = _y[:, j, ...] # Shape: (n_train_samples,) gathered_vals = [] for i in range(neigh_ind.shape[0]): sample_indices = neigh_ind[i, ...] # Shape: (n_neighbors,) - sample_vals = xp.take(y_col_j, sample_indices, axis=0) # Shape: (n_neighbors,) + sample_vals = xp.take( + y_col_j, sample_indices, axis=0 + ) # Shape: (n_neighbors,) gathered_vals.append(sample_vals) - gathered_j = xp.stack(gathered_vals, axis=0) # Shape: (n_samples, n_neighbors) + gathered_j = xp.stack( + gathered_vals, axis=0 + ) # Shape: (n_samples, n_neighbors) num = xp.sum(gathered_j * weights, axis=1) y_pred[:, j, ...] = num / denom - + if y_train.ndim == 1: y_pred = xp.reshape(y_pred, (-1,)) - + return y_pred - - def _compute_class_probabilities(self, neigh_dist, neigh_ind, weights_param, y_train, classes, outputs_2d): + + def _compute_class_probabilities( + self, neigh_dist, neigh_ind, weights_param, y_train, classes, outputs_2d + ): """Compute class probabilities for classification. - + Args: neigh_dist: Distances to neighbors neigh_ind: Indices of neighbors @@ -193,45 +205,47 @@ def _compute_class_probabilities(self, neigh_dist, neigh_ind, weights_param, y_t y_train: Encoded training labels classes: Class labels outputs_2d: Whether output is 2D (multi-output) - + Returns: Class probabilities """ from ..utils.validation import _num_samples - + # Array API support: get namespace from input arrays xp, _ = get_namespace(neigh_dist, neigh_ind, y_train) - + _y = y_train classes_ = classes if not outputs_2d: _y = xp.reshape(y_train, (-1, 1)) classes_ = [classes] - + n_queries = neigh_ind.shape[0] - + weights = self._get_weights(neigh_dist, weights_param) if weights is None: # REFACTOR: Ensure weights is float for array API type promotion # neigh_ind is int, so ones_like would give int, but we need float weights = xp.ones_like(neigh_ind, dtype=xp.float64) - + probabilities = [] for k, classes_k in enumerate(classes_): # Get predicted labels for each neighbor: shape (n_samples, n_neighbors) # _y[:, k] gives training labels for output k, then gather using neigh_ind y_col_k = _y[:, k, ...] - + # Array API: Use take() with iteration since take() only supports 1-D indices pred_labels_list = [] for i in range(neigh_ind.shape[0]): sample_indices = neigh_ind[i, ...] sample_labels = xp.take(y_col_k, sample_indices, axis=0) pred_labels_list.append(sample_labels) - pred_labels = xp.stack(pred_labels_list, axis=0) # Shape: (n_queries, n_neighbors) - + pred_labels = xp.stack( + pred_labels_list, axis=0 + ) # Shape: (n_queries, n_neighbors) + proba_k = xp.zeros((n_queries, classes_k.size), dtype=xp.float64) - + # Array API: Cannot use fancy indexing __setitem__ like proba_k[all_rows, idx] = ... 
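            # (For reference, the pre-refactor NumPy path accumulated the votes with a
            #  fancy-index scatter, roughly:
            #      all_rows = xp.arange(n_queries)
            #      for i, idx in enumerate(pred_labels.T):
            #          proba_k[all_rows, idx] += weights[:, i]
            #  an in-place __setitem__ form that array API namespaces need not support.)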
# Instead, build probabilities sample by sample proba_list = [] @@ -242,31 +256,37 @@ def _compute_class_probabilities(self, neigh_dist, neigh_ind, weights_param, y_t class_label = int(pred_labels[sample_idx, neighbor_idx]) weight = weights[sample_idx, neighbor_idx] # Update probability for this class - sample_proba = xp.asarray([ - sample_proba[i] + weight if i == class_label else sample_proba[i] - for i in range(classes_k.size) - ]) + sample_proba = xp.asarray( + [ + ( + sample_proba[i] + weight + if i == class_label + else sample_proba[i] + ) + for i in range(classes_k.size) + ] + ) proba_list.append(sample_proba) proba_k = xp.stack(proba_list, axis=0) # Shape: (n_queries, n_classes) - + # normalize 'votes' into real [0,1] probabilities normalizer = xp.sum(proba_k, axis=1)[:, xp.newaxis] normalizer[normalizer == 0.0] = 1.0 proba_k /= normalizer - + probabilities.append(proba_k) - + if not outputs_2d: probabilities = probabilities[0] - + return probabilities - + def _predict_skl_regression(self, X): """SKL prediction path for regression - calls kneighbors, computes predictions. - + This method handles X=None (LOOCV) properly by calling self.kneighbors which has the query_is_train logic. - + Args: X: Query samples (or None for LOOCV) Returns: @@ -279,10 +299,10 @@ def _predict_skl_regression(self, X): def _predict_skl_classification(self, X): """SKL prediction path for classification - calls kneighbors, computes predictions. - + This method handles X=None (LOOCV) properly by calling self.kneighbors which has the query_is_train logic. - + Args: X: Query samples (or None for LOOCV) Returns: @@ -294,16 +314,18 @@ def _predict_skl_classification(self, X): ) # Array API support: get namespace from probability array xp, _ = get_namespace(proba) - + if not self.outputs_2d_: # Single output: classes_[argmax(proba, axis=1)] result = self.classes_[xp.argmax(proba, axis=1)] else: # Multi-output: apply argmax separately for each output - result = [classes_k[xp.argmax(proba_k, axis=1)] - for classes_k, proba_k in zip(self.classes_, proba.T)] + result = [ + classes_k[xp.argmax(proba_k, axis=1)] + for classes_k, proba_k in zip(self.classes_, proba.T) + ] result = xp.asarray(result).T - + return result def _validate_targets(self, y, dtype): @@ -360,7 +382,7 @@ def _validate_kneighbors_bounds(self, n_neighbors, query_is_train, X): def _kneighbors_validation(self, X, n_neighbors): """Shared validation for kneighbors method called from sklearnex layer. - + Validates: - Feature count matches training data if X is provided - n_neighbors is within valid bounds if provided @@ -368,23 +390,25 @@ def _kneighbors_validation(self, X, n_neighbors): # Validate feature count if X is provided if X is not None: self._validate_feature_count(X) - + # Validate n_neighbors bounds if provided if n_neighbors is not None: # Determine if query is the training set - query_is_train = X is None or (hasattr(self, '_fit_X') and X is self._fit_X) - self._validate_kneighbors_bounds(n_neighbors, query_is_train, X if X is not None else self._fit_X) + query_is_train = X is None or (hasattr(self, "_fit_X") and X is self._fit_X) + self._validate_kneighbors_bounds( + n_neighbors, query_is_train, X if X is not None else self._fit_X + ) def _prepare_kneighbors_inputs(self, X, n_neighbors): """Prepare inputs for kneighbors call to onedal backend. - + Handles query_is_train case: when X=None, sets X to training data and adds +1 to n_neighbors. Validates n_neighbors bounds AFTER adding +1 (replicates original onedal behavior). 
- + Args: X: Query data or None n_neighbors: Number of neighbors or None - + Returns: Tuple of (X, n_neighbors, query_is_train) - X: Processed query data (self._fit_X if original X was None) @@ -392,14 +416,12 @@ def _prepare_kneighbors_inputs(self, X, n_neighbors): - query_is_train: Boolean flag indicating if original X was None """ query_is_train = X is None - + if X is not None: # Get the array namespace to use correct dtypes xp, _ = get_namespace(X) # Use _check_array like main branch, with array API dtype support - X = _check_array( - X, dtype=[xp.float64, xp.float32], accept_sparse="csr" - ) + X = _check_array(X, dtype=[xp.float64, xp.float32], accept_sparse="csr") else: X = self._fit_X # Include an extra neighbor to account for the sample itself being @@ -407,38 +429,42 @@ def _prepare_kneighbors_inputs(self, X, n_neighbors): if n_neighbors is None: n_neighbors = self.n_neighbors n_neighbors += 1 - + # Validate bounds AFTER adding +1 (replicates original onedal behavior) # Original code in onedal had validation after n_neighbors += 1 n_samples_fit = self.n_samples_fit_ if n_neighbors > n_samples_fit: - n_neighbors_for_msg = n_neighbors - 1 # for error message, show original value + n_neighbors_for_msg = ( + n_neighbors - 1 + ) # for error message, show original value raise ValueError( f"Expected n_neighbors < n_samples_fit, but " f"n_neighbors = {n_neighbors_for_msg}, n_samples_fit = {n_samples_fit}, " f"n_samples = {X.shape[0]}" ) - + return X, n_neighbors, query_is_train - def _kneighbors_post_processing(self, X, n_neighbors, return_distance, result, query_is_train): + def _kneighbors_post_processing( + self, X, n_neighbors, return_distance, result, query_is_train + ): """Shared post-processing for kneighbors results. - + Following PCA pattern: all post-processing in sklearnex, onedal returns raw results. Replicates exact logic from main branch onedal._kneighbors() method. - + Handles (in order, matching main branch): 1. kd_tree sorting: sorts results by distance (BEFORE deciding what to return) 2. query_is_train case (X=None): removes self from results 3. 
return_distance decision: return distances+indices or just indices - + Args: X: Query data (self._fit_X if query_is_train) n_neighbors: Number of neighbors (already includes +1 if query_is_train) return_distance: Whether to return distances to user result: Raw result from onedal backend - always (distances, indices) query_is_train: Boolean indicating if original X was None - + Returns: Post-processed result: (distances, indices) if return_distance else indices """ @@ -446,7 +472,7 @@ def _kneighbors_post_processing(self, X, n_neighbors, return_distance, result, q # onedal always returns both distances and indices (backend computes both) distances, indices = result xp, _ = get_namespace(distances, indices) - + # POST-PROCESSING STEP 1: kd_tree sorting (moved from onedal) # This happens BEFORE deciding what to return, using distances that are always available # Matches main branch: sorting uses distances even when return_distance=False @@ -455,40 +481,40 @@ def _kneighbors_post_processing(self, X, n_neighbors, return_distance, result, q seq = xp.argsort(distances[i]) indices[i] = indices[i][seq] distances[i] = distances[i][seq] - + # POST-PROCESSING STEP 2: Decide what to return (moved from onedal) # This happens AFTER kd_tree sorting if return_distance: results = distances, indices else: results = indices - + # POST-PROCESSING STEP 3: Remove self from results when query_is_train (moved from onedal) # This happens LAST, after sorting and after deciding format if not query_is_train: return results - + # If the query data is the same as the indexed data, we would like # to ignore the first nearest neighbor of every sample, i.e the sample itself. if return_distance: neigh_dist, neigh_ind = results else: neigh_ind = results - + # X is self._fit_X in query_is_train case (set by caller) n_queries, _ = X.shape sample_range = xp.arange(n_queries)[:, xp.newaxis] sample_mask = neigh_ind != sample_range - + # Corner case: When the number of duplicates are more # than the number of neighbors, the first NN will not # be the sample, but a duplicate. # In that case mask the first duplicate. dup_gr_nbrs = xp.all(sample_mask, axis=1) sample_mask[:, 0][dup_gr_nbrs] = False - + neigh_ind = xp.reshape(neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) - + if return_distance: neigh_dist = xp.reshape(neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) return neigh_dist, neigh_ind @@ -496,15 +522,19 @@ def _kneighbors_post_processing(self, X, n_neighbors, return_distance, result, q def _process_classification_targets(self, y): """Process classification targets and set class-related attributes. - + Note: y should already be converted to numpy array via validate_data before calling this. 
""" import sys - print(f"DEBUG _process_classification_targets: y type={type(y)}, y shape={getattr(y, 'shape', 'NO_SHAPE')}", file=sys.stderr) - + + print( + f"DEBUG _process_classification_targets: y type={type(y)}, y shape={getattr(y, 'shape', 'NO_SHAPE')}", + file=sys.stderr, + ) + # Array API support: get namespace from y xp, _ = get_namespace(y) - + # y should already be numpy array from validate_data y = xp.asarray(y) @@ -520,7 +550,7 @@ def _process_classification_targets(self, y): # Validate classification targets _check_classification_targets(y) - + # Process classes - note: np.unique is used for class extraction # This is acceptable as classes are typically numpy arrays in sklearn self.classes_ = [] @@ -543,26 +573,33 @@ def _process_classification_targets(self, y): def _process_regression_targets(self, y): """Process regression targets and set shape-related attributes. - + REFACTOR: This replicates the EXACT shape processing that was in onedal _fit. Original onedal code: shape = getattr(y, "shape", None) self._shape = shape if shape is not None else y.shape # (later, after fit) self._y = y if self._shape is None else xp.reshape(y, self._shape) - + For now, just store _shape and _y as-is. The reshape happens after onedal fit is complete. """ import sys + # EXACT replication of original onedal shape processing shape = getattr(y, "shape", None) self._shape = shape if shape is not None else y.shape self._y = y - print(f"DEBUG _process_regression_targets: _y type={type(self._y)}, _shape={self._shape}", file=sys.stderr) + print( + f"DEBUG _process_regression_targets: _y type={type(self._y)}, _shape={self._shape}", + file=sys.stderr, + ) return y def _fit_validation(self, X, y=None): - print(f"DEBUG _fit_validation CALLED: X type={type(X)}, y type={type(y)}", file=sys.stderr) + print( + f"DEBUG _fit_validation CALLED: X type={type(X)}, y type={type(y)}", + file=sys.stderr, + ) if sklearn_check_version("1.2"): self._validate_params() # check_feature_names(self, X, reset=True) @@ -601,7 +638,10 @@ def _fit_validation(self, X, y=None): # Don't check for NaN - let oneDAL handle it (will fallback to sklearn if needed) xp, _ = get_namespace(X) self._fit_X = _check_array( - X, dtype=[xp.float64, xp.float32], accept_sparse=True, force_all_finite=False + X, + dtype=[xp.float64, xp.float32], + accept_sparse=True, + force_all_finite=False, ) self.n_samples_fit_ = _num_samples(self._fit_X) self.n_features_in_ = _num_features(self._fit_X) @@ -852,4 +892,4 @@ def kneighbors_graph(self, X=None, n_neighbors=None, mode="connectivity"): return kneighbors_graph - kneighbors_graph.__doc__ = KNeighborsMixin.kneighbors_graph.__doc__ \ No newline at end of file + kneighbors_graph.__doc__ = KNeighborsMixin.kneighbors_graph.__doc__ diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index 8d4caa086a..8c4db1931d 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -31,6 +31,7 @@ from ..utils.validation import check_feature_names, validate_data from .common import KNeighborsDispatchingBase + @enable_array_api @control_n_jobs( decorated_methods=["fit", "predict", "predict_proba", "kneighbors", "score"] @@ -67,7 +68,11 @@ def __init__( def fit(self, X, y): import sys - print(f"DEBUG KNeighborsClassifier.fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", file=sys.stderr) + + print( + f"DEBUG KNeighborsClassifier.fit START: X type={type(X)}, X shape={getattr(X, 'shape', 
'NO_SHAPE')}, y type={type(y)}", + file=sys.stderr, + ) dispatch( self, "fit", @@ -78,15 +83,22 @@ def fit(self, X, y): X, y, ) - print(f"DEBUG KNeighborsClassifier.fit END: _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print( + f"DEBUG KNeighborsClassifier.fit END: _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", + file=sys.stderr, + ) return self @wrap_output_data def predict(self, X): import sys - print(f"DEBUG KNeighborsClassifier.predict START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + + print( + f"DEBUG KNeighborsClassifier.predict START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", + file=sys.stderr, + ) check_is_fitted(self) - + result = dispatch( self, "predict", @@ -96,15 +108,22 @@ def predict(self, X): }, X, ) - print(f"DEBUG KNeighborsClassifier.predict END: result type={type(result)}", file=sys.stderr) + print( + f"DEBUG KNeighborsClassifier.predict END: result type={type(result)}", + file=sys.stderr, + ) return result @wrap_output_data def predict_proba(self, X): import sys - print(f"DEBUG KNeighborsClassifier.predict_proba START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) + + print( + f"DEBUG KNeighborsClassifier.predict_proba START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", + file=sys.stderr, + ) check_is_fitted(self) - + result = dispatch( self, "predict_proba", @@ -114,15 +133,22 @@ def predict_proba(self, X): }, X, ) - print(f"DEBUG KNeighborsClassifier.predict_proba END: result type={type(result)}", file=sys.stderr) + print( + f"DEBUG KNeighborsClassifier.predict_proba END: result type={type(result)}", + file=sys.stderr, + ) return result @wrap_output_data def score(self, X, y, sample_weight=None): import sys - print(f"DEBUG KNeighborsClassifier.score START: X type={type(X)}, y type={type(y)}", file=sys.stderr) + + print( + f"DEBUG KNeighborsClassifier.score START: X type={type(X)}, y type={type(y)}", + file=sys.stderr, + ) check_is_fitted(self) - + result = dispatch( self, "score", @@ -140,17 +166,21 @@ def score(self, X, y, sample_weight=None): @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): import sys - print(f"DEBUG KNeighborsClassifier.kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) - + + print( + f"DEBUG KNeighborsClassifier.kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", + file=sys.stderr, + ) + # Validate n_neighbors parameter first (before check_is_fitted) if n_neighbors is not None: self._validate_n_neighbors(n_neighbors) - + check_is_fitted(self) - + # Validate kneighbors parameters (inherited from KNeighborsDispatchingBase) self._kneighbors_validation(X, n_neighbors) - + result = dispatch( self, "kneighbors", @@ -162,29 +192,47 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors=n_neighbors, return_distance=return_distance, ) - print(f"DEBUG KNeighborsClassifier.kneighbors END: result type={type(result)}", file=sys.stderr) + print( + f"DEBUG KNeighborsClassifier.kneighbors END: result type={type(result)}", + file=sys.stderr, + ) return result def _onedal_fit(self, X, y, queue=None): import sys - print(f"DEBUG KNeighborsClassifier._onedal_fit START: X type={type(X)}, y type={type(y)}", file=sys.stderr) - + + print( + f"DEBUG KNeighborsClassifier._onedal_fit START: X type={type(X)}, y type={type(y)}", + file=sys.stderr, + ) 
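Taken together, the _onedal_fit changes above encode the classification targets on the sklearnex side and hand classes_, _y and _shape to the backend estimator before fit is called. A minimal NumPy-only sketch of that contract follows; the helper name is illustrative and not part of the patch, and the real code additionally casts the encoded targets to X's dtype:

import numpy as np

def prepare_classification_targets(y):
    # Flatten y, derive the sorted class labels, and encode every target as an
    # integer index so the backend never has to interpret raw labels itself.
    y = np.asarray(y).ravel()
    classes, y_encoded = np.unique(y, return_inverse=True)
    if classes.shape[0] < 2:
        raise ValueError(
            f"The number of classes has to be greater than one; got {classes.shape[0]}"
        )
    # The backend expects classification targets as a column vector.
    return classes, y_encoded.reshape(-1, 1)

# Example: classes_, fit_y = prepare_classification_targets(["a", "b", "a", "c"])
# gives classes_ == array(['a', 'b', 'c']) and fit_y of shape (4, 1).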
+ # Get array namespace for array API support xp, _ = get_namespace(X) print(f"DEBUG: Array namespace: {xp}", file=sys.stderr) - + # REFACTOR: Use validate_data to convert pandas to numpy and validate types # ensure_all_finite=False to allow nan_euclidean metric to work (will fallback to sklearn) X, y = validate_data( - self, X, y, dtype=[xp.float64, xp.float32], accept_sparse="csr", ensure_all_finite=False + self, + X, + y, + dtype=[xp.float64, xp.float32], + accept_sparse="csr", + ensure_all_finite=False, + ) + print( + f"DEBUG: After validate_data, X type={type(X)}, y type={type(y)}", + file=sys.stderr, ) - print(f"DEBUG: After validate_data, X type={type(X)}, y type={type(y)}", file=sys.stderr) - + # REFACTOR STEP 1: Process classification targets in sklearnex before passing to onedal print(f"DEBUG: Processing classification targets in sklearnex", file=sys.stderr) y_processed = self._process_classification_targets(y) - print(f"DEBUG: After _process_classification_targets, y_processed type={type(y_processed)}", file=sys.stderr) - + print( + f"DEBUG: After _process_classification_targets, y_processed type={type(y_processed)}", + file=sys.stderr, + ) + onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, @@ -197,83 +245,133 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ - + # REFACTOR: Pass both original and processed targets to onedal # onedal needs the processed classes_ and _y attributes that we just set self._onedal_estimator.classes_ = self.classes_ self._onedal_estimator._y = self._y self._onedal_estimator.outputs_2d_ = self.outputs_2d_ self._onedal_estimator._shape = self._shape # Pass shape from sklearnex - print(f"DEBUG: Set onedal_estimator.classes_={self._onedal_estimator.classes_}", file=sys.stderr) - print(f"DEBUG: Set onedal_estimator._y shape={self._onedal_estimator._y.shape}", file=sys.stderr) - print(f"DEBUG: Set onedal_estimator._shape={self._onedal_estimator._shape}", file=sys.stderr) - - print(f"DEBUG KNeighborsClassifier._onedal_fit: Calling onedal_estimator.fit with X and original y", file=sys.stderr) + print( + f"DEBUG: Set onedal_estimator.classes_={self._onedal_estimator.classes_}", + file=sys.stderr, + ) + print( + f"DEBUG: Set onedal_estimator._y shape={self._onedal_estimator._y.shape}", + file=sys.stderr, + ) + print( + f"DEBUG: Set onedal_estimator._shape={self._onedal_estimator._shape}", + file=sys.stderr, + ) + + print( + f"DEBUG KNeighborsClassifier._onedal_fit: Calling onedal_estimator.fit with X and original y", + file=sys.stderr, + ) # Pass original y to onedal - it will use the pre-set classes_ and _y attributes we just assigned self._onedal_estimator.fit(X, y, queue=queue) - print(f"DEBUG KNeighborsClassifier._onedal_fit: After fit, calling _save_attributes", file=sys.stderr) + print( + f"DEBUG KNeighborsClassifier._onedal_fit: After fit, calling _save_attributes", + file=sys.stderr, + ) self._save_attributes() - print(f"DEBUG KNeighborsClassifier._onedal_fit END: self._fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print( + f"DEBUG KNeighborsClassifier._onedal_fit END: self._fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", + file=sys.stderr, + ) def _onedal_predict(self, X, queue=None): import sys - print(f"DEBUG KNeighborsClassifier._onedal_predict START: X type={type(X)}", file=sys.stderr) - 
+ + print( + f"DEBUG KNeighborsClassifier._onedal_predict START: X type={type(X)}", + file=sys.stderr, + ) + # Use the unified helper from common.py (calls kneighbors + computes prediction) # This properly handles X=None (LOOCV) case result = self._predict_skl_classification(X) - - print(f"DEBUG KNeighborsClassifier._onedal_predict END: result type={type(result)}", file=sys.stderr) + + print( + f"DEBUG KNeighborsClassifier._onedal_predict END: result type={type(result)}", + file=sys.stderr, + ) return result def _onedal_predict_proba(self, X, queue=None): import sys - print(f"DEBUG KNeighborsClassifier._onedal_predict_proba START: X type={type(X)}", file=sys.stderr) - + + print( + f"DEBUG KNeighborsClassifier._onedal_predict_proba START: X type={type(X)}", + file=sys.stderr, + ) + # Call kneighbors through sklearnex (self.kneighbors is the sklearnex method) # This properly handles X=None case (LOOCV) with query_is_train logic neigh_dist, neigh_ind = self.kneighbors(X) - + # Use the helper method to compute class probabilities result = self._compute_class_probabilities( neigh_dist, neigh_ind, self.weights, self._y, self.classes_, self.outputs_2d_ ) - - print(f"DEBUG KNeighborsClassifier._onedal_predict_proba END: result type={type(result)}", file=sys.stderr) + + print( + f"DEBUG KNeighborsClassifier._onedal_predict_proba END: result type={type(result)}", + file=sys.stderr, + ) return result def _onedal_kneighbors( self, X=None, n_neighbors=None, return_distance=True, queue=None ): import sys - print(f"DEBUG KNeighborsClassifier._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) - + + print( + f"DEBUG KNeighborsClassifier._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", + file=sys.stderr, + ) + # Validate X to convert array API/pandas to numpy and check feature names (only if X is not None) if X is not None: xp, _ = get_namespace(X) X = validate_data( - self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False, ensure_all_finite=False + self, + X, + dtype=[xp.float64, xp.float32], + accept_sparse="csr", + reset=False, + ensure_all_finite=False, ) - + # REFACTOR: All post-processing now in sklearnex following PCA pattern # Prepare inputs and handle query_is_train case X, n_neighbors, query_is_train = self._prepare_kneighbors_inputs(X, n_neighbors) - + # Get raw results from onedal backend result = self._onedal_estimator.kneighbors( X, n_neighbors, return_distance, queue=queue ) - + # Apply post-processing (kd_tree sorting, removing self from results) - result = self._kneighbors_post_processing(X, n_neighbors, return_distance, result, query_is_train) - - print(f"DEBUG KNeighborsClassifier._onedal_kneighbors END: result type={type(result)}", file=sys.stderr) + result = self._kneighbors_post_processing( + X, n_neighbors, return_distance, result, query_is_train + ) + + print( + f"DEBUG KNeighborsClassifier._onedal_kneighbors END: result type={type(result)}", + file=sys.stderr, + ) return result def _onedal_score(self, X, y, sample_weight=None, queue=None): import sys - print(f"DEBUG KNeighborsClassifier._onedal_score START: X type={type(X)}, y type={type(y)}", file=sys.stderr) + + print( + f"DEBUG KNeighborsClassifier._onedal_score START: X type={type(X)}, y type={type(y)}", + file=sys.stderr, + ) # Convert array API to numpy for sklearn's accuracy_score # Note: validate_data does NOT convert array API to numpy, so we do it explicitly y = np.asarray(y) 
@@ -282,19 +380,29 @@ def _onedal_score(self, X, y, sample_weight=None, queue=None): result = accuracy_score( y, self._onedal_predict(X, queue=queue), sample_weight=sample_weight ) - print(f"DEBUG KNeighborsClassifier._onedal_score END: result={result}", file=sys.stderr) + print( + f"DEBUG KNeighborsClassifier._onedal_score END: result={result}", + file=sys.stderr, + ) return result def _save_attributes(self): import sys + print(f"DEBUG KNeighborsClassifier._save_attributes START", file=sys.stderr) self.classes_ = self._onedal_estimator.classes_ self.n_features_in_ = self._onedal_estimator.n_features_in_ self.n_samples_fit_ = self._onedal_estimator.n_samples_fit_ self._fit_X = self._onedal_estimator._fit_X - print(f"DEBUG KNeighborsClassifier._save_attributes: _fit_X type={type(self._fit_X)}", file=sys.stderr) + print( + f"DEBUG KNeighborsClassifier._save_attributes: _fit_X type={type(self._fit_X)}", + file=sys.stderr, + ) self._y = self._onedal_estimator._y - print(f"DEBUG KNeighborsClassifier._save_attributes: _y type={type(self._y)}", file=sys.stderr) + print( + f"DEBUG KNeighborsClassifier._save_attributes: _y type={type(self._y)}", + file=sys.stderr, + ) self._fit_method = self._onedal_estimator._fit_method self.outputs_2d_ = self._onedal_estimator.outputs_2d_ self._tree = self._onedal_estimator._tree @@ -304,4 +412,4 @@ def _save_attributes(self): predict.__doc__ = _sklearn_KNeighborsClassifier.predict.__doc__ predict_proba.__doc__ = _sklearn_KNeighborsClassifier.predict_proba.__doc__ score.__doc__ = _sklearn_KNeighborsClassifier.score.__doc__ - kneighbors.__doc__ = _sklearn_KNeighborsClassifier.kneighbors.__doc__ \ No newline at end of file + kneighbors.__doc__ = _sklearn_KNeighborsClassifier.kneighbors.__doc__ diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 28551460d4..411227d2ca 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -66,7 +66,11 @@ def __init__( def fit(self, X, y): import sys - print(f"DEBUG KNeighborsRegressor.fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", file=sys.stderr) + + print( + f"DEBUG KNeighborsRegressor.fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", + file=sys.stderr, + ) dispatch( self, "fit", @@ -77,15 +81,21 @@ def fit(self, X, y): X, y, ) - print(f"DEBUG KNeighborsRegressor.fit END: _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print( + f"DEBUG KNeighborsRegressor.fit END: _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", + file=sys.stderr, + ) return self @wrap_output_data def predict(self, X): import sys - print(f"DEBUG KNeighborsRegressor.predict START: X type={type(X)}", file=sys.stderr) + + print( + f"DEBUG KNeighborsRegressor.predict START: X type={type(X)}", file=sys.stderr + ) check_is_fitted(self) - + result = dispatch( self, "predict", @@ -95,15 +105,22 @@ def predict(self, X): }, X, ) - print(f"DEBUG KNeighborsRegressor.predict END: result type={type(result)}", file=sys.stderr) + print( + f"DEBUG KNeighborsRegressor.predict END: result type={type(result)}", + file=sys.stderr, + ) return result @wrap_output_data def score(self, X, y, sample_weight=None): import sys - print(f"DEBUG KNeighborsRegressor.score START: X type={type(X)}, y type={type(y)}", file=sys.stderr) + + print( + f"DEBUG KNeighborsRegressor.score START: X type={type(X)}, y type={type(y)}", + file=sys.stderr, + ) check_is_fitted(self) - + result = dispatch( 
self, "score", @@ -121,17 +138,21 @@ def score(self, X, y, sample_weight=None): @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): import sys - print(f"DEBUG KNeighborsRegressor.kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) - + + print( + f"DEBUG KNeighborsRegressor.kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", + file=sys.stderr, + ) + # Validate n_neighbors parameter first (before check_is_fitted) if n_neighbors is not None: self._validate_n_neighbors(n_neighbors) - + check_is_fitted(self) - + # Validate kneighbors parameters (inherited from KNeighborsDispatchingBase) self._kneighbors_validation(X, n_neighbors) - + result = dispatch( self, "kneighbors", @@ -143,30 +164,47 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors=n_neighbors, return_distance=return_distance, ) - print(f"DEBUG KNeighborsRegressor.kneighbors END: result type={type(result)}", file=sys.stderr) + print( + f"DEBUG KNeighborsRegressor.kneighbors END: result type={type(result)}", + file=sys.stderr, + ) return result def _onedal_fit(self, X, y, queue=None): import sys - print(f"DEBUG KNeighborsRegressor._onedal_fit START: X type={type(X)}, y type={type(y)}", file=sys.stderr) - + + print( + f"DEBUG KNeighborsRegressor._onedal_fit START: X type={type(X)}, y type={type(y)}", + file=sys.stderr, + ) + # Get array namespace for array API support xp, _ = get_namespace(X) print(f"DEBUG: Array namespace: {xp}", file=sys.stderr) - + # REFACTOR: Use validate_data to convert pandas to numpy and validate types for X only # ensure_all_finite=False to allow nan_euclidean metric to work (will fallback to sklearn) X = validate_data( - self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", ensure_all_finite=False + self, + X, + dtype=[xp.float64, xp.float32], + accept_sparse="csr", + ensure_all_finite=False, ) - print(f"DEBUG: After validate_data, X type={type(X)}, y type={type(y)}", file=sys.stderr) - + print( + f"DEBUG: After validate_data, X type={type(X)}, y type={type(y)}", + file=sys.stderr, + ) + # REFACTOR: Process regression targets in sklearnex before passing to onedal # This sets _shape and _y attributes print(f"DEBUG: Processing regression targets in sklearnex", file=sys.stderr) y_processed = self._process_regression_targets(y) - print(f"DEBUG: After _process_regression_targets, _shape={self._shape}, _y type={type(self._y)}", file=sys.stderr) - + print( + f"DEBUG: After _process_regression_targets, _shape={self._shape}, _y type={type(self._y)}", + file=sys.stderr, + ) + onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, @@ -179,13 +217,14 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ - + # REFACTOR: Pass pre-processed shape and _y to onedal # For GPU backend, reshape _y to (-1, 1) before passing to onedal from onedal.utils import _sycl_queue_manager as QM + queue_instance = QM.get_global_queue() gpu_device = queue_instance is not None and queue_instance.sycl_device.is_gpu - + self._onedal_estimator._shape = self._shape # REFACTOR: Reshape _y for GPU backend (needs column vector) # Following PCA pattern: all data preparation in sklearnex @@ -193,15 +232,27 @@ def _onedal_fit(self, X, y, queue=None): 
self._onedal_estimator._y = xp.reshape(self._y, (-1, 1)) else: self._onedal_estimator._y = self._y - print(f"DEBUG: Set onedal_estimator._shape={self._onedal_estimator._shape}", file=sys.stderr) - print(f"DEBUG: GPU device={gpu_device}, _y shape={self._onedal_estimator._y.shape}", file=sys.stderr) - - print(f"DEBUG KNeighborsRegressor._onedal_fit: Calling onedal_estimator.fit", file=sys.stderr) + print( + f"DEBUG: Set onedal_estimator._shape={self._onedal_estimator._shape}", + file=sys.stderr, + ) + print( + f"DEBUG: GPU device={gpu_device}, _y shape={self._onedal_estimator._y.shape}", + file=sys.stderr, + ) + + print( + f"DEBUG KNeighborsRegressor._onedal_fit: Calling onedal_estimator.fit", + file=sys.stderr, + ) self._onedal_estimator.fit(X, y, queue=queue) - print(f"DEBUG KNeighborsRegressor._onedal_fit: After fit, calling _save_attributes", file=sys.stderr) + print( + f"DEBUG KNeighborsRegressor._onedal_fit: After fit, calling _save_attributes", + file=sys.stderr, + ) self._save_attributes() - + # REFACTOR: Replicate the EXACT post-fit reshaping from original onedal code # Original onedal code (after fit): # if y is not None and _is_regressor(self): @@ -213,95 +264,150 @@ def _onedal_fit(self, X, y, queue=None): self._y = y if self._shape is None else xp.reshape(y, self._shape) # Also update the onedal estimator's _y since that's what gets used in predict self._onedal_estimator._y = self._y - print(f"DEBUG: After reshape, self._y type={type(self._y)}, shape={getattr(self._y, 'shape', 'NO_SHAPE')}", file=sys.stderr) - - print(f"DEBUG KNeighborsRegressor._onedal_fit END: self._fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print( + f"DEBUG: After reshape, self._y type={type(self._y)}, shape={getattr(self._y, 'shape', 'NO_SHAPE')}", + file=sys.stderr, + ) + + print( + f"DEBUG KNeighborsRegressor._onedal_fit END: self._fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", + file=sys.stderr, + ) def _onedal_predict(self, X, queue=None): import sys - print(f"DEBUG KNeighborsRegressor._onedal_predict START: X type={type(X)}", file=sys.stderr) - + + print( + f"DEBUG KNeighborsRegressor._onedal_predict START: X type={type(X)}", + file=sys.stderr, + ) + # Dispatch between GPU and SKL prediction methods # This logic matches onedal regressor predict() method but computation happens in sklearnex gpu_device = queue is not None and getattr(queue.sycl_device, "is_gpu", False) is_uniform_weights = getattr(self, "weights", "uniform") == "uniform" - + if gpu_device and is_uniform_weights: # GPU path: call onedal backend directly result = self._predict_gpu(X, queue=queue) else: # SKL path: call kneighbors (through sklearnex) then compute in sklearnex result = self._predict_skl(X, queue=queue) - - print(f"DEBUG KNeighborsRegressor._onedal_predict END: result type={type(result)}", file=sys.stderr) + + print( + f"DEBUG KNeighborsRegressor._onedal_predict END: result type={type(result)}", + file=sys.stderr, + ) return result - + def _predict_gpu(self, X, queue=None): """GPU prediction path - calls onedal backend.""" import sys - print(f"DEBUG KNeighborsRegressor._predict_gpu START: X type={type(X)}", file=sys.stderr) + + print( + f"DEBUG KNeighborsRegressor._predict_gpu START: X type={type(X)}", + file=sys.stderr, + ) # Call onedal backend for GPU prediction (X is already validated by predict()) result = self._onedal_estimator._predict_gpu(X) - print(f"DEBUG KNeighborsRegressor._predict_gpu END: result type={type(result)}", file=sys.stderr) + print( + f"DEBUG 
KNeighborsRegressor._predict_gpu END: result type={type(result)}", + file=sys.stderr, + ) return result - + def _predict_skl(self, X, queue=None): """SKL prediction path - calls kneighbors through sklearnex, computes prediction here.""" import sys - print(f"DEBUG KNeighborsRegressor._predict_skl START: X type={type(X)}", file=sys.stderr) - + + print( + f"DEBUG KNeighborsRegressor._predict_skl START: X type={type(X)}", + file=sys.stderr, + ) + # Use the unified helper from common.py (calls kneighbors + computes prediction) result = self._predict_skl_regression(X) - - print(f"DEBUG KNeighborsRegressor._predict_skl END: result type={type(result)}", file=sys.stderr) + + print( + f"DEBUG KNeighborsRegressor._predict_skl END: result type={type(result)}", + file=sys.stderr, + ) return result def _onedal_kneighbors( self, X=None, n_neighbors=None, return_distance=True, queue=None ): import sys - print(f"DEBUG KNeighborsRegressor._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) - + + print( + f"DEBUG KNeighborsRegressor._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", + file=sys.stderr, + ) + # Validate X to convert array API/pandas to numpy and check feature names (only if X is not None) if X is not None: xp, _ = get_namespace(X) X = validate_data( - self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False, ensure_all_finite=False + self, + X, + dtype=[xp.float64, xp.float32], + accept_sparse="csr", + reset=False, + ensure_all_finite=False, ) - + # REFACTOR: All post-processing now in sklearnex following PCA pattern # Prepare inputs and handle query_is_train case X, n_neighbors, query_is_train = self._prepare_kneighbors_inputs(X, n_neighbors) - + # Get raw results from onedal backend result = self._onedal_estimator.kneighbors( X, n_neighbors, return_distance, queue=queue ) - + # Apply post-processing (kd_tree sorting, removing self from results) - result = self._kneighbors_post_processing(X, n_neighbors, return_distance, result, query_is_train) - - print(f"DEBUG KNeighborsRegressor._onedal_kneighbors END: result type={type(result)}", file=sys.stderr) + result = self._kneighbors_post_processing( + X, n_neighbors, return_distance, result, query_is_train + ) + + print( + f"DEBUG KNeighborsRegressor._onedal_kneighbors END: result type={type(result)}", + file=sys.stderr, + ) return result def _onedal_score(self, X, y, sample_weight=None, queue=None): import sys - print(f"DEBUG KNeighborsRegressor._onedal_score START: X type={type(X)}, y type={type(y)}", file=sys.stderr) + + print( + f"DEBUG KNeighborsRegressor._onedal_score START: X type={type(X)}, y type={type(y)}", + file=sys.stderr, + ) result = r2_score( y, self._onedal_predict(X, queue=queue), sample_weight=sample_weight ) - print(f"DEBUG KNeighborsRegressor._onedal_score END: result={result}", file=sys.stderr) + print( + f"DEBUG KNeighborsRegressor._onedal_score END: result={result}", + file=sys.stderr, + ) return result def _save_attributes(self): import sys + print(f"DEBUG KNeighborsRegressor._save_attributes START", file=sys.stderr) self.n_features_in_ = self._onedal_estimator.n_features_in_ self.n_samples_fit_ = self._onedal_estimator.n_samples_fit_ self._fit_X = self._onedal_estimator._fit_X - print(f"DEBUG KNeighborsRegressor._save_attributes: _fit_X type={type(self._fit_X)}", file=sys.stderr) + print( + f"DEBUG KNeighborsRegressor._save_attributes: _fit_X type={type(self._fit_X)}", + 
file=sys.stderr, + ) self._y = self._onedal_estimator._y - print(f"DEBUG KNeighborsRegressor._save_attributes: _y type={type(self._y)}", file=sys.stderr) + print( + f"DEBUG KNeighborsRegressor._save_attributes: _y type={type(self._y)}", + file=sys.stderr, + ) self._fit_method = self._onedal_estimator._fit_method self._tree = self._onedal_estimator._tree print(f"DEBUG KNeighborsRegressor._save_attributes END", file=sys.stderr) @@ -309,4 +415,4 @@ def _save_attributes(self): fit.__doc__ = _sklearn_KNeighborsRegressor.__doc__ predict.__doc__ = _sklearn_KNeighborsRegressor.predict.__doc__ kneighbors.__doc__ = _sklearn_KNeighborsRegressor.kneighbors.__doc__ - score.__doc__ = _sklearn_KNeighborsRegressor.score.__doc__ \ No newline at end of file + score.__doc__ = _sklearn_KNeighborsRegressor.score.__doc__ diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 8c9421843b..a8e8988bf8 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -64,7 +64,10 @@ def __init__( ) def fit(self, X, y=None): - print(f"DEBUG NearestNeighbors.fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", file=sys.stderr) + print( + f"DEBUG NearestNeighbors.fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", + file=sys.stderr, + ) dispatch( self, "fit", @@ -75,22 +78,28 @@ def fit(self, X, y=None): X, None, ) - print(f"DEBUG NearestNeighbors.fit END: _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}, _fit_X shape={getattr(getattr(self, '_fit_X', None), 'shape', 'NO_SHAPE')}", file=sys.stderr) + print( + f"DEBUG NearestNeighbors.fit END: _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}, _fit_X shape={getattr(getattr(self, '_fit_X', None), 'shape', 'NO_SHAPE')}", + file=sys.stderr, + ) return self @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): - print(f"DEBUG NearestNeighbors.kneighbors START: X type={type(X)}, _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) - + print( + f"DEBUG NearestNeighbors.kneighbors START: X type={type(X)}, _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", + file=sys.stderr, + ) + # Validate n_neighbors parameter first (before check_is_fitted) if n_neighbors is not None: self._validate_n_neighbors(n_neighbors) - + check_is_fitted(self) - + # Validate kneighbors parameters (inherited from KNeighborsDispatchingBase) self._kneighbors_validation(X, n_neighbors) - + result = dispatch( self, "kneighbors", @@ -102,23 +111,38 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors=n_neighbors, return_distance=return_distance, ) - print(f"DEBUG NearestNeighbors.kneighbors END: result type={type(result)}", file=sys.stderr) + print( + f"DEBUG NearestNeighbors.kneighbors END: result type={type(result)}", + file=sys.stderr, + ) return result @wrap_output_data def radius_neighbors( self, X=None, radius=None, return_distance=True, sort_results=False ): - print(f"DEBUG NearestNeighbors.radius_neighbors START: X type={type(X)}, _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}, _fit_X shape={getattr(getattr(self, '_fit_X', None), 'shape', 'NO_SHAPE')}", file=sys.stderr) - print(f"DEBUG radius_neighbors: hasattr _onedal_estimator={hasattr(self, '_onedal_estimator')}, _tree={getattr(self, '_tree', 'NOT_SET')}, _fit_method={getattr(self, '_fit_method', 'NOT_SET')}", file=sys.stderr) + print( + f"DEBUG NearestNeighbors.radius_neighbors 
START: X type={type(X)}, _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}, _fit_X shape={getattr(getattr(self, '_fit_X', None), 'shape', 'NO_SHAPE')}", + file=sys.stderr, + ) + print( + f"DEBUG radius_neighbors: hasattr _onedal_estimator={hasattr(self, '_onedal_estimator')}, _tree={getattr(self, '_tree', 'NOT_SET')}, _fit_method={getattr(self, '_fit_method', 'NOT_SET')}", + file=sys.stderr, + ) if ( hasattr(self, "_onedal_estimator") or getattr(self, "_tree", 0) is None and self._fit_method == "kd_tree" ): - print(f"DEBUG radius_neighbors: Calling sklearn fit with _fit_X type={type(self._fit_X)}", file=sys.stderr) + print( + f"DEBUG radius_neighbors: Calling sklearn fit with _fit_X type={type(self._fit_X)}", + file=sys.stderr, + ) _sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, "_y", None)) - print(f"DEBUG radius_neighbors: sklearn fit completed, _fit_X type now={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print( + f"DEBUG radius_neighbors: sklearn fit completed, _fit_X type now={type(getattr(self, '_fit_X', 'NOT_SET'))}", + file=sys.stderr, + ) check_is_fitted(self) result = dispatch( self, @@ -132,7 +156,10 @@ def radius_neighbors( return_distance=return_distance, sort_results=sort_results, ) - print(f"DEBUG NearestNeighbors.radius_neighbors END: result type={type(result)}", file=sys.stderr) + print( + f"DEBUG NearestNeighbors.radius_neighbors END: result type={type(result)}", + file=sys.stderr, + ) return result def radius_neighbors_graph( @@ -152,18 +179,25 @@ def radius_neighbors_graph( ) def _onedal_fit(self, X, y=None, queue=None): - print(f"DEBUG NearestNeighbors._onedal_fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", file=sys.stderr) - + print( + f"DEBUG NearestNeighbors._onedal_fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", + file=sys.stderr, + ) + # Get array namespace for array API support xp, _ = get_namespace(X) - + # REFACTOR: Use validate_data to convert pandas to numpy and validate types # ensure_all_finite=False to allow nan_euclidean metric to work (will fallback to sklearn) X = validate_data( - self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", ensure_all_finite=False + self, + X, + dtype=[xp.float64, xp.float32], + accept_sparse="csr", + ensure_all_finite=False, ) print(f"DEBUG: After validate_data, X type={type(X)}", file=sys.stderr) - + onedal_params = { "n_neighbors": self.n_neighbors, "algorithm": self.algorithm, @@ -175,19 +209,33 @@ def _onedal_fit(self, X, y=None, queue=None): self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ - print(f"DEBUG NearestNeighbors._onedal_fit: Calling onedal_estimator.fit", file=sys.stderr) + print( + f"DEBUG NearestNeighbors._onedal_fit: Calling onedal_estimator.fit", + file=sys.stderr, + ) self._onedal_estimator.fit(X, y, queue=queue) - print(f"DEBUG NearestNeighbors._onedal_fit: After fit, onedal_estimator._fit_X type={type(getattr(self._onedal_estimator, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print( + f"DEBUG NearestNeighbors._onedal_fit: After fit, onedal_estimator._fit_X type={type(getattr(self._onedal_estimator, '_fit_X', 'NOT_SET'))}", + file=sys.stderr, + ) self._save_attributes() - print(f"DEBUG NearestNeighbors._onedal_fit END: self._fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print( + f"DEBUG 
NearestNeighbors._onedal_fit END: self._fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", + file=sys.stderr, + ) def _onedal_predict(self, X, queue=None): # Validate and convert X (pandas to numpy if needed) only if X is not None if X is not None: xp, _ = get_namespace(X) X = validate_data( - self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False, force_all_finite=False + self, + X, + dtype=[xp.float64, xp.float32], + accept_sparse="csr", + reset=False, + force_all_finite=False, ) return self._onedal_estimator.predict(X, queue=queue) @@ -195,50 +243,79 @@ def _onedal_kneighbors( self, X=None, n_neighbors=None, return_distance=True, queue=None ): import sys - print(f"DEBUG NearestNeighbors._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", file=sys.stderr) - + + print( + f"DEBUG NearestNeighbors._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", + file=sys.stderr, + ) + # Validate X to convert array API/pandas to numpy and check feature names (only if X is not None) if X is not None: xp, _ = get_namespace(X) X = validate_data( - self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False, ensure_all_finite=False + self, + X, + dtype=[xp.float64, xp.float32], + accept_sparse="csr", + reset=False, + ensure_all_finite=False, ) - + # REFACTOR: All post-processing now in sklearnex following PCA pattern # Prepare inputs and handle query_is_train case (includes validation AFTER +=1) X, n_neighbors, query_is_train = self._prepare_kneighbors_inputs(X, n_neighbors) - + # Get raw results from onedal backend result = self._onedal_estimator.kneighbors( X, n_neighbors, return_distance, queue=queue ) - + # Apply post-processing (kd_tree sorting, removing self from results) - result = self._kneighbors_post_processing(X, n_neighbors, return_distance, result, query_is_train) - - print(f"DEBUG NearestNeighbors._onedal_kneighbors END: result type={type(result)}", file=sys.stderr) + result = self._kneighbors_post_processing( + X, n_neighbors, return_distance, result, query_is_train + ) + + print( + f"DEBUG NearestNeighbors._onedal_kneighbors END: result type={type(result)}", + file=sys.stderr, + ) return result def _save_attributes(self): - print(f"DEBUG NearestNeighbors._save_attributes START: onedal_estimator._fit_X type={type(getattr(self._onedal_estimator, '_fit_X', 'NOT_SET'))}", file=sys.stderr) - if hasattr(self._onedal_estimator, '_fit_X'): + print( + f"DEBUG NearestNeighbors._save_attributes START: onedal_estimator._fit_X type={type(getattr(self._onedal_estimator, '_fit_X', 'NOT_SET'))}", + file=sys.stderr, + ) + if hasattr(self._onedal_estimator, "_fit_X"): fit_x_preview = str(self._onedal_estimator._fit_X)[:200] - print(f"DEBUG _save_attributes: _fit_X value preview={fit_x_preview}", file=sys.stderr) + print( + f"DEBUG _save_attributes: _fit_X value preview={fit_x_preview}", + file=sys.stderr, + ) self.classes_ = self._onedal_estimator.classes_ self.n_features_in_ = self._onedal_estimator.n_features_in_ self.n_samples_fit_ = self._onedal_estimator.n_samples_fit_ # ORIGINAL MAIN BRANCH: Direct assignment without any tuple extraction self._fit_X = self._onedal_estimator._fit_X - print(f"DEBUG _save_attributes: AFTER assignment - self._fit_X type={type(self._fit_X)}, has shape attr={hasattr(self._fit_X, 'shape')}", file=sys.stderr) - if hasattr(self._fit_X, 'shape'): - print(f"DEBUG _save_attributes: self._fit_X.shape={self._fit_X.shape}", 
file=sys.stderr) + print( + f"DEBUG _save_attributes: AFTER assignment - self._fit_X type={type(self._fit_X)}, has shape attr={hasattr(self._fit_X, 'shape')}", + file=sys.stderr, + ) + if hasattr(self._fit_X, "shape"): + print( + f"DEBUG _save_attributes: self._fit_X.shape={self._fit_X.shape}", + file=sys.stderr, + ) self._fit_method = self._onedal_estimator._fit_method self._tree = self._onedal_estimator._tree - print(f"DEBUG NearestNeighbors._save_attributes END: _fit_method={self._fit_method}, _tree={self._tree}", file=sys.stderr) + print( + f"DEBUG NearestNeighbors._save_attributes END: _fit_method={self._fit_method}, _tree={self._tree}", + file=sys.stderr, + ) fit.__doc__ = _sklearn_NearestNeighbors.__doc__ kneighbors.__doc__ = _sklearn_NearestNeighbors.kneighbors.__doc__ radius_neighbors.__doc__ = _sklearn_NearestNeighbors.radius_neighbors.__doc__ radius_neighbors_graph.__doc__ = ( _sklearn_NearestNeighbors.radius_neighbors_graph.__doc__ - ) \ No newline at end of file + ) From 342b838273a3b9c5e6dfd1fc885cb8e562aceb65 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 20 Oct 2025 14:37:08 -0700 Subject: [PATCH 57/87] fix: remove ensure finite and reformat --- .../tests/test_knn_classification.py | 47 +++++++++++++++---- sklearnex/neighbors/_lof.py | 1 - sklearnex/neighbors/knn_classification.py | 3 -- sklearnex/neighbors/knn_regression.py | 3 -- sklearnex/neighbors/knn_unsupervised.py | 3 -- 5 files changed, 37 insertions(+), 20 deletions(-) diff --git a/onedal/neighbors/tests/test_knn_classification.py b/onedal/neighbors/tests/test_knn_classification.py index 783d9d6e24..80b0816cde 100755 --- a/onedal/neighbors/tests/test_knn_classification.py +++ b/onedal/neighbors/tests/test_knn_classification.py @@ -19,23 +19,34 @@ from numpy.testing import assert_array_equal from sklearn import datasets +from onedal.tests.utils._device_selection import get_queues + # REFACTOR: Import from sklearnex instead of onedal # Classification processing now happens in sklearnex layer from sklearnex.neighbors import KNeighborsClassifier -from onedal.tests.utils._device_selection import get_queues @pytest.mark.parametrize("queue", get_queues()) def test_iris(queue): import sys + print(f"\n=== DEBUG test_iris START: queue={queue} ===", file=sys.stderr) # REFACTOR NOTE: queue parameter not used with sklearnex, but kept for test parametrization iris = datasets.load_iris() - print(f"DEBUG test: iris.data type={type(iris.data)}, shape={iris.data.shape}", file=sys.stderr) - print(f"DEBUG test: iris.target type={type(iris.target)}, shape={iris.target.shape}", file=sys.stderr) + print( + f"DEBUG test: iris.data type={type(iris.data)}, shape={iris.data.shape}", + file=sys.stderr, + ) + print( + f"DEBUG test: iris.target type={type(iris.target)}, shape={iris.target.shape}", + file=sys.stderr, + ) print(f"DEBUG test: Creating KNeighborsClassifier and calling fit", file=sys.stderr) clf = KNeighborsClassifier(2).fit(iris.data, iris.target) - print(f"DEBUG test: fit completed, clf._fit_X type={type(getattr(clf, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print( + f"DEBUG test: fit completed, clf._fit_X type={type(getattr(clf, '_fit_X', 'NOT_SET'))}", + file=sys.stderr, + ) print(f"DEBUG test: Calling score", file=sys.stderr) score = clf.score(iris.data, iris.target) print(f"DEBUG test: score completed, score={score}", file=sys.stderr) @@ -47,19 +58,32 @@ def test_iris(queue): @pytest.mark.parametrize("queue", get_queues()) def test_pickle(queue): import sys + print(f"\n=== DEBUG test_pickle START: queue={queue} ===", 
file=sys.stderr) # REFACTOR NOTE: queue parameter not used with sklearnex, but kept for test parametrization if queue and queue.sycl_device.is_gpu: pytest.skip("KNN classifier pickling for the GPU sycl_queue is buggy.") iris = datasets.load_iris() - print(f"DEBUG test: iris.data type={type(iris.data)}, shape={iris.data.shape}", file=sys.stderr) - print(f"DEBUG test: iris.target type={type(iris.target)}, shape={iris.target.shape}", file=sys.stderr) + print( + f"DEBUG test: iris.data type={type(iris.data)}, shape={iris.data.shape}", + file=sys.stderr, + ) + print( + f"DEBUG test: iris.target type={type(iris.target)}, shape={iris.target.shape}", + file=sys.stderr, + ) print(f"DEBUG test: Creating KNeighborsClassifier and calling fit", file=sys.stderr) clf = KNeighborsClassifier(2).fit(iris.data, iris.target) - print(f"DEBUG test: fit completed, clf._fit_X type={type(getattr(clf, '_fit_X', 'NOT_SET'))}", file=sys.stderr) + print( + f"DEBUG test: fit completed, clf._fit_X type={type(getattr(clf, '_fit_X', 'NOT_SET'))}", + file=sys.stderr, + ) print(f"DEBUG test: Calling predict", file=sys.stderr) expected = clf.predict(iris.data) - print(f"DEBUG test: predict completed, expected type={type(expected)}, shape={expected.shape}", file=sys.stderr) + print( + f"DEBUG test: predict completed, expected type={type(expected)}, shape={expected.shape}", + file=sys.stderr, + ) import pickle @@ -71,6 +95,9 @@ def test_pickle(queue): assert type(clf2) == clf.__class__ print(f"DEBUG test: Calling predict on unpickled classifier", file=sys.stderr) result = clf2.predict(iris.data) - print(f"DEBUG test: predict completed, result type={type(result)}, shape={result.shape}", file=sys.stderr) + print( + f"DEBUG test: predict completed, result type={type(result)}, shape={result.shape}", + file=sys.stderr, + ) assert_array_equal(expected, result) - print(f"=== DEBUG test_pickle END ===\n", file=sys.stderr) \ No newline at end of file + print(f"=== DEBUG test_pickle END ===\n", file=sys.stderr) diff --git a/sklearnex/neighbors/_lof.py b/sklearnex/neighbors/_lof.py index 0676b6988f..374ae9c1bb 100644 --- a/sklearnex/neighbors/_lof.py +++ b/sklearnex/neighbors/_lof.py @@ -253,7 +253,6 @@ def score_samples(self, X): dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False, - ensure_all_finite=False, ) # check_feature_names(self, X, reset=False) diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index 8c4db1931d..183ef1f4ba 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -211,14 +211,12 @@ def _onedal_fit(self, X, y, queue=None): print(f"DEBUG: Array namespace: {xp}", file=sys.stderr) # REFACTOR: Use validate_data to convert pandas to numpy and validate types - # ensure_all_finite=False to allow nan_euclidean metric to work (will fallback to sklearn) X, y = validate_data( self, X, y, dtype=[xp.float64, xp.float32], accept_sparse="csr", - ensure_all_finite=False, ) print( f"DEBUG: After validate_data, X type={type(X)}, y type={type(y)}", @@ -342,7 +340,6 @@ def _onedal_kneighbors( dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False, - ensure_all_finite=False, ) # REFACTOR: All post-processing now in sklearnex following PCA pattern diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 411227d2ca..9caf14af4b 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -183,13 +183,11 @@ def _onedal_fit(self, X, y, 
queue=None): print(f"DEBUG: Array namespace: {xp}", file=sys.stderr) # REFACTOR: Use validate_data to convert pandas to numpy and validate types for X only - # ensure_all_finite=False to allow nan_euclidean metric to work (will fallback to sklearn) X = validate_data( self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", - ensure_all_finite=False, ) print( f"DEBUG: After validate_data, X type={type(X)}, y type={type(y)}", @@ -353,7 +351,6 @@ def _onedal_kneighbors( dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False, - ensure_all_finite=False, ) # REFACTOR: All post-processing now in sklearnex following PCA pattern diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index a8e8988bf8..731b36e7cc 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -188,13 +188,11 @@ def _onedal_fit(self, X, y=None, queue=None): xp, _ = get_namespace(X) # REFACTOR: Use validate_data to convert pandas to numpy and validate types - # ensure_all_finite=False to allow nan_euclidean metric to work (will fallback to sklearn) X = validate_data( self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", - ensure_all_finite=False, ) print(f"DEBUG: After validate_data, X type={type(X)}", file=sys.stderr) @@ -258,7 +256,6 @@ def _onedal_kneighbors( dtype=[xp.float64, xp.float32], accept_sparse="csr", reset=False, - ensure_all_finite=False, ) # REFACTOR: All post-processing now in sklearnex following PCA pattern From a46cc59d02a6083c9ee75c6ff0306191478f464f Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 20 Oct 2025 14:39:47 -0700 Subject: [PATCH 58/87] fix: format --- sklearnex/tests/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/tests/test_common.py b/sklearnex/tests/test_common.py index 7921b1e24b..435d7359da 100644 --- a/sklearnex/tests/test_common.py +++ b/sklearnex/tests/test_common.py @@ -569,4 +569,4 @@ def test_estimator(estimator, method, design_pattern, estimator_trace): if key in _DESIGN_RULE_VIOLATIONS: pytest.xfail(_DESIGN_RULE_VIOLATIONS[key]) else: - raise \ No newline at end of file + raise From 43283cde6548438866c98103ecb40ea82f909cad Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 20 Oct 2025 15:31:31 -0700 Subject: [PATCH 59/87] fix: fix patching type error --- sklearnex/neighbors/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 3948a32121..0a6f7b09d3 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -560,7 +560,7 @@ def _process_classification_targets(self, y): y_k = np.asarray(y[:, k]) classes, indices = np.unique(y_k, return_inverse=True) self.classes_.append(classes) - self._y[:, k] = xp.asarray(indices) + self._y[:, k] = xp.asarray(indices, dtype=xp.int32) if not self.outputs_2d_: self.classes_ = self.classes_[0] From d734e1f64f89695904653a02b129623db58eceea Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 20 Oct 2025 15:36:05 -0700 Subject: [PATCH 60/87] fix: update doc --- doc/sources/array_api.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/sources/array_api.rst b/doc/sources/array_api.rst index b2eb7a8bee..9ed34ea49e 100644 --- a/doc/sources/array_api.rst +++ b/doc/sources/array_api.rst @@ -96,6 +96,10 @@ The following patched classes have support for array API inputs: - :obj:`sklearn.linear_model.Ridge` - :obj:`sklearnex.linear_model.IncrementalLinearRegression` - 
:obj:`sklearnex.linear_model.IncrementalRidge` +- :obj:`sklearn.neighbors.KNeighborsClassifier` +- :obj:`sklearn.neighbors.KNeighborsRegressor` +- :obj:`sklearn.neighbors.NearestNeighbors` +- :obj:`sklearn.neighbors.LocalOutlierFactor` .. note:: While full array API support is currently not implemented for all classes, :external+dpnp:doc:`dpnp.ndarray ` From 8c9246dc1d26d073771ee1091f3b90c4ccc62ab8 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 20 Oct 2025 16:28:06 -0700 Subject: [PATCH 61/87] fix: fix patching error --- sklearnex/neighbors/common.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 0a6f7b09d3..197739d3e3 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -255,17 +255,10 @@ def _compute_class_probabilities( for neighbor_idx in range(pred_labels.shape[1]): class_label = int(pred_labels[sample_idx, neighbor_idx]) weight = weights[sample_idx, neighbor_idx] - # Update probability for this class - sample_proba = xp.asarray( - [ - ( - sample_proba[i] + weight - if i == class_label - else sample_proba[i] - ) - for i in range(classes_k.size) - ] - ) + # Update probability for this class using array indexing + # Create a mask for this class and add weight where mask is True + mask = xp.arange(classes_k.size) == class_label + sample_proba = sample_proba + xp.where(mask, weight, 0.0) proba_list.append(sample_proba) proba_k = xp.stack(proba_list, axis=0) # Shape: (n_queries, n_classes) From 4cb7ed34831c2ab948b30aeef0ef992a95d23583 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 20 Oct 2025 22:07:50 -0700 Subject: [PATCH 62/87] fix: attribute error --- sklearnex/neighbors/common.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 197739d3e3..51169306c4 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -249,6 +249,7 @@ def _compute_class_probabilities( # Array API: Cannot use fancy indexing __setitem__ like proba_k[all_rows, idx] = ... 
# Instead, build probabilities sample by sample proba_list = [] + zero_weight = xp.asarray(0.0, dtype=xp.float64) for sample_idx in range(n_queries): sample_proba = xp.zeros((classes_k.size,), dtype=xp.float64) # For this sample, accumulate weights for each neighbor's predicted class @@ -258,13 +259,16 @@ def _compute_class_probabilities( # Update probability for this class using array indexing # Create a mask for this class and add weight where mask is True mask = xp.arange(classes_k.size) == class_label - sample_proba = sample_proba + xp.where(mask, weight, 0.0) + sample_proba = sample_proba + xp.where(mask, weight, zero_weight) proba_list.append(sample_proba) proba_k = xp.stack(proba_list, axis=0) # Shape: (n_queries, n_classes) # normalize 'votes' into real [0,1] probabilities normalizer = xp.sum(proba_k, axis=1)[:, xp.newaxis] - normalizer[normalizer == 0.0] = 1.0 + # Use array scalar for comparison and assignment + zero_scalar = xp.asarray(0.0, dtype=xp.float64) + one_scalar = xp.asarray(1.0, dtype=xp.float64) + normalizer = xp.where(normalizer == zero_scalar, one_scalar, normalizer) proba_k /= normalizer probabilities.append(proba_k) From 95fff2141e99bc6228402abcd1c41ef65760f158 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 20 Oct 2025 23:19:10 -0700 Subject: [PATCH 63/87] fix: patchnig AttributeError --- sklearnex/neighbors/knn_regression.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 9caf14af4b..cdce6a9df9 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -376,13 +376,20 @@ def _onedal_kneighbors( def _onedal_score(self, X, y, sample_weight=None, queue=None): import sys + from onedal._device_offload import _transfer_to_host + print( f"DEBUG KNeighborsRegressor._onedal_score START: X type={type(X)}, y type={type(y)}", file=sys.stderr, ) - result = r2_score( - y, self._onedal_predict(X, queue=queue), sample_weight=sample_weight - ) + y_pred = self._onedal_predict(X, queue=queue) + + # Convert array API/USM arrays back to numpy for r2_score + # r2_score doesn't support Array API, following PCA's pattern with _transfer_to_host + _, host_data = _transfer_to_host(y, y_pred, sample_weight) + y, y_pred, sample_weight = host_data + + result = r2_score(y, y_pred, sample_weight=sample_weight) print( f"DEBUG KNeighborsRegressor._onedal_score END: result={result}", file=sys.stderr, From b250c46507fdb77b3f76b2cc49c73bd8bbb3c9e7 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 21 Oct 2025 00:53:56 -0700 Subject: [PATCH 64/87] fix: remove print and commented code --- onedal/neighbors/neighbors.py | 412 +----------------- .../tests/test_knn_classification.py | 55 +-- sklearnex/neighbors/_lof.py | 77 +--- sklearnex/neighbors/common.py | 46 -- sklearnex/neighbors/knn_classification.py | 163 +------ sklearnex/neighbors/knn_regression.py | 162 +------ sklearnex/neighbors/knn_unsupervised.py | 106 +---- 7 files changed, 18 insertions(+), 1003 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index 281caf6d63..6efbe366db 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -14,29 +14,13 @@ # limitations under the License. 
# ============================================================================== -import sys from abc import ABCMeta, abstractmethod -from numbers import Integral - -import numpy as np - from onedal._device_offload import supports_queue from onedal.common._backend import bind_default_backend from onedal.utils import _sycl_queue_manager as QM - -from .._config import _get_config from ..common._estimator_checks import _check_is_fitted, _is_classifier, _is_regressor from ..common._mixin import ClassifierMixin, RegressorMixin from ..datatypes import from_table, to_table -from ..utils._array_api import _get_sycl_namespace -from ..utils.validation import ( - _check_array, - _check_classification_targets, - _check_n_features, - _check_X_y, - _column_or_1d, - _num_samples, -) class NeighborsCommonBase(metaclass=ABCMeta): @@ -77,69 +61,6 @@ def infer(self, *args, **kwargs): ... @abstractmethod def _onedal_fit(self, X, y): ... - # def _validate_data( - # self, X, y=None, reset=True, validate_separately=None, **check_params - # ): - # if y is None: - # if self.requires_y: - # raise ValueError( - # f"This {self.__class__.__name__} estimator " - # f"requires y to be passed, but the target y is None." - # ) - # X = _check_array(X, **check_params) - # out = X, y - # else: - # if validate_separately: - # # We need this because some estimators validate X and y - # # separately, and in general, separately calling _check_array() - # # on X and y isn't equivalent to just calling _check_X_y() - # # :( - # check_X_params, check_y_params = validate_separately - # X = _check_array(X, **check_X_params) - # y = _check_array(y, **check_y_params) - # else: - # X, y = _check_X_y(X, y, **check_params) - # out = X, y - - # if check_params.get("ensure_2d", True): - # _check_n_features(self, X, reset=reset) - - # return out - - # REFACTOR: _get_weights moved to sklearnex/neighbors/common.py - # All prediction logic now in sklearnex layer, so this method is no longer needed in onedal - # Original code kept for reference only - # def _get_weights(self, dist, weights): - # if weights in (None, "uniform"): - # return None - # if weights == "distance": - # # if user attempts to classify a point that was zero distance from one - # # or more training points, those training points are weighted as 1.0 - # # and the other points as 0.0 - # if dist.dtype is np.dtype(object): - # for point_dist_i, point_dist in enumerate(dist): - # # check if point_dist is iterable - # # (ex: RadiusNeighborClassifier.predict may set an element of - # # dist to 1e-6 to represent an 'outlier') - # if hasattr(point_dist, "__contains__") and 0.0 in point_dist: - # dist[point_dist_i] = point_dist == 0.0 - # else: - # dist[point_dist_i] = 1.0 / point_dist - # else: - # with np.errstate(divide="ignore"): - # dist = 1.0 / dist - # inf_mask = np.isinf(dist) - # inf_row = np.any(inf_mask, axis=1) - # dist[inf_row] = inf_mask[inf_row] - # return dist - # elif callable(weights): - # return weights(dist) - # else: - # raise ValueError( - # "weights not recognized: should be 'uniform', " - # "'distance', or a callable function" - # ) - def _get_onedal_params(self, X, y=None, n_neighbors=None): class_count = 0 if self.classes_ is None else len(self.classes_) weights = getattr(self, "weights", "uniform") @@ -180,33 +101,7 @@ def __init__( self.p = p self.metric_params = metric_params - # REFACTOR: _validate_targets commented out - all data conversion/validation moved to sklearnex layer - # Following PCA pattern: onedal should not do any data type conversion - # The 
sklearnex layer prepares data in the correct format before calling onedal - # Original code kept for reference: - # def _validate_targets(self, y, dtype): - # arr = _column_or_1d(y, warn=True) - # - # try: - # return arr.astype(dtype, copy=False) - # except ValueError: - # return arr - - # REFACTOR NOTE: _validate_n_classes moved to sklearnex/neighbors/common.py - # This method is no longer used in the onedal layer - all validation happens in sklearnex - # Commented out for reference only - # def _validate_n_classes(self): - # length = 0 if self.classes_ is None else len(self.classes_) - # if length < 2: - # raise ValueError( - # f"The number of classes has to be greater than one; got {length}" - # ) - def _fit(self, X, y): - print( - f"DEBUG oneDAL _fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", - file=sys.stderr, - ) self._onedal_model = None self._tree = None # REFACTOR: Shape processing moved to sklearnex layer @@ -223,21 +118,10 @@ def _fit(self, X, y): self, "effective_metric_params_", self.metric_params ) - # _, xp, _ = _get_sycl_namespace(X) # REFACTOR: _validate_data call commented out - validation now happens in sklearnex layer # Original code kept for reference: # use_raw_input = _get_config().get("use_raw_input", False) is True if y is not None or self.requires_y: - # REFACTOR: Shape processing commented out - should be done in sklearnex layer - # Original code kept for reference: - # shape = getattr(y, "shape", None) - # REFACTOR: _validate_data call commented out - validation now happens in sklearnex layer - # if not use_raw_input: - # X, y = super()._validate_data( - # X, y, dtype=[np.float64, np.float32], accept_sparse="csr" - # ) - # self._shape = shape if shape is not None else y.shape - # REFACTOR: Classification target processing moved to sklearnex layer # This code is now commented out - processing MUST happen in sklearnex before calling fit # Assertion: Verify that sklearnex has done the preprocessing @@ -252,54 +136,12 @@ def _fit(self, X, y): "Classification target processing must be done in sklearnex layer before calling onedal fit. " "_y attribute is not set. This indicates the refactoring is incomplete." 
) - print( - f"DEBUG oneDAL: Using pre-processed classification targets from sklearnex (classes_={self.classes_})", - file=sys.stderr, - ) - - # Original classification processing code - NOW COMMENTED OUT (moved to sklearnex) - # if _is_classifier(self): - # if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1: - # self.outputs_2d_ = False - # y = y.reshape((-1, 1)) - # else: - # self.outputs_2d_ = True - - # _check_classification_targets(y) - # self.classes_ = [] - # self._y = np.empty(y.shape, dtype=int) - # for k in range(self._y.shape[1]): - # classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True) - # self.classes_.append(classes) - - # if not self.outputs_2d_: - # self.classes_ = self.classes_[0] - # self._y = self._y.ravel() - - # self._validate_n_classes() - # else: else: # For regressors, just store y self._y = y - # REFACTOR: _validate_data call commented out - validation now happens in sklearnex layer - # elif not use_raw_input: - # X, _ = super()._validate_data(X, dtype=[np.float64, np.float32]) - self.n_samples_fit_ = X.shape[0] self.n_features_in_ = X.shape[1] self._fit_X = X - - # REFACTOR: n_neighbors validation commented out - should be done in sklearnex layer - # Original code kept for reference: - # if self.n_neighbors is not None: - # if self.n_neighbors <= 0: - # raise ValueError("Expected n_neighbors > 0. Got %d" % self.n_neighbors) - # if not isinstance(self.n_neighbors, Integral): - # raise TypeError( - # "n_neighbors does not take %s value, " - # "enter integer value" % type(self.n_neighbors) - # ) - self._fit_method = super()._parse_auto_method( self.algorithm, self.n_samples_fit_, self.n_features_in_ ) @@ -307,54 +149,16 @@ def _fit(self, X, y): _fit_y = None queue = QM.get_global_queue() gpu_device = queue is not None and queue.sycl_device.is_gpu - - print( - f"DEBUG oneDAL _fit: Before _onedal_fit, X type={type(X)}, _fit_y type={type(_fit_y)}", - file=sys.stderr, - ) - # REFACTOR: All data preparation including reshaping moved to sklearnex layer - # Following PCA pattern: onedal is a thin wrapper, no data manipulation - # sklearnex prepares self._y in the correct shape before calling fit() - # Original code kept for reference: - # if _is_classifier(self) or (_is_regressor(self) and gpu_device): - # _fit_y = self._validate_targets(self._y, X.dtype).reshape((-1, 1)) - # OR for refactor without _validate_targets: - # _fit_y = self._y.reshape((-1, 1)) - - # REFACTOR: Just pass self._y as-is - sklearnex should have already reshaped it + # Just pass self._y as-is - sklearnex should have already reshaped it if _is_classifier(self) or (_is_regressor(self) and gpu_device): _fit_y = self._y result = self._onedal_fit(X, _fit_y) - print( - f"DEBUG oneDAL _fit: After _onedal_fit, self._fit_X type={type(self._fit_X)}, shape={getattr(self._fit_X, 'shape', 'NO_SHAPE')}", - file=sys.stderr, - ) - - # REFACTOR: Shape-based y reshaping commented out - y should already be properly shaped by sklearnex - # Original code kept for reference: - # if y is not None and _is_regressor(self): - # self._y = y if self._shape is None else xp.reshape(y, self._shape) - self._onedal_model = result result = self return result def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): - # REFACTOR: Feature count validation commented out - should be done in sklearnex layer - # Original validation code kept for reference: - # use_raw_input = _get_config().get("use_raw_input", False) is True - # n_features = getattr(self, "n_features_in_", None) - # shape = getattr(X, "shape", None) 
- # if n_features and shape and len(shape) > 1 and shape[1] != n_features: - # raise ValueError( - # ( - # f"X has {X.shape[1]} features, " - # f"but kneighbors is expecting " - # f"{n_features} features as input" - # ) - # ) - # Still need n_features for _parse_auto_method call later # n_features = getattr(self, "n_features_in_", None) @@ -362,134 +166,21 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): if n_neighbors is None: n_neighbors = self.n_neighbors - # REFACTOR: n_neighbors validation commented out - should be done in sklearnex layer - # Original validation code kept for reference: - # elif n_neighbors <= 0: - # raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors) - # else: - # if not isinstance(n_neighbors, Integral): - # raise TypeError( - # "n_neighbors does not take %s value, " - # "enter integer value" % type(n_neighbors) - # ) - - # REFACTOR: X array validation commented out - should be done in sklearnex layer - # Original validation code kept for reference: - # if X is not None: - # query_is_train = False - # if not use_raw_input: - # X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) - # else: - # query_is_train = True - # X = self._fit_X - # # Include an extra neighbor to account for the sample itself being - # # returned, which is removed later - # n_neighbors += 1 - - # REFACTOR: query_is_train handling moved to sklearnex layer - # All post-processing now happens in sklearnex._kneighbors_post_processing() - # Original code kept for reference: - # if X is not None: - # query_is_train = False - # else: - # query_is_train = True - # X = self._fit_X - # # Include an extra neighbor to account for the sample itself being - # # returned, which is removed later - # n_neighbors += 1 - - # REFACTOR: onedal now just returns raw results, sklearnex does all processing + + # onedal now just returns raw results, sklearnex does all processing # Following PCA pattern: simple onedal layer if X is None: X = self._fit_X - # n_samples_fit = self.n_samples_fit_ - # REFACTOR: n_neighbors bounds validation moved to sklearnex layer (_onedal_kneighbors) - # Original validation code kept for reference: - # if n_neighbors > n_samples_fit: - # if query_is_train: - # n_neighbors -= 1 # ok to modify inplace because an error is raised - # inequality_str = "n_neighbors < n_samples_fit" - # else: - # inequality_str = "n_neighbors <= n_samples_fit" - # raise ValueError( - # f"Expected {inequality_str}, but " - # f"n_neighbors = {n_neighbors}, n_samples_fit = {n_samples_fit}, " - # f"n_samples = {X.shape[0]}" # include n_samples for common tests - # ) - - # chunked_results = None - # method = self._parse_auto_method( - # self._fit_method, self.n_samples_fit_, n_features - # ) - - # REFACTOR: Following PCA pattern - onedal just calls backend and returns raw results + # onedal just calls backend and returns raw results # All post-processing (kd_tree sorting, removing self, return_distance decision) moved to sklearnex params = super()._get_onedal_params(X, n_neighbors=n_neighbors) prediction_results = self._onedal_predict(self._onedal_model, X, params) distances = from_table(prediction_results.distances) indices = from_table(prediction_results.indices) - # REFACTOR: kd_tree sorting moved to sklearnex._kneighbors_post_processing() - # Original code kept for reference: - # if method == "kd_tree": - # for i in range(distances.shape[0]): - # seq = distances[i].argsort() - # indices[i] = indices[i][seq] - # distances[i] = distances[i][seq] - - # 
REFACTOR: return_distance decision moved to sklearnex._kneighbors_post_processing() - # onedal always returns both distances and indices (backend always computes both) - # Original code kept for reference: - # if return_distance: - # results = distances, indices - # else: - # results = indices - # Always return both - sklearnex will decide what to return to user results = distances, indices - - # REFACTOR: chunked_results vstack moved to sklearnex (was dead code anyway) - # Original code kept for reference: - # if chunked_results is not None: - # if return_distance: - # neigh_dist, neigh_ind = zip(*chunked_results) - # results = np.vstack(neigh_dist), np.vstack(neigh_ind) - # else: - # results = np.vstack(chunked_results) - - # REFACTOR: Removing self from results moved to sklearnex._kneighbors_post_processing() - # All query_is_train post-processing now in sklearnex layer - # Original code kept for reference: - # if not query_is_train: - # return results - # - # # If the query data is the same as the indexed data, we would like - # # to ignore the first nearest neighbor of every sample, i.e - # # the sample itself. - # if return_distance: - # neigh_dist, neigh_ind = results - # else: - # neigh_ind = results - # - # n_queries, _ = X.shape - # sample_range = np.arange(n_queries)[:, None] - # sample_mask = neigh_ind != sample_range - # - # # Corner case: When the number of duplicates are more - # # than the number of neighbors, the first NN will not - # # be the sample, but a duplicate. - # # In that case mask the first duplicate. - # dup_gr_nbrs = np.all(sample_mask, axis=1) - # sample_mask[:, 0][dup_gr_nbrs] = False - # - # neigh_ind = np.reshape(neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) - # - # if return_distance: - # neigh_dist = np.reshape(neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) - # return neigh_dist, neigh_ind - # return neigh_ind - # Return raw results - sklearnex will do all post-processing return results @@ -549,101 +240,6 @@ def _onedal_predict(self, model, X, params): def fit(self, X, y, queue=None): return self._fit(X, y) - # REFACTOR: All prediction logic moved to sklearnex layer - # predict() and predict_proba() are no longer used - sklearnex calls kneighbors() and computes predictions - # Original code kept for reference only - # @supports_queue - # def predict(self, X, queue=None): - # print(f"DEBUG KNeighborsClassifier.predict START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", file=sys.stderr) - # - # # REFACTOR: _check_array validation commented out - should be done in sklearnex layer - # # Original validation code kept for reference: - # # use_raw_input = _get_config().get("use_raw_input", False) is True - # # if not use_raw_input: - # # X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) - # - # onedal_model = getattr(self, "_onedal_model", None) - # n_features = getattr(self, "n_features_in_", None) - # n_samples_fit_ = getattr(self, "n_samples_fit_", None) - # - # # REFACTOR: Feature count validation commented out - should be done in sklearnex layer - # # Original validation code kept for reference: - # # shape = getattr(X, "shape", None) - # # if n_features and shape and len(shape) > 1 and shape[1] != n_features: - # # raise ValueError( - # # ( - # # f"X has {X.shape[1]} features, " - # # f"but KNNClassifier is expecting " - # # f"{n_features} features as input" - # # ) - # # ) - # - # _check_is_fitted(self) - # - # self._fit_method = self._parse_auto_method( - # self.algorithm, n_samples_fit_, 
n_features - # ) - # - # # REFACTOR NOTE: _validate_n_classes() is now called during fit in sklearnex layer - # # No need to validate again during predict - # # self._validate_n_classes() - # - # # Handle X=None case (LOOCV pattern) - use training data - # # This is needed because _get_onedal_params expects X to have .dtype attribute - # if X is None: - # X = self._fit_X - # - # params = self._get_onedal_params(X) - # prediction_result = self._onedal_predict(onedal_model, X, params) - # responses = from_table(prediction_result.responses) - # - # result = self.classes_.take(np.asarray(responses.ravel(), dtype=np.intp)) - # print(f"DEBUG KNeighborsClassifier.predict END: result type={type(result)}", file=sys.stderr) - # return result - # - # @supports_queue - # def predict_proba(self, X, queue=None): - # print(f"DEBUG KNeighborsClassifier.predict_proba START: X type={type(X)}", file=sys.stderr) - # neigh_dist, neigh_ind = self.kneighbors(X, queue=queue) - # - # classes_ = self.classes_ - # _y = self._y - # if not self.outputs_2d_: - # _y = self._y.reshape((-1, 1)) - # classes_ = [self.classes_] - # - # n_queries = _num_samples(X) - # - # print(f"DEBUG predict_proba: Calling _get_weights", file=sys.stderr) - # weights = self._get_weights(neigh_dist, self.weights) - # if weights is None: - # print(f"DEBUG predict_proba: weights is None, using ones_like", file=sys.stderr) - # weights = np.ones_like(neigh_ind) - # else: - # print(f"DEBUG predict_proba: weights calculated, type={type(weights)}", file=sys.stderr) - # - # all_rows = np.arange(n_queries) - # probabilities = [] - # for k, classes_k in enumerate(classes_): - # pred_labels = _y[:, k][neigh_ind] - # proba_k = np.zeros((n_queries, classes_k.size)) - # - # # a simple ':' index doesn't work right - # for i, idx in enumerate(pred_labels.T): # loop is O(n_neighbors) - # proba_k[all_rows, idx] += weights[:, i] - # - # # normalize 'votes' into real [0,1] probabilities - # normalizer = proba_k.sum(axis=1)[:, np.newaxis] - # normalizer[normalizer == 0.0] = 1.0 - # proba_k /= normalizer - # - # probabilities.append(proba_k) - # - # if not self.outputs_2d_: - # probabilities = probabilities[0] - # - # return probabilities - @supports_queue def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): return self._kneighbors(X, n_neighbors, return_distance) diff --git a/onedal/neighbors/tests/test_knn_classification.py b/onedal/neighbors/tests/test_knn_classification.py index 80b0816cde..a5fb812f4f 100755 --- a/onedal/neighbors/tests/test_knn_classification.py +++ b/onedal/neighbors/tests/test_knn_classification.py @@ -20,84 +20,31 @@ from sklearn import datasets from onedal.tests.utils._device_selection import get_queues - -# REFACTOR: Import from sklearnex instead of onedal # Classification processing now happens in sklearnex layer from sklearnex.neighbors import KNeighborsClassifier @pytest.mark.parametrize("queue", get_queues()) def test_iris(queue): - import sys - - print(f"\n=== DEBUG test_iris START: queue={queue} ===", file=sys.stderr) - # REFACTOR NOTE: queue parameter not used with sklearnex, but kept for test parametrization + # queue parameter not used with sklearnex, but kept for test parametrization iris = datasets.load_iris() - print( - f"DEBUG test: iris.data type={type(iris.data)}, shape={iris.data.shape}", - file=sys.stderr, - ) - print( - f"DEBUG test: iris.target type={type(iris.target)}, shape={iris.target.shape}", - file=sys.stderr, - ) - print(f"DEBUG test: Creating KNeighborsClassifier and calling fit", 
file=sys.stderr) clf = KNeighborsClassifier(2).fit(iris.data, iris.target) - print( - f"DEBUG test: fit completed, clf._fit_X type={type(getattr(clf, '_fit_X', 'NOT_SET'))}", - file=sys.stderr, - ) - print(f"DEBUG test: Calling score", file=sys.stderr) score = clf.score(iris.data, iris.target) - print(f"DEBUG test: score completed, score={score}", file=sys.stderr) assert score > 0.9 assert_array_equal(clf.classes_, np.sort(clf.classes_)) - print(f"=== DEBUG test_iris END ===\n", file=sys.stderr) @pytest.mark.parametrize("queue", get_queues()) def test_pickle(queue): - import sys - - print(f"\n=== DEBUG test_pickle START: queue={queue} ===", file=sys.stderr) - # REFACTOR NOTE: queue parameter not used with sklearnex, but kept for test parametrization if queue and queue.sycl_device.is_gpu: pytest.skip("KNN classifier pickling for the GPU sycl_queue is buggy.") iris = datasets.load_iris() - print( - f"DEBUG test: iris.data type={type(iris.data)}, shape={iris.data.shape}", - file=sys.stderr, - ) - print( - f"DEBUG test: iris.target type={type(iris.target)}, shape={iris.target.shape}", - file=sys.stderr, - ) - print(f"DEBUG test: Creating KNeighborsClassifier and calling fit", file=sys.stderr) clf = KNeighborsClassifier(2).fit(iris.data, iris.target) - print( - f"DEBUG test: fit completed, clf._fit_X type={type(getattr(clf, '_fit_X', 'NOT_SET'))}", - file=sys.stderr, - ) - print(f"DEBUG test: Calling predict", file=sys.stderr) expected = clf.predict(iris.data) - print( - f"DEBUG test: predict completed, expected type={type(expected)}, shape={expected.shape}", - file=sys.stderr, - ) - import pickle - - print(f"DEBUG test: Pickling classifier", file=sys.stderr) dump = pickle.dumps(clf) - print(f"DEBUG test: Unpickling classifier", file=sys.stderr) clf2 = pickle.loads(dump) assert type(clf2) == clf.__class__ - print(f"DEBUG test: Calling predict on unpickled classifier", file=sys.stderr) result = clf2.predict(iris.data) - print( - f"DEBUG test: predict completed, result type={type(result)}, shape={result.shape}", - file=sys.stderr, - ) assert_array_equal(expected, result) - print(f"=== DEBUG test_pickle END ===\n", file=sys.stderr) diff --git a/sklearnex/neighbors/_lof.py b/sklearnex/neighbors/_lof.py index 374ae9c1bb..4ce835e61e 100644 --- a/sklearnex/neighbors/_lof.py +++ b/sklearnex/neighbors/_lof.py @@ -53,20 +53,10 @@ class LocalOutlierFactor(KNeighborsDispatchingBase, _sklearn_LocalOutlierFactor) _onedal_kneighbors = NearestNeighbors._onedal_kneighbors def _onedal_fit(self, X, y, queue=None): - import sys - - print( - f"DEBUG LocalOutlierFactor._onedal_fit START: X type={type(X)}, y type={type(y)}", - file=sys.stderr, - ) if sklearn_check_version("1.2"): self._validate_params() # Let _onedal_knn_fit (NearestNeighbors._onedal_fit) handle validation - print( - f"DEBUG LocalOutlierFactor._onedal_fit: Calling _onedal_knn_fit", - file=sys.stderr, - ) self._onedal_knn_fit(X, y, queue=queue) if self.contamination != "auto": @@ -85,11 +75,6 @@ def _onedal_fit(self, X, y, queue=None): % (self.n_neighbors, n_samples) ) self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1)) - - print( - f"DEBUG LocalOutlierFactor._onedal_fit: Calling _onedal_kneighbors", - file=sys.stderr, - ) ( self._distances_fit_X_, _neighbors_indices_fit_X_, @@ -123,20 +108,9 @@ def _onedal_fit(self, X, y, queue=None): "Duplicate values are leading to incorrect results. " "Increase the number of neighbors for more accurate results." 
) - - print( - f"DEBUG LocalOutlierFactor._onedal_fit END: _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", - file=sys.stderr, - ) return self def fit(self, X, y=None): - import sys - - print( - f"DEBUG LocalOutlierFactor.fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", - file=sys.stderr, - ) result = dispatch( self, "fit", @@ -147,18 +121,9 @@ def fit(self, X, y=None): X, None, ) - print( - f"DEBUG LocalOutlierFactor.fit END: result type={type(result)}", - file=sys.stderr, - ) return result def _predict(self, X=None): - import sys - - print( - f"DEBUG LocalOutlierFactor._predict START: X type={type(X)}", file=sys.stderr - ) check_is_fitted(self) if X is not None: @@ -169,11 +134,6 @@ def _predict(self, X=None): else: is_inlier = np.ones(self.n_samples_fit_, dtype=int) is_inlier[self.negative_outlier_factor_ < self.offset_] = -1 - - print( - f"DEBUG LocalOutlierFactor._predict END: is_inlier type={type(is_inlier)}", - file=sys.stderr, - ) return is_inlier # This had to be done because predict loses the queue when no @@ -184,28 +144,11 @@ def _predict(self, X=None): @wraps(_sklearn_LocalOutlierFactor.fit_predict, assigned=["__doc__"]) @wrap_output_data def fit_predict(self, X, y=None): - import sys - - print( - f"DEBUG LocalOutlierFactor.fit_predict START: X type={type(X)}", - file=sys.stderr, - ) result = self.fit(X)._predict() - print( - f"DEBUG LocalOutlierFactor.fit_predict END: result type={type(result)}", - file=sys.stderr, - ) return result def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): - import sys - - print( - f"DEBUG LocalOutlierFactor._kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", - file=sys.stderr, - ) - - # Validate n_neighbors parameter first (before check_is_fitted) + # Validate n_neighbors parameter first if n_neighbors is not None: self._validate_n_neighbors(n_neighbors) @@ -225,10 +168,6 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors=n_neighbors, return_distance=return_distance, ) - print( - f"DEBUG LocalOutlierFactor._kneighbors END: result type={type(result)}", - file=sys.stderr, - ) return result kneighbors = wrap_output_data(_kneighbors) @@ -237,15 +176,9 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): @wraps(_sklearn_LocalOutlierFactor.score_samples, assigned=["__doc__"]) @wrap_output_data def score_samples(self, X): - import sys - - print( - f"DEBUG LocalOutlierFactor.score_samples START: X type={type(X)}", - file=sys.stderr, - ) check_is_fitted(self) - # Validate and convert X (pandas to numpy if needed) + # Validate and convert X xp, _ = get_namespace(X) X = validate_data( self, @@ -255,8 +188,6 @@ def score_samples(self, X): reset=False, ) - # check_feature_names(self, X, reset=False) - distances_X, neighbors_indices_X = self._kneighbors( X, n_neighbors=self.n_neighbors_ ) @@ -269,10 +200,6 @@ def score_samples(self, X): lrd_ratios_array = self._lrd[neighbors_indices_X] / X_lrd[:, np.newaxis] result = -np.mean(lrd_ratios_array, axis=1) - print( - f"DEBUG LocalOutlierFactor.score_samples END: result type={type(result)}", - file=sys.stderr, - ) return result fit.__doc__ = _sklearn_LocalOutlierFactor.fit.__doc__ diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 51169306c4..010175ebff 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -62,37 +62,6 @@ def _parse_auto_method(self, method, n_samples, n_features): return result_method - # 
def _validate_data( - # self, X, y=None, reset=True, validate_separately=None, **check_params - # ): - # if y is None: - # if getattr(self, "requires_y", False): - # raise ValueError( - # f"This {self.__class__.__name__} estimator " - # f"requires y to be passed, but the target y is None." - # ) - # X = _check_array(X, **check_params) - # out = X, y - # else: - # if validate_separately: - # # We need this because some estimators validate X and y - # # separately, and in general, separately calling _check_array() - # # on X and y isn't equivalent to just calling _check_X_y() - # # :( - # check_X_params, check_y_params = validate_separately - # X = _check_array(X, **check_X_params) - # y = _check_array(y, **check_y_params) - # else: - # X, y = _check_X_y(X, y, **check_params) - # out = X, y - - # if check_params.get("ensure_2d", True): - # from onedal.utils.validation import _check_n_features - - # _check_n_features(self, X, reset=reset) - - # return out - def _get_weights(self, dist, weights): if weights in (None, "uniform"): return None @@ -522,13 +491,6 @@ def _process_classification_targets(self, y): Note: y should already be converted to numpy array via validate_data before calling this. """ - import sys - - print( - f"DEBUG _process_classification_targets: y type={type(y)}, y shape={getattr(y, 'shape', 'NO_SHAPE')}", - file=sys.stderr, - ) - # Array API support: get namespace from y xp, _ = get_namespace(y) @@ -586,17 +548,9 @@ def _process_regression_targets(self, y): shape = getattr(y, "shape", None) self._shape = shape if shape is not None else y.shape self._y = y - print( - f"DEBUG _process_regression_targets: _y type={type(self._y)}, _shape={self._shape}", - file=sys.stderr, - ) return y def _fit_validation(self, X, y=None): - print( - f"DEBUG _fit_validation CALLED: X type={type(X)}, y type={type(y)}", - file=sys.stderr, - ) if sklearn_check_version("1.2"): self._validate_params() # check_feature_names(self, X, reset=True) diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index 183ef1f4ba..ec35689f6a 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -67,12 +67,6 @@ def __init__( ) def fit(self, X, y): - import sys - - print( - f"DEBUG KNeighborsClassifier.fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", - file=sys.stderr, - ) dispatch( self, "fit", @@ -83,20 +77,10 @@ def fit(self, X, y): X, y, ) - print( - f"DEBUG KNeighborsClassifier.fit END: _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", - file=sys.stderr, - ) return self @wrap_output_data def predict(self, X): - import sys - - print( - f"DEBUG KNeighborsClassifier.predict START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", - file=sys.stderr, - ) check_is_fitted(self) result = dispatch( @@ -108,20 +92,10 @@ def predict(self, X): }, X, ) - print( - f"DEBUG KNeighborsClassifier.predict END: result type={type(result)}", - file=sys.stderr, - ) return result @wrap_output_data def predict_proba(self, X): - import sys - - print( - f"DEBUG KNeighborsClassifier.predict_proba START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}", - file=sys.stderr, - ) check_is_fitted(self) result = dispatch( @@ -133,20 +107,10 @@ def predict_proba(self, X): }, X, ) - print( - f"DEBUG KNeighborsClassifier.predict_proba END: result type={type(result)}", - file=sys.stderr, - ) return result @wrap_output_data def score(self, X, y, sample_weight=None): - import sys - - 
print( - f"DEBUG KNeighborsClassifier.score START: X type={type(X)}, y type={type(y)}", - file=sys.stderr, - ) check_is_fitted(self) result = dispatch( @@ -160,19 +124,11 @@ def score(self, X, y, sample_weight=None): y, sample_weight=sample_weight, ) - print(f"DEBUG KNeighborsClassifier.score END: result={result}", file=sys.stderr) return result @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): - import sys - - print( - f"DEBUG KNeighborsClassifier.kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", - file=sys.stderr, - ) - - # Validate n_neighbors parameter first (before check_is_fitted) + # Validate n_neighbors parameter first if n_neighbors is not None: self._validate_n_neighbors(n_neighbors) @@ -192,25 +148,10 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors=n_neighbors, return_distance=return_distance, ) - print( - f"DEBUG KNeighborsClassifier.kneighbors END: result type={type(result)}", - file=sys.stderr, - ) return result def _onedal_fit(self, X, y, queue=None): - import sys - - print( - f"DEBUG KNeighborsClassifier._onedal_fit START: X type={type(X)}, y type={type(y)}", - file=sys.stderr, - ) - - # Get array namespace for array API support xp, _ = get_namespace(X) - print(f"DEBUG: Array namespace: {xp}", file=sys.stderr) - - # REFACTOR: Use validate_data to convert pandas to numpy and validate types X, y = validate_data( self, X, @@ -218,19 +159,8 @@ def _onedal_fit(self, X, y, queue=None): dtype=[xp.float64, xp.float32], accept_sparse="csr", ) - print( - f"DEBUG: After validate_data, X type={type(X)}, y type={type(y)}", - file=sys.stderr, - ) - - # REFACTOR STEP 1: Process classification targets in sklearnex before passing to onedal - print(f"DEBUG: Processing classification targets in sklearnex", file=sys.stderr) - y_processed = self._process_classification_targets(y) - print( - f"DEBUG: After _process_classification_targets, y_processed type={type(y_processed)}", - file=sys.stderr, - ) - + # Process classification targets in sklearnex before passing to onedal + self._process_classification_targets(y) onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, @@ -244,68 +174,24 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ - # REFACTOR: Pass both original and processed targets to onedal + # Pass both original and processed targets to onedal # onedal needs the processed classes_ and _y attributes that we just set self._onedal_estimator.classes_ = self.classes_ self._onedal_estimator._y = self._y self._onedal_estimator.outputs_2d_ = self.outputs_2d_ self._onedal_estimator._shape = self._shape # Pass shape from sklearnex - print( - f"DEBUG: Set onedal_estimator.classes_={self._onedal_estimator.classes_}", - file=sys.stderr, - ) - print( - f"DEBUG: Set onedal_estimator._y shape={self._onedal_estimator._y.shape}", - file=sys.stderr, - ) - print( - f"DEBUG: Set onedal_estimator._shape={self._onedal_estimator._shape}", - file=sys.stderr, - ) - print( - f"DEBUG KNeighborsClassifier._onedal_fit: Calling onedal_estimator.fit with X and original y", - file=sys.stderr, - ) # Pass original y to onedal - it will use the pre-set classes_ and _y attributes we just assigned self._onedal_estimator.fit(X, y, queue=queue) - print( - f"DEBUG KNeighborsClassifier._onedal_fit: After fit, calling _save_attributes", - 
file=sys.stderr, - ) - self._save_attributes() - print( - f"DEBUG KNeighborsClassifier._onedal_fit END: self._fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", - file=sys.stderr, - ) def _onedal_predict(self, X, queue=None): - import sys - - print( - f"DEBUG KNeighborsClassifier._onedal_predict START: X type={type(X)}", - file=sys.stderr, - ) - # Use the unified helper from common.py (calls kneighbors + computes prediction) # This properly handles X=None (LOOCV) case result = self._predict_skl_classification(X) - - print( - f"DEBUG KNeighborsClassifier._onedal_predict END: result type={type(result)}", - file=sys.stderr, - ) return result def _onedal_predict_proba(self, X, queue=None): - import sys - - print( - f"DEBUG KNeighborsClassifier._onedal_predict_proba START: X type={type(X)}", - file=sys.stderr, - ) - # Call kneighbors through sklearnex (self.kneighbors is the sklearnex method) # This properly handles X=None case (LOOCV) with query_is_train logic neigh_dist, neigh_ind = self.kneighbors(X) @@ -314,24 +200,11 @@ def _onedal_predict_proba(self, X, queue=None): result = self._compute_class_probabilities( neigh_dist, neigh_ind, self.weights, self._y, self.classes_, self.outputs_2d_ ) - - print( - f"DEBUG KNeighborsClassifier._onedal_predict_proba END: result type={type(result)}", - file=sys.stderr, - ) return result def _onedal_kneighbors( self, X=None, n_neighbors=None, return_distance=True, queue=None ): - import sys - - print( - f"DEBUG KNeighborsClassifier._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", - file=sys.stderr, - ) - - # Validate X to convert array API/pandas to numpy and check feature names (only if X is not None) if X is not None: xp, _ = get_namespace(X) X = validate_data( @@ -342,7 +215,6 @@ def _onedal_kneighbors( reset=False, ) - # REFACTOR: All post-processing now in sklearnex following PCA pattern # Prepare inputs and handle query_is_train case X, n_neighbors, query_is_train = self._prepare_kneighbors_inputs(X, n_neighbors) @@ -355,20 +227,9 @@ def _onedal_kneighbors( result = self._kneighbors_post_processing( X, n_neighbors, return_distance, result, query_is_train ) - - print( - f"DEBUG KNeighborsClassifier._onedal_kneighbors END: result type={type(result)}", - file=sys.stderr, - ) return result def _onedal_score(self, X, y, sample_weight=None, queue=None): - import sys - - print( - f"DEBUG KNeighborsClassifier._onedal_score START: X type={type(X)}, y type={type(y)}", - file=sys.stderr, - ) # Convert array API to numpy for sklearn's accuracy_score # Note: validate_data does NOT convert array API to numpy, so we do it explicitly y = np.asarray(y) @@ -377,33 +238,17 @@ def _onedal_score(self, X, y, sample_weight=None, queue=None): result = accuracy_score( y, self._onedal_predict(X, queue=queue), sample_weight=sample_weight ) - print( - f"DEBUG KNeighborsClassifier._onedal_score END: result={result}", - file=sys.stderr, - ) return result def _save_attributes(self): - import sys - - print(f"DEBUG KNeighborsClassifier._save_attributes START", file=sys.stderr) self.classes_ = self._onedal_estimator.classes_ self.n_features_in_ = self._onedal_estimator.n_features_in_ self.n_samples_fit_ = self._onedal_estimator.n_samples_fit_ self._fit_X = self._onedal_estimator._fit_X - print( - f"DEBUG KNeighborsClassifier._save_attributes: _fit_X type={type(self._fit_X)}", - file=sys.stderr, - ) self._y = self._onedal_estimator._y - print( - f"DEBUG KNeighborsClassifier._save_attributes: _y type={type(self._y)}", - 
file=sys.stderr, - ) self._fit_method = self._onedal_estimator._fit_method self.outputs_2d_ = self._onedal_estimator.outputs_2d_ self._tree = self._onedal_estimator._tree - print(f"DEBUG KNeighborsClassifier._save_attributes END", file=sys.stderr) fit.__doc__ = _sklearn_KNeighborsClassifier.fit.__doc__ predict.__doc__ = _sklearn_KNeighborsClassifier.predict.__doc__ diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index cdce6a9df9..ad89aedfa3 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -14,7 +14,6 @@ # limitations under the License. # ============================================================================== -import numpy as np from sklearn.metrics import r2_score from sklearn.neighbors._regression import ( KNeighborsRegressor as _sklearn_KNeighborsRegressor, @@ -30,7 +29,7 @@ from ..utils._array_api import enable_array_api, get_namespace from ..utils.validation import check_feature_names, validate_data from .common import KNeighborsDispatchingBase - +from onedal._device_offload import _transfer_to_host @enable_array_api @control_n_jobs(decorated_methods=["fit", "predict", "kneighbors", "score"]) @@ -65,12 +64,6 @@ def __init__( ) def fit(self, X, y): - import sys - - print( - f"DEBUG KNeighborsRegressor.fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", - file=sys.stderr, - ) dispatch( self, "fit", @@ -81,19 +74,10 @@ def fit(self, X, y): X, y, ) - print( - f"DEBUG KNeighborsRegressor.fit END: _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", - file=sys.stderr, - ) return self @wrap_output_data def predict(self, X): - import sys - - print( - f"DEBUG KNeighborsRegressor.predict START: X type={type(X)}", file=sys.stderr - ) check_is_fitted(self) result = dispatch( @@ -105,20 +89,10 @@ def predict(self, X): }, X, ) - print( - f"DEBUG KNeighborsRegressor.predict END: result type={type(result)}", - file=sys.stderr, - ) return result @wrap_output_data def score(self, X, y, sample_weight=None): - import sys - - print( - f"DEBUG KNeighborsRegressor.score START: X type={type(X)}, y type={type(y)}", - file=sys.stderr, - ) check_is_fitted(self) result = dispatch( @@ -132,18 +106,10 @@ def score(self, X, y, sample_weight=None): y, sample_weight=sample_weight, ) - print(f"DEBUG KNeighborsRegressor.score END: result={result}", file=sys.stderr) return result @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): - import sys - - print( - f"DEBUG KNeighborsRegressor.kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", - file=sys.stderr, - ) - # Validate n_neighbors parameter first (before check_is_fitted) if n_neighbors is not None: self._validate_n_neighbors(n_neighbors) @@ -164,24 +130,10 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors=n_neighbors, return_distance=return_distance, ) - print( - f"DEBUG KNeighborsRegressor.kneighbors END: result type={type(result)}", - file=sys.stderr, - ) return result def _onedal_fit(self, X, y, queue=None): - import sys - - print( - f"DEBUG KNeighborsRegressor._onedal_fit START: X type={type(X)}, y type={type(y)}", - file=sys.stderr, - ) - - # Get array namespace for array API support xp, _ = get_namespace(X) - print(f"DEBUG: Array namespace: {xp}", file=sys.stderr) - # REFACTOR: Use validate_data to convert pandas to numpy and validate types for X only X = validate_data( self, @@ -189,20 +141,10 @@ def 
_onedal_fit(self, X, y, queue=None): dtype=[xp.float64, xp.float32], accept_sparse="csr", ) - print( - f"DEBUG: After validate_data, X type={type(X)}, y type={type(y)}", - file=sys.stderr, - ) - # REFACTOR: Process regression targets in sklearnex before passing to onedal # This sets _shape and _y attributes - print(f"DEBUG: Processing regression targets in sklearnex", file=sys.stderr) - y_processed = self._process_regression_targets(y) - print( - f"DEBUG: After _process_regression_targets, _shape={self._shape}, _y type={type(self._y)}", - file=sys.stderr, - ) - + self._process_regression_targets(y) + onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, @@ -230,25 +172,8 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator._y = xp.reshape(self._y, (-1, 1)) else: self._onedal_estimator._y = self._y - print( - f"DEBUG: Set onedal_estimator._shape={self._onedal_estimator._shape}", - file=sys.stderr, - ) - print( - f"DEBUG: GPU device={gpu_device}, _y shape={self._onedal_estimator._y.shape}", - file=sys.stderr, - ) - - print( - f"DEBUG KNeighborsRegressor._onedal_fit: Calling onedal_estimator.fit", - file=sys.stderr, - ) + self._onedal_estimator.fit(X, y, queue=queue) - print( - f"DEBUG KNeighborsRegressor._onedal_fit: After fit, calling _save_attributes", - file=sys.stderr, - ) - self._save_attributes() # REFACTOR: Replicate the EXACT post-fit reshaping from original onedal code @@ -262,24 +187,8 @@ def _onedal_fit(self, X, y, queue=None): self._y = y if self._shape is None else xp.reshape(y, self._shape) # Also update the onedal estimator's _y since that's what gets used in predict self._onedal_estimator._y = self._y - print( - f"DEBUG: After reshape, self._y type={type(self._y)}, shape={getattr(self._y, 'shape', 'NO_SHAPE')}", - file=sys.stderr, - ) - - print( - f"DEBUG KNeighborsRegressor._onedal_fit END: self._fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", - file=sys.stderr, - ) def _onedal_predict(self, X, queue=None): - import sys - - print( - f"DEBUG KNeighborsRegressor._onedal_predict START: X type={type(X)}", - file=sys.stderr, - ) - # Dispatch between GPU and SKL prediction methods # This logic matches onedal regressor predict() method but computation happens in sklearnex gpu_device = queue is not None and getattr(queue.sycl_device, "is_gpu", False) @@ -291,57 +200,23 @@ def _onedal_predict(self, X, queue=None): else: # SKL path: call kneighbors (through sklearnex) then compute in sklearnex result = self._predict_skl(X, queue=queue) - - print( - f"DEBUG KNeighborsRegressor._onedal_predict END: result type={type(result)}", - file=sys.stderr, - ) return result def _predict_gpu(self, X, queue=None): """GPU prediction path - calls onedal backend.""" - import sys - - print( - f"DEBUG KNeighborsRegressor._predict_gpu START: X type={type(X)}", - file=sys.stderr, - ) # Call onedal backend for GPU prediction (X is already validated by predict()) result = self._onedal_estimator._predict_gpu(X) - print( - f"DEBUG KNeighborsRegressor._predict_gpu END: result type={type(result)}", - file=sys.stderr, - ) return result def _predict_skl(self, X, queue=None): """SKL prediction path - calls kneighbors through sklearnex, computes prediction here.""" - import sys - - print( - f"DEBUG KNeighborsRegressor._predict_skl START: X type={type(X)}", - file=sys.stderr, - ) - # Use the unified helper from common.py (calls kneighbors + computes prediction) result = self._predict_skl_regression(X) - - print( - f"DEBUG KNeighborsRegressor._predict_skl END: result 
type={type(result)}", - file=sys.stderr, - ) return result def _onedal_kneighbors( self, X=None, n_neighbors=None, return_distance=True, queue=None ): - import sys - - print( - f"DEBUG KNeighborsRegressor._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", - file=sys.stderr, - ) - # Validate X to convert array API/pandas to numpy and check feature names (only if X is not None) if X is not None: xp, _ = get_namespace(X) @@ -366,22 +241,9 @@ def _onedal_kneighbors( result = self._kneighbors_post_processing( X, n_neighbors, return_distance, result, query_is_train ) - - print( - f"DEBUG KNeighborsRegressor._onedal_kneighbors END: result type={type(result)}", - file=sys.stderr, - ) return result def _onedal_score(self, X, y, sample_weight=None, queue=None): - import sys - - from onedal._device_offload import _transfer_to_host - - print( - f"DEBUG KNeighborsRegressor._onedal_score START: X type={type(X)}, y type={type(y)}", - file=sys.stderr, - ) y_pred = self._onedal_predict(X, queue=queue) # Convert array API/USM arrays back to numpy for r2_score @@ -390,31 +252,15 @@ def _onedal_score(self, X, y, sample_weight=None, queue=None): y, y_pred, sample_weight = host_data result = r2_score(y, y_pred, sample_weight=sample_weight) - print( - f"DEBUG KNeighborsRegressor._onedal_score END: result={result}", - file=sys.stderr, - ) return result def _save_attributes(self): - import sys - - print(f"DEBUG KNeighborsRegressor._save_attributes START", file=sys.stderr) self.n_features_in_ = self._onedal_estimator.n_features_in_ self.n_samples_fit_ = self._onedal_estimator.n_samples_fit_ self._fit_X = self._onedal_estimator._fit_X - print( - f"DEBUG KNeighborsRegressor._save_attributes: _fit_X type={type(self._fit_X)}", - file=sys.stderr, - ) self._y = self._onedal_estimator._y - print( - f"DEBUG KNeighborsRegressor._save_attributes: _y type={type(self._y)}", - file=sys.stderr, - ) self._fit_method = self._onedal_estimator._fit_method self._tree = self._onedal_estimator._tree - print(f"DEBUG KNeighborsRegressor._save_attributes END", file=sys.stderr) fit.__doc__ = _sklearn_KNeighborsRegressor.__doc__ predict.__doc__ = _sklearn_KNeighborsRegressor.predict.__doc__ diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 731b36e7cc..de1b3bd91b 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -14,9 +14,6 @@ # limitations under the License. 
# =============================================================================== -import sys - -import numpy as np from sklearn.neighbors._unsupervised import NearestNeighbors as _sklearn_NearestNeighbors from sklearn.utils.validation import _deprecate_positional_args, check_is_fitted @@ -64,10 +61,6 @@ def __init__( ) def fit(self, X, y=None): - print( - f"DEBUG NearestNeighbors.fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", - file=sys.stderr, - ) dispatch( self, "fit", @@ -78,20 +71,11 @@ def fit(self, X, y=None): X, None, ) - print( - f"DEBUG NearestNeighbors.fit END: _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}, _fit_X shape={getattr(getattr(self, '_fit_X', None), 'shape', 'NO_SHAPE')}", - file=sys.stderr, - ) return self @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): - print( - f"DEBUG NearestNeighbors.kneighbors START: X type={type(X)}, _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", - file=sys.stderr, - ) - - # Validate n_neighbors parameter first (before check_is_fitted) + # Validate n_neighbors parameter first if n_neighbors is not None: self._validate_n_neighbors(n_neighbors) @@ -111,38 +95,18 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors=n_neighbors, return_distance=return_distance, ) - print( - f"DEBUG NearestNeighbors.kneighbors END: result type={type(result)}", - file=sys.stderr, - ) return result @wrap_output_data def radius_neighbors( self, X=None, radius=None, return_distance=True, sort_results=False ): - print( - f"DEBUG NearestNeighbors.radius_neighbors START: X type={type(X)}, _fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}, _fit_X shape={getattr(getattr(self, '_fit_X', None), 'shape', 'NO_SHAPE')}", - file=sys.stderr, - ) - print( - f"DEBUG radius_neighbors: hasattr _onedal_estimator={hasattr(self, '_onedal_estimator')}, _tree={getattr(self, '_tree', 'NOT_SET')}, _fit_method={getattr(self, '_fit_method', 'NOT_SET')}", - file=sys.stderr, - ) if ( hasattr(self, "_onedal_estimator") or getattr(self, "_tree", 0) is None and self._fit_method == "kd_tree" ): - print( - f"DEBUG radius_neighbors: Calling sklearn fit with _fit_X type={type(self._fit_X)}", - file=sys.stderr, - ) _sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, "_y", None)) - print( - f"DEBUG radius_neighbors: sklearn fit completed, _fit_X type now={type(getattr(self, '_fit_X', 'NOT_SET'))}", - file=sys.stderr, - ) check_is_fitted(self) result = dispatch( self, @@ -156,10 +120,6 @@ def radius_neighbors( return_distance=return_distance, sort_results=sort_results, ) - print( - f"DEBUG NearestNeighbors.radius_neighbors END: result type={type(result)}", - file=sys.stderr, - ) return result def radius_neighbors_graph( @@ -179,22 +139,13 @@ def radius_neighbors_graph( ) def _onedal_fit(self, X, y=None, queue=None): - print( - f"DEBUG NearestNeighbors._onedal_fit START: X type={type(X)}, X shape={getattr(X, 'shape', 'NO_SHAPE')}, y type={type(y)}", - file=sys.stderr, - ) - - # Get array namespace for array API support xp, _ = get_namespace(X) - - # REFACTOR: Use validate_data to convert pandas to numpy and validate types X = validate_data( self, X, dtype=[xp.float64, xp.float32], accept_sparse="csr", ) - print(f"DEBUG: After validate_data, X type={type(X)}", file=sys.stderr) onedal_params = { "n_neighbors": self.n_neighbors, @@ -207,24 +158,11 @@ def _onedal_fit(self, X, y=None, queue=None): self._onedal_estimator.requires_y = get_requires_y_tag(self) 
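        # Copy the metric settings resolved in the sklearnex layer so the onedal
        # estimator builds its backend parameters from the same effective metric.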
self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ - print( - f"DEBUG NearestNeighbors._onedal_fit: Calling onedal_estimator.fit", - file=sys.stderr, - ) self._onedal_estimator.fit(X, y, queue=queue) - print( - f"DEBUG NearestNeighbors._onedal_fit: After fit, onedal_estimator._fit_X type={type(getattr(self._onedal_estimator, '_fit_X', 'NOT_SET'))}", - file=sys.stderr, - ) - self._save_attributes() - print( - f"DEBUG NearestNeighbors._onedal_fit END: self._fit_X type={type(getattr(self, '_fit_X', 'NOT_SET'))}", - file=sys.stderr, - ) def _onedal_predict(self, X, queue=None): - # Validate and convert X (pandas to numpy if needed) only if X is not None + # Validate and convert X if X is not None: xp, _ = get_namespace(X) X = validate_data( @@ -240,14 +178,6 @@ def _onedal_predict(self, X, queue=None): def _onedal_kneighbors( self, X=None, n_neighbors=None, return_distance=True, queue=None ): - import sys - - print( - f"DEBUG NearestNeighbors._onedal_kneighbors START: X type={type(X)}, n_neighbors={n_neighbors}, return_distance={return_distance}", - file=sys.stderr, - ) - - # Validate X to convert array API/pandas to numpy and check feature names (only if X is not None) if X is not None: xp, _ = get_namespace(X) X = validate_data( @@ -258,8 +188,7 @@ def _onedal_kneighbors( reset=False, ) - # REFACTOR: All post-processing now in sklearnex following PCA pattern - # Prepare inputs and handle query_is_train case (includes validation AFTER +=1) + # Prepare inputs and handle query_is_train case X, n_neighbors, query_is_train = self._prepare_kneighbors_inputs(X, n_neighbors) # Get raw results from onedal backend @@ -271,44 +200,15 @@ def _onedal_kneighbors( result = self._kneighbors_post_processing( X, n_neighbors, return_distance, result, query_is_train ) - - print( - f"DEBUG NearestNeighbors._onedal_kneighbors END: result type={type(result)}", - file=sys.stderr, - ) return result def _save_attributes(self): - print( - f"DEBUG NearestNeighbors._save_attributes START: onedal_estimator._fit_X type={type(getattr(self._onedal_estimator, '_fit_X', 'NOT_SET'))}", - file=sys.stderr, - ) - if hasattr(self._onedal_estimator, "_fit_X"): - fit_x_preview = str(self._onedal_estimator._fit_X)[:200] - print( - f"DEBUG _save_attributes: _fit_X value preview={fit_x_preview}", - file=sys.stderr, - ) self.classes_ = self._onedal_estimator.classes_ self.n_features_in_ = self._onedal_estimator.n_features_in_ self.n_samples_fit_ = self._onedal_estimator.n_samples_fit_ - # ORIGINAL MAIN BRANCH: Direct assignment without any tuple extraction self._fit_X = self._onedal_estimator._fit_X - print( - f"DEBUG _save_attributes: AFTER assignment - self._fit_X type={type(self._fit_X)}, has shape attr={hasattr(self._fit_X, 'shape')}", - file=sys.stderr, - ) - if hasattr(self._fit_X, "shape"): - print( - f"DEBUG _save_attributes: self._fit_X.shape={self._fit_X.shape}", - file=sys.stderr, - ) self._fit_method = self._onedal_estimator._fit_method self._tree = self._onedal_estimator._tree - print( - f"DEBUG NearestNeighbors._save_attributes END: _fit_method={self._fit_method}, _tree={self._tree}", - file=sys.stderr, - ) fit.__doc__ = _sklearn_NearestNeighbors.__doc__ kneighbors.__doc__ = _sklearn_NearestNeighbors.kneighbors.__doc__ From a05d28485a5374b89626bad9ba4cba6c024fe71b Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 21 Oct 2025 00:56:50 -0700 Subject: [PATCH 65/87] fix: format --- onedal/neighbors/neighbors.py | 4 +++- 
onedal/neighbors/tests/test_knn_classification.py | 2 ++ sklearnex/neighbors/knn_regression.py | 7 ++++--- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index 6efbe366db..b79e2c7eaf 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -15,9 +15,11 @@ # ============================================================================== from abc import ABCMeta, abstractmethod + from onedal._device_offload import supports_queue from onedal.common._backend import bind_default_backend from onedal.utils import _sycl_queue_manager as QM + from ..common._estimator_checks import _check_is_fitted, _is_classifier, _is_regressor from ..common._mixin import ClassifierMixin, RegressorMixin from ..datatypes import from_table, to_table @@ -166,7 +168,7 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): if n_neighbors is None: n_neighbors = self.n_neighbors - + # onedal now just returns raw results, sklearnex does all processing # Following PCA pattern: simple onedal layer if X is None: diff --git a/onedal/neighbors/tests/test_knn_classification.py b/onedal/neighbors/tests/test_knn_classification.py index a5fb812f4f..f3cf0b823a 100755 --- a/onedal/neighbors/tests/test_knn_classification.py +++ b/onedal/neighbors/tests/test_knn_classification.py @@ -20,6 +20,7 @@ from sklearn import datasets from onedal.tests.utils._device_selection import get_queues + # Classification processing now happens in sklearnex layer from sklearnex.neighbors import KNeighborsClassifier @@ -42,6 +43,7 @@ def test_pickle(queue): clf = KNeighborsClassifier(2).fit(iris.data, iris.target) expected = clf.predict(iris.data) import pickle + dump = pickle.dumps(clf) clf2 = pickle.loads(dump) diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index ad89aedfa3..37f15816d0 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -23,13 +23,14 @@ from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import sklearn_check_version from daal4py.sklearn.utils.validation import get_requires_y_tag +from onedal._device_offload import _transfer_to_host from onedal.neighbors import KNeighborsRegressor as onedal_KNeighborsRegressor from .._device_offload import dispatch, wrap_output_data from ..utils._array_api import enable_array_api, get_namespace from ..utils.validation import check_feature_names, validate_data from .common import KNeighborsDispatchingBase -from onedal._device_offload import _transfer_to_host + @enable_array_api @control_n_jobs(decorated_methods=["fit", "predict", "kneighbors", "score"]) @@ -144,7 +145,7 @@ def _onedal_fit(self, X, y, queue=None): # REFACTOR: Process regression targets in sklearnex before passing to onedal # This sets _shape and _y attributes self._process_regression_targets(y) - + onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, @@ -172,7 +173,7 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator._y = xp.reshape(self._y, (-1, 1)) else: self._onedal_estimator._y = self._y - + self._onedal_estimator.fit(X, y, queue=queue) self._save_attributes() From cf1d44d9645787a3dd1d72942b5c293231e18a6e Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 21 Oct 2025 12:22:31 -0700 Subject: [PATCH 66/87] fix: fix conformance test --- sklearnex/neighbors/common.py | 20 +++------------ sklearnex/neighbors/knn_classification.py | 2 ++ 
sklearnex/neighbors/knn_regression.py | 30 +++++++++++++++++++---- 3 files changed, 30 insertions(+), 22 deletions(-) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 010175ebff..91a06c6137 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -320,18 +320,6 @@ def _validate_n_classes(self): f"The number of classes has to be greater than one; got {length}" ) - def _validate_feature_count(self, X, method_name=""): - n_features = getattr(self, "n_features_in_", None) - shape = getattr(X, "shape", None) - if n_features and shape and len(shape) > 1 and shape[1] != n_features: - raise ValueError( - ( - f"X has {X.shape[1]} features, " - f"but {method_name} is expecting " - f"{n_features} features as input" - ) - ) - def _validate_kneighbors_bounds(self, n_neighbors, query_is_train, X): n_samples_fit = self.n_samples_fit_ if n_neighbors > n_samples_fit: @@ -350,13 +338,11 @@ def _kneighbors_validation(self, X, n_neighbors): """Shared validation for kneighbors method called from sklearnex layer. Validates: - - Feature count matches training data if X is provided - n_neighbors is within valid bounds if provided + + Note: Feature validation (count, names, etc.) happens in validate_data + called by _onedal_kneighbors, so we don't duplicate it here. """ - # Validate feature count if X is provided - if X is not None: - self._validate_feature_count(X) - # Validate n_neighbors bounds if provided if n_neighbors is not None: # Determine if query is the training set diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index ec35689f6a..36f199a5b5 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -188,12 +188,14 @@ def _onedal_fit(self, X, y, queue=None): def _onedal_predict(self, X, queue=None): # Use the unified helper from common.py (calls kneighbors + computes prediction) # This properly handles X=None (LOOCV) case + # Note: X validation happens in kneighbors result = self._predict_skl_classification(X) return result def _onedal_predict_proba(self, X, queue=None): # Call kneighbors through sklearnex (self.kneighbors is the sklearnex method) # This properly handles X=None case (LOOCV) with query_is_train logic + # Note: X validation happens in kneighbors neigh_dist, neigh_ind = self.kneighbors(X) # Use the helper method to compute class probabilities diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 37f15816d0..108cfa3a38 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -18,7 +18,7 @@ from sklearn.neighbors._regression import ( KNeighborsRegressor as _sklearn_KNeighborsRegressor, ) -from sklearn.utils.validation import check_is_fitted +from sklearn.utils.validation import check_is_fitted, assert_all_finite from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import sklearn_check_version @@ -134,13 +134,17 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): return result def _onedal_fit(self, X, y, queue=None): - xp, _ = get_namespace(X) - # REFACTOR: Use validate_data to convert pandas to numpy and validate types for X only - X = validate_data( + xp, _ = get_namespace(X, y) + # REFACTOR: Use validate_data with multi_output=True to preserve y shape + # (multi_output=False converts column vectors to 1D) + X, y = validate_data( self, X, + y, dtype=[xp.float64, xp.float32], accept_sparse="csr", + 
y_numeric=True, + multi_output=True, ) # REFACTOR: Process regression targets in sklearnex before passing to onedal # This sets _shape and _y attributes @@ -192,6 +196,7 @@ def _onedal_fit(self, X, y, queue=None): def _onedal_predict(self, X, queue=None): # Dispatch between GPU and SKL prediction methods # This logic matches onedal regressor predict() method but computation happens in sklearnex + # Note: X validation happens in kneighbors (for SKL path) or _predict_gpu (for GPU path) gpu_device = queue is not None and getattr(queue.sycl_device, "is_gpu", False) is_uniform_weights = getattr(self, "weights", "uniform") == "uniform" @@ -205,7 +210,22 @@ def _onedal_predict(self, X, queue=None): def _predict_gpu(self, X, queue=None): """GPU prediction path - calls onedal backend.""" - # Call onedal backend for GPU prediction (X is already validated by predict()) + # Validate X for GPU path (SKL path validation happens in kneighbors) + if X is not None: + xp, _ = get_namespace(X) + # For precomputed metric, only check NaN/inf, don't validate features + if getattr(self, "effective_metric_", self.metric) == "precomputed": + from ..utils.validation import assert_all_finite + assert_all_finite(X, allow_nan=False, input_name="X") + else: + X = validate_data( + self, + X, + dtype=[xp.float64, xp.float32], + accept_sparse="csr", + reset=False, + ) + # Call onedal backend for GPU prediction result = self._onedal_estimator._predict_gpu(X) return result From c2104accdf8e3d5d51f7ed11c233b96376156052 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 21 Oct 2025 12:25:13 -0700 Subject: [PATCH 67/87] fix: format --- sklearnex/neighbors/common.py | 4 ++-- sklearnex/neighbors/knn_regression.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 91a06c6137..0fdf1bdeec 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -339,8 +339,8 @@ def _kneighbors_validation(self, X, n_neighbors): Validates: - n_neighbors is within valid bounds if provided - - Note: Feature validation (count, names, etc.) happens in validate_data + + Note: Feature validation (count, names, etc.) happens in validate_data called by _onedal_kneighbors, so we don't duplicate it here. 
""" # Validate n_neighbors bounds if provided diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 108cfa3a38..c42e8d66d0 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -18,7 +18,7 @@ from sklearn.neighbors._regression import ( KNeighborsRegressor as _sklearn_KNeighborsRegressor, ) -from sklearn.utils.validation import check_is_fitted, assert_all_finite +from sklearn.utils.validation import assert_all_finite, check_is_fitted from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import sklearn_check_version @@ -216,6 +216,7 @@ def _predict_gpu(self, X, queue=None): # For precomputed metric, only check NaN/inf, don't validate features if getattr(self, "effective_metric_", self.metric) == "precomputed": from ..utils.validation import assert_all_finite + assert_all_finite(X, allow_nan=False, input_name="X") else: X = validate_data( From 503bf499add93134ce209341d545970f2c430c48 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 21 Oct 2025 12:35:01 -0700 Subject: [PATCH 68/87] fix: clean up unneeded var --- sklearnex/neighbors/_lof.py | 12 +++----- sklearnex/neighbors/common.py | 6 ++-- sklearnex/neighbors/knn_classification.py | 24 +++++---------- sklearnex/neighbors/knn_regression.py | 37 ++++++++--------------- sklearnex/neighbors/knn_unsupervised.py | 9 ++---- 5 files changed, 30 insertions(+), 58 deletions(-) diff --git a/sklearnex/neighbors/_lof.py b/sklearnex/neighbors/_lof.py index 4ce835e61e..728d09b8c4 100644 --- a/sklearnex/neighbors/_lof.py +++ b/sklearnex/neighbors/_lof.py @@ -111,7 +111,7 @@ def _onedal_fit(self, X, y, queue=None): return self def fit(self, X, y=None): - result = dispatch( + return dispatch( self, "fit", { @@ -121,7 +121,6 @@ def fit(self, X, y=None): X, None, ) - return result def _predict(self, X=None): check_is_fitted(self) @@ -144,8 +143,7 @@ def _predict(self, X=None): @wraps(_sklearn_LocalOutlierFactor.fit_predict, assigned=["__doc__"]) @wrap_output_data def fit_predict(self, X, y=None): - result = self.fit(X)._predict() - return result + return self.fit(X)._predict() def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): # Validate n_neighbors parameter first @@ -157,7 +155,7 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): # Validate kneighbors parameters (inherited from KNeighborsDispatchingBase) self._kneighbors_validation(X, n_neighbors) - result = dispatch( + return dispatch( self, "kneighbors", { @@ -168,7 +166,6 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors=n_neighbors, return_distance=return_distance, ) - return result kneighbors = wrap_output_data(_kneighbors) @@ -199,8 +196,7 @@ def score_samples(self, X): lrd_ratios_array = self._lrd[neighbors_indices_X] / X_lrd[:, np.newaxis] - result = -np.mean(lrd_ratios_array, axis=1) - return result + return -np.mean(lrd_ratios_array, axis=1) fit.__doc__ = _sklearn_LocalOutlierFactor.fit.__doc__ kneighbors.__doc__ = _sklearn_LocalOutlierFactor.kneighbors.__doc__ diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 0fdf1bdeec..83f5b35b6d 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -283,16 +283,14 @@ def _predict_skl_classification(self, X): if not self.outputs_2d_: # Single output: classes_[argmax(proba, axis=1)] - result = self.classes_[xp.argmax(proba, axis=1)] + return self.classes_[xp.argmax(proba, axis=1)] else: # Multi-output: 
apply argmax separately for each output result = [ classes_k[xp.argmax(proba_k, axis=1)] for classes_k, proba_k in zip(self.classes_, proba.T) ] - result = xp.asarray(result).T - - return result + return xp.asarray(result).T def _validate_targets(self, y, dtype): arr = _column_or_1d(y, warn=True) diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index 36f199a5b5..10114b5987 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -83,7 +83,7 @@ def fit(self, X, y): def predict(self, X): check_is_fitted(self) - result = dispatch( + return dispatch( self, "predict", { @@ -92,13 +92,12 @@ def predict(self, X): }, X, ) - return result @wrap_output_data def predict_proba(self, X): check_is_fitted(self) - result = dispatch( + return dispatch( self, "predict_proba", { @@ -107,13 +106,12 @@ def predict_proba(self, X): }, X, ) - return result @wrap_output_data def score(self, X, y, sample_weight=None): check_is_fitted(self) - result = dispatch( + return dispatch( self, "score", { @@ -124,7 +122,6 @@ def score(self, X, y, sample_weight=None): y, sample_weight=sample_weight, ) - return result @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): @@ -137,7 +134,7 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): # Validate kneighbors parameters (inherited from KNeighborsDispatchingBase) self._kneighbors_validation(X, n_neighbors) - result = dispatch( + return dispatch( self, "kneighbors", { @@ -148,7 +145,6 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors=n_neighbors, return_distance=return_distance, ) - return result def _onedal_fit(self, X, y, queue=None): xp, _ = get_namespace(X) @@ -189,8 +185,7 @@ def _onedal_predict(self, X, queue=None): # Use the unified helper from common.py (calls kneighbors + computes prediction) # This properly handles X=None (LOOCV) case # Note: X validation happens in kneighbors - result = self._predict_skl_classification(X) - return result + return self._predict_skl_classification(X) def _onedal_predict_proba(self, X, queue=None): # Call kneighbors through sklearnex (self.kneighbors is the sklearnex method) @@ -199,10 +194,9 @@ def _onedal_predict_proba(self, X, queue=None): neigh_dist, neigh_ind = self.kneighbors(X) # Use the helper method to compute class probabilities - result = self._compute_class_probabilities( + return self._compute_class_probabilities( neigh_dist, neigh_ind, self.weights, self._y, self.classes_, self.outputs_2d_ ) - return result def _onedal_kneighbors( self, X=None, n_neighbors=None, return_distance=True, queue=None @@ -226,10 +220,9 @@ def _onedal_kneighbors( ) # Apply post-processing (kd_tree sorting, removing self from results) - result = self._kneighbors_post_processing( + return self._kneighbors_post_processing( X, n_neighbors, return_distance, result, query_is_train ) - return result def _onedal_score(self, X, y, sample_weight=None, queue=None): # Convert array API to numpy for sklearn's accuracy_score @@ -237,10 +230,9 @@ def _onedal_score(self, X, y, sample_weight=None, queue=None): y = np.asarray(y) if sample_weight is not None: sample_weight = np.asarray(sample_weight) - result = accuracy_score( + return accuracy_score( y, self._onedal_predict(X, queue=queue), sample_weight=sample_weight ) - return result def _save_attributes(self): self.classes_ = self._onedal_estimator.classes_ diff --git a/sklearnex/neighbors/knn_regression.py 
b/sklearnex/neighbors/knn_regression.py index c42e8d66d0..c45ae1d9bc 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -81,7 +81,7 @@ def fit(self, X, y): def predict(self, X): check_is_fitted(self) - result = dispatch( + return dispatch( self, "predict", { @@ -90,13 +90,12 @@ def predict(self, X): }, X, ) - return result @wrap_output_data def score(self, X, y, sample_weight=None): check_is_fitted(self) - result = dispatch( + return dispatch( self, "score", { @@ -107,7 +106,6 @@ def score(self, X, y, sample_weight=None): y, sample_weight=sample_weight, ) - return result @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): @@ -120,7 +118,7 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): # Validate kneighbors parameters (inherited from KNeighborsDispatchingBase) self._kneighbors_validation(X, n_neighbors) - result = dispatch( + return dispatch( self, "kneighbors", { @@ -131,11 +129,10 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors=n_neighbors, return_distance=return_distance, ) - return result def _onedal_fit(self, X, y, queue=None): xp, _ = get_namespace(X, y) - # REFACTOR: Use validate_data with multi_output=True to preserve y shape + # Use validate_data with multi_output=True to preserve y shape # (multi_output=False converts column vectors to 1D) X, y = validate_data( self, @@ -146,7 +143,7 @@ def _onedal_fit(self, X, y, queue=None): y_numeric=True, multi_output=True, ) - # REFACTOR: Process regression targets in sklearnex before passing to onedal + # Process regression targets in sklearnex before passing to onedal # This sets _shape and _y attributes self._process_regression_targets(y) @@ -163,7 +160,7 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ - # REFACTOR: Pass pre-processed shape and _y to onedal + # Pass pre-processed shape and _y to onedal # For GPU backend, reshape _y to (-1, 1) before passing to onedal from onedal.utils import _sycl_queue_manager as QM @@ -171,8 +168,7 @@ def _onedal_fit(self, X, y, queue=None): gpu_device = queue_instance is not None and queue_instance.sycl_device.is_gpu self._onedal_estimator._shape = self._shape - # REFACTOR: Reshape _y for GPU backend (needs column vector) - # Following PCA pattern: all data preparation in sklearnex + # Reshape _y for GPU backend (needs column vector) if gpu_device: self._onedal_estimator._y = xp.reshape(self._y, (-1, 1)) else: @@ -181,7 +177,6 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator.fit(X, y, queue=queue) self._save_attributes() - # REFACTOR: Replicate the EXACT post-fit reshaping from original onedal code # Original onedal code (after fit): # if y is not None and _is_regressor(self): # _, xp, _ = _get_sycl_namespace(X) @@ -202,11 +197,10 @@ def _onedal_predict(self, X, queue=None): if gpu_device and is_uniform_weights: # GPU path: call onedal backend directly - result = self._predict_gpu(X, queue=queue) + return self._predict_gpu(X, queue=queue) else: # SKL path: call kneighbors (through sklearnex) then compute in sklearnex - result = self._predict_skl(X, queue=queue) - return result + return self._predict_skl(X, queue=queue) def _predict_gpu(self, X, queue=None): """GPU prediction path - calls onedal backend.""" @@ -227,14 +221,12 @@ def _predict_gpu(self, X, queue=None): reset=False, ) # Call onedal backend 
for GPU prediction - result = self._onedal_estimator._predict_gpu(X) - return result + return self._onedal_estimator._predict_gpu(X) def _predict_skl(self, X, queue=None): """SKL prediction path - calls kneighbors through sklearnex, computes prediction here.""" # Use the unified helper from common.py (calls kneighbors + computes prediction) - result = self._predict_skl_regression(X) - return result + return self._predict_skl_regression(X) def _onedal_kneighbors( self, X=None, n_neighbors=None, return_distance=True, queue=None @@ -250,7 +242,6 @@ def _onedal_kneighbors( reset=False, ) - # REFACTOR: All post-processing now in sklearnex following PCA pattern # Prepare inputs and handle query_is_train case X, n_neighbors, query_is_train = self._prepare_kneighbors_inputs(X, n_neighbors) @@ -260,10 +251,9 @@ def _onedal_kneighbors( ) # Apply post-processing (kd_tree sorting, removing self from results) - result = self._kneighbors_post_processing( + return self._kneighbors_post_processing( X, n_neighbors, return_distance, result, query_is_train ) - return result def _onedal_score(self, X, y, sample_weight=None, queue=None): y_pred = self._onedal_predict(X, queue=queue) @@ -273,8 +263,7 @@ def _onedal_score(self, X, y, sample_weight=None, queue=None): _, host_data = _transfer_to_host(y, y_pred, sample_weight) y, y_pred, sample_weight = host_data - result = r2_score(y, y_pred, sample_weight=sample_weight) - return result + return r2_score(y, y_pred, sample_weight=sample_weight) def _save_attributes(self): self.n_features_in_ = self._onedal_estimator.n_features_in_ diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index de1b3bd91b..c6f8a27893 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -84,7 +84,7 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): # Validate kneighbors parameters (inherited from KNeighborsDispatchingBase) self._kneighbors_validation(X, n_neighbors) - result = dispatch( + return dispatch( self, "kneighbors", { @@ -95,7 +95,6 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): n_neighbors=n_neighbors, return_distance=return_distance, ) - return result @wrap_output_data def radius_neighbors( @@ -108,7 +107,7 @@ def radius_neighbors( ): _sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, "_y", None)) check_is_fitted(self) - result = dispatch( + return dispatch( self, "radius_neighbors", { @@ -120,7 +119,6 @@ def radius_neighbors( return_distance=return_distance, sort_results=sort_results, ) - return result def radius_neighbors_graph( self, X=None, radius=None, mode="connectivity", sort_results=False @@ -197,10 +195,9 @@ def _onedal_kneighbors( ) # Apply post-processing (kd_tree sorting, removing self from results) - result = self._kneighbors_post_processing( + return self._kneighbors_post_processing( X, n_neighbors, return_distance, result, query_is_train ) - return result def _save_attributes(self): self.classes_ = self._onedal_estimator.classes_ From b4e6423d9087db966c1e3f5f0a087941ddd11ab7 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 21 Oct 2025 13:16:12 -0700 Subject: [PATCH 69/87] fix: attributeerror --- sklearnex/neighbors/knn_regression.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index c45ae1d9bc..97a94893a2 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -134,13 
+134,14 @@ def _onedal_fit(self, X, y, queue=None): xp, _ = get_namespace(X, y) # Use validate_data with multi_output=True to preserve y shape # (multi_output=False converts column vectors to 1D) + # Note: Don't use y_numeric=True with multi_output=True for array API + # (sklearn's _check_y tries to access dtype.kind which doesn't exist on array API dtypes) X, y = validate_data( self, X, y, dtype=[xp.float64, xp.float32], accept_sparse="csr", - y_numeric=True, multi_output=True, ) # Process regression targets in sklearnex before passing to onedal From f3c949b03ff6962a53ea261200ca4b54a90943a6 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 21 Oct 2025 18:35:04 -0700 Subject: [PATCH 70/87] fix: spmd also use skelarnex neighbors --- onedal/spmd/neighbors/neighbors.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/onedal/spmd/neighbors/neighbors.py b/onedal/spmd/neighbors/neighbors.py index 94deec6826..cc55ee3e21 100644 --- a/onedal/spmd/neighbors/neighbors.py +++ b/onedal/spmd/neighbors/neighbors.py @@ -16,9 +16,11 @@ from ..._device_offload import support_input_format, supports_queue from ...common._backend import bind_spmd_backend -from ...neighbors import KNeighborsClassifier as KNeighborsClassifier_Batch -from ...neighbors import KNeighborsRegressor as KNeighborsRegressor_Batch -from ...neighbors import NearestNeighbors as NearestNeighbors_Batch + +# Import from sklearnex instead of onedal to get target processing in sklearnex layer +from sklearnex.neighbors import KNeighborsClassifier as KNeighborsClassifier_Batch +from sklearnex.neighbors import KNeighborsRegressor as KNeighborsRegressor_Batch +from sklearnex.neighbors import NearestNeighbors as NearestNeighbors_Batch class KNeighborsClassifier(KNeighborsClassifier_Batch): From db8070d9838c0d81e47ce5b12aab98d091443de3 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 21 Oct 2025 23:50:24 -0700 Subject: [PATCH 71/87] test: test without classes_check in onedal neighbor --- onedal/neighbors/neighbors.py | 10 +++++----- onedal/spmd/neighbors/neighbors.py | 8 +++----- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index b79e2c7eaf..d19d91abeb 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -128,11 +128,11 @@ def _fit(self, X, y): # This code is now commented out - processing MUST happen in sklearnex before calling fit # Assertion: Verify that sklearnex has done the preprocessing if _is_classifier(self): - if not hasattr(self, "classes_") or self.classes_ is None: - raise ValueError( - "Classification target processing must be done in sklearnex layer before calling onedal fit. " - "classes_ attribute is not set. This indicates the refactoring is incomplete." - ) + # if not hasattr(self, "classes_") or self.classes_ is None: + # raise ValueError( + # "Classification target processing must be done in sklearnex layer before calling onedal fit. " + # "classes_ attribute is not set. This indicates the refactoring is incomplete." + # ) if not hasattr(self, "_y") or self._y is None: raise ValueError( "Classification target processing must be done in sklearnex layer before calling onedal fit. 
" diff --git a/onedal/spmd/neighbors/neighbors.py b/onedal/spmd/neighbors/neighbors.py index cc55ee3e21..94deec6826 100644 --- a/onedal/spmd/neighbors/neighbors.py +++ b/onedal/spmd/neighbors/neighbors.py @@ -16,11 +16,9 @@ from ..._device_offload import support_input_format, supports_queue from ...common._backend import bind_spmd_backend - -# Import from sklearnex instead of onedal to get target processing in sklearnex layer -from sklearnex.neighbors import KNeighborsClassifier as KNeighborsClassifier_Batch -from sklearnex.neighbors import KNeighborsRegressor as KNeighborsRegressor_Batch -from sklearnex.neighbors import NearestNeighbors as NearestNeighbors_Batch +from ...neighbors import KNeighborsClassifier as KNeighborsClassifier_Batch +from ...neighbors import KNeighborsRegressor as KNeighborsRegressor_Batch +from ...neighbors import NearestNeighbors as NearestNeighbors_Batch class KNeighborsClassifier(KNeighborsClassifier_Batch): From 65b160bbb00c6c2996291c03a5eb337532eee6d4 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Wed, 22 Oct 2025 23:32:49 -0700 Subject: [PATCH 72/87] fix: spmd issue --- sklearnex/neighbors/common.py | 14 +++++++--- sklearnex/neighbors/knn_unsupervised.py | 2 +- sklearnex/spmd/neighbors/__init__.py | 2 +- sklearnex/spmd/neighbors/neighbors.py | 35 +++++++++++++++++++++++++ 4 files changed, 47 insertions(+), 6 deletions(-) create mode 100644 sklearnex/spmd/neighbors/neighbors.py diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 83f5b35b6d..bb178591ea 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -603,7 +603,9 @@ def _fit_validation(self, X, y=None): else: self._fit_method = self.algorithm - if hasattr(self, "_onedal_estimator"): + # Only delete _onedal_estimator if it's an instance attribute, not a class attribute + # (SPMD classes define _onedal_estimator as a staticmethod at class level) + if "_onedal_estimator" in self.__dict__: delattr(self, "_onedal_estimator") # To cover test case when we pass patched # estimator as an input for other estimator @@ -613,7 +615,8 @@ def _fit_validation(self, X, y=None): self._fit_method = X._fit_method self.n_samples_fit_ = X.n_samples_fit_ self.n_features_in_ = X.n_features_in_ - if hasattr(X, "_onedal_estimator"): + # Check if X has _onedal_estimator as an instance attribute (not class attribute) + if "_onedal_estimator" in X.__dict__: self.effective_metric_params_.pop("p") if self._fit_method == "ball_tree": X._tree = BallTree( @@ -714,7 +717,8 @@ def _onedal_supported(self, device, method_name, *data): if is_classifier: # Use numpy for unique (standard sklearn pattern) class_count = len(np.unique(np.asarray(y))) - if hasattr(self, "_onedal_estimator"): + # Only access _onedal_estimator if it's an instance attribute (not a class-level staticmethod) + if "_onedal_estimator" in self.__dict__: y = self._onedal_estimator._y if y is not None and hasattr(y, "ndim") and hasattr(y, "shape"): is_single_output = y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1 @@ -773,8 +777,10 @@ def _onedal_supported(self, device, method_name, *data): ) return patching_status if method_name in ["predict", "predict_proba", "kneighbors", "score"]: + # Check if _onedal_estimator is an instance attribute (model was trained) + # For SPMD classes, _onedal_estimator is a class-level staticmethod, so we check __dict__ patching_status.and_condition( - hasattr(self, "_onedal_estimator"), "oneDAL model was not trained." + "_onedal_estimator" in self.__dict__, "oneDAL model was not trained." 
) return patching_status raise RuntimeError(f"Unknown method {method_name} in {class_name}") diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index c6f8a27893..f2c5d950d0 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -101,7 +101,7 @@ def radius_neighbors( self, X=None, radius=None, return_distance=True, sort_results=False ): if ( - hasattr(self, "_onedal_estimator") + "_onedal_estimator" in self.__dict__ or getattr(self, "_tree", 0) is None and self._fit_method == "kd_tree" ): diff --git a/sklearnex/spmd/neighbors/__init__.py b/sklearnex/spmd/neighbors/__init__.py index 44cb849591..3f74cca4ad 100644 --- a/sklearnex/spmd/neighbors/__init__.py +++ b/sklearnex/spmd/neighbors/__init__.py @@ -14,7 +14,7 @@ # limitations under the License. # ============================================================================== -from onedal.spmd.neighbors import ( +from .neighbors import ( KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors, diff --git a/sklearnex/spmd/neighbors/neighbors.py b/sklearnex/spmd/neighbors/neighbors.py new file mode 100644 index 0000000000..485d48e955 --- /dev/null +++ b/sklearnex/spmd/neighbors/neighbors.py @@ -0,0 +1,35 @@ +# ============================================================================== +# Copyright 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from onedal.spmd.neighbors import KNeighborsClassifier as onedal_KNeighborsClassifier +from onedal.spmd.neighbors import KNeighborsRegressor as onedal_KNeighborsRegressor +from onedal.spmd.neighbors import NearestNeighbors as onedal_NearestNeighbors + +from ...neighbors import KNeighborsClassifier as base_KNeighborsClassifier +from ...neighbors import KNeighborsRegressor as base_KNeighborsRegressor +from ...neighbors import NearestNeighbors as base_NearestNeighbors + + +class KNeighborsClassifier(base_KNeighborsClassifier): + _onedal_estimator = staticmethod(onedal_KNeighborsClassifier) + + +class KNeighborsRegressor(base_KNeighborsRegressor): + _onedal_estimator = staticmethod(onedal_KNeighborsRegressor) + + +class NearestNeighbors(base_NearestNeighbors): + _onedal_estimator = staticmethod(onedal_NearestNeighbors) From 231eb325bc0c6beded04de9f85ff0f028c4282d1 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Wed, 22 Oct 2025 23:37:39 -0700 Subject: [PATCH 73/87] fix: format --- sklearnex/spmd/neighbors/__init__.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sklearnex/spmd/neighbors/__init__.py b/sklearnex/spmd/neighbors/__init__.py index 3f74cca4ad..8036511d9f 100644 --- a/sklearnex/spmd/neighbors/__init__.py +++ b/sklearnex/spmd/neighbors/__init__.py @@ -14,10 +14,6 @@ # limitations under the License. 
# ============================================================================== -from .neighbors import ( - KNeighborsClassifier, - KNeighborsRegressor, - NearestNeighbors, -) +from .neighbors import KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors __all__ = ["KNeighborsClassifier", "KNeighborsRegressor", "NearestNeighbors"] From 64bb25e97a220942e305e04d6d1941e0c56211d8 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Thu, 23 Oct 2025 18:49:44 -0700 Subject: [PATCH 74/87] fix: make sure y is numeric in regrresor --- sklearnex/neighbors/knn_regression.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 97a94893a2..98d9123dff 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -143,6 +143,7 @@ def _onedal_fit(self, X, y, queue=None): dtype=[xp.float64, xp.float32], accept_sparse="csr", multi_output=True, + y_numeric=True, ) # Process regression targets in sklearnex before passing to onedal # This sets _shape and _y attributes From 9dfcb708443b519882e7044e964e544513948322 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Thu, 23 Oct 2025 23:34:10 -0700 Subject: [PATCH 75/87] fix: fix spmd test --- sklearnex/neighbors/common.py | 73 ++++++++++++----------- sklearnex/neighbors/knn_classification.py | 31 +++++++--- 2 files changed, 61 insertions(+), 43 deletions(-) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index bb178591ea..cdd39ba160 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -180,7 +180,13 @@ def _compute_class_probabilities( """ from ..utils.validation import _num_samples - # Array API support: get namespace from input arrays + # Transfer all arrays to host to ensure they're on the same queue/device + # This is needed especially for SPMD where arrays might be on different queues + _, (neigh_dist, neigh_ind, y_train) = _transfer_to_host( + neigh_dist, neigh_ind, y_train + ) + + # After transfer, get the array namespace (will be numpy for host arrays) xp, _ = get_namespace(neigh_dist, neigh_ind, y_train) _y = y_train @@ -366,32 +372,25 @@ def _prepare_kneighbors_inputs(self, X, n_neighbors): - query_is_train: Boolean flag indicating if original X was None """ query_is_train = X is None + X = self._fit_X + # Include an extra neighbor to account for the sample itself being + # returned, which is removed later + if n_neighbors is None: + n_neighbors = self.n_neighbors + n_neighbors += 1 - if X is not None: - # Get the array namespace to use correct dtypes - xp, _ = get_namespace(X) - # Use _check_array like main branch, with array API dtype support - X = _check_array(X, dtype=[xp.float64, xp.float32], accept_sparse="csr") - else: - X = self._fit_X - # Include an extra neighbor to account for the sample itself being - # returned, which is removed later - if n_neighbors is None: - n_neighbors = self.n_neighbors - n_neighbors += 1 - - # Validate bounds AFTER adding +1 (replicates original onedal behavior) - # Original code in onedal had validation after n_neighbors += 1 - n_samples_fit = self.n_samples_fit_ - if n_neighbors > n_samples_fit: - n_neighbors_for_msg = ( - n_neighbors - 1 - ) # for error message, show original value - raise ValueError( - f"Expected n_neighbors < n_samples_fit, but " - f"n_neighbors = {n_neighbors_for_msg}, n_samples_fit = {n_samples_fit}, " - f"n_samples = {X.shape[0]}" - ) + # Validate bounds AFTER adding +1 (replicates original onedal behavior) + # Original 
code in onedal had validation after n_neighbors += 1 + n_samples_fit = self.n_samples_fit_ + if n_neighbors > n_samples_fit: + n_neighbors_for_msg = ( + n_neighbors - 1 + ) # for error message, show original value + raise ValueError( + f"Expected n_neighbors < n_samples_fit, but " + f"n_neighbors = {n_neighbors_for_msg}, n_samples_fit = {n_samples_fit}, " + f"n_samples = {X.shape[0]}" + ) return X, n_neighbors, query_is_train @@ -470,10 +469,16 @@ def _kneighbors_post_processing( return neigh_dist, neigh_ind return neigh_ind - def _process_classification_targets(self, y): + def _process_classification_targets(self, y, skip_validation=False): """Process classification targets and set class-related attributes. - Note: y should already be converted to numpy array via validate_data before calling this. + Parameters + ---------- + y : array-like + Target values + skip_validation : bool, default=False + If True, skip _check_classification_targets validation. + Used when use_raw_input=True (raw array API arrays like dpctl.usm_ndarray). """ # Array API support: get namespace from y xp, _ = get_namespace(y) @@ -491,8 +496,9 @@ def _process_classification_targets(self, y): else: self.outputs_2d_ = True - # Validate classification targets - _check_classification_targets(y) + # Validate classification targets (skip for raw array API inputs) + if not skip_validation: + _check_classification_targets(y) # Process classes - note: np.unique is used for class extraction # This is acceptable as classes are typically numpy arrays in sklearn @@ -500,8 +506,9 @@ def _process_classification_targets(self, y): self._y = xp.empty(y.shape, dtype=xp.int32) for k in range(self._y.shape[1]): # Use numpy unique for class extraction (standard sklearn pattern) - y_k = np.asarray(y[:, k]) - classes, indices = np.unique(y_k, return_inverse=True) + # Transfer to host first to ensure proper numpy array conversion + y_k_host = np.asarray(_transfer_to_host(y[:, k])[1][0]) + classes, indices = np.unique(y_k_host, return_inverse=True) self.classes_.append(classes) self._y[:, k] = xp.asarray(indices, dtype=xp.int32) @@ -526,8 +533,6 @@ def _process_regression_targets(self, y): For now, just store _shape and _y as-is. The reshape happens after onedal fit is complete. 
""" - import sys - # EXACT replication of original onedal shape processing shape = getattr(y, "shape", None) self._shape = shape if shape is not None else y.shape diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index 10114b5987..f1a0e28226 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -26,6 +26,7 @@ from daal4py.sklearn.utils.validation import get_requires_y_tag from onedal.neighbors import KNeighborsClassifier as onedal_KNeighborsClassifier +from .._config import get_config from .._device_offload import dispatch, wrap_output_data from ..utils._array_api import enable_array_api, get_namespace from ..utils.validation import check_feature_names, validate_data @@ -148,15 +149,24 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): def _onedal_fit(self, X, y, queue=None): xp, _ = get_namespace(X) - X, y = validate_data( - self, - X, - y, - dtype=[xp.float64, xp.float32], - accept_sparse="csr", - ) + + # When use_raw_input=True, dispatch bypasses _onedal_supported() which calls _fit_validation() + # We need to call it here to set effective_metric_ and effective_metric_params_ + use_raw_input = get_config()["use_raw_input"] + if use_raw_input: + self._fit_validation(X, y) + else: + X, y = validate_data( + self, + X, + y, + dtype=[xp.float64, xp.float32], + accept_sparse="csr", + ) + # Process classification targets in sklearnex before passing to onedal - self._process_classification_targets(y) + # When use_raw_input=True, y is raw array API (dpctl/dpnp), skip sklearn validation + self._process_classification_targets(y, skip_validation=use_raw_input) onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, @@ -201,7 +211,10 @@ def _onedal_predict_proba(self, X, queue=None): def _onedal_kneighbors( self, X=None, n_neighbors=None, return_distance=True, queue=None ): - if X is not None: + # Only skip validation when use_raw_input=True (SPMD mode) + use_raw_input = get_config()["use_raw_input"] + + if X is not None and not use_raw_input: xp, _ = get_namespace(X) X = validate_data( self, From 93fcbfdc8a4ab994e4cc1a7e2aebfda8918702a3 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Fri, 24 Oct 2025 08:25:03 -0700 Subject: [PATCH 76/87] fix: common tests --- sklearnex/neighbors/common.py | 47 +++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index cdd39ba160..840b43ee13 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -361,6 +361,10 @@ def _prepare_kneighbors_inputs(self, X, n_neighbors): Handles query_is_train case: when X=None, sets X to training data and adds +1 to n_neighbors. Validates n_neighbors bounds AFTER adding +1 (replicates original onedal behavior). + NOTE: Caller is responsible for validating X (via validate_data or _check_array). + This function does NOT validate X to avoid double validation and to support + use_raw_input mode where validation should be skipped. 
+ Args: X: Query data or None n_neighbors: Number of neighbors or None @@ -372,25 +376,32 @@ def _prepare_kneighbors_inputs(self, X, n_neighbors): - query_is_train: Boolean flag indicating if original X was None """ query_is_train = X is None - X = self._fit_X - # Include an extra neighbor to account for the sample itself being - # returned, which is removed later - if n_neighbors is None: - n_neighbors = self.n_neighbors - n_neighbors += 1 - # Validate bounds AFTER adding +1 (replicates original onedal behavior) - # Original code in onedal had validation after n_neighbors += 1 - n_samples_fit = self.n_samples_fit_ - if n_neighbors > n_samples_fit: - n_neighbors_for_msg = ( - n_neighbors - 1 - ) # for error message, show original value - raise ValueError( - f"Expected n_neighbors < n_samples_fit, but " - f"n_neighbors = {n_neighbors_for_msg}, n_samples_fit = {n_samples_fit}, " - f"n_samples = {X.shape[0]}" - ) + if X is not None: + # X validation should already be done by caller + # Do NOT call _check_array here to avoid double validation + # and to support use_raw_input mode + pass + else: + X = self._fit_X + # Include an extra neighbor to account for the sample itself being + # returned, which is removed later + if n_neighbors is None: + n_neighbors = self.n_neighbors + n_neighbors += 1 + + # Validate bounds AFTER adding +1 (replicates original onedal behavior) + # Original code in onedal had validation after n_neighbors += 1 + n_samples_fit = self.n_samples_fit_ + if n_neighbors > n_samples_fit: + n_neighbors_for_msg = ( + n_neighbors - 1 + ) # for error message, show original value + raise ValueError( + f"Expected n_neighbors < n_samples_fit, but " + f"n_neighbors = {n_neighbors_for_msg}, n_samples_fit = {n_samples_fit}, " + f"n_samples = {X.shape[0]}" + ) return X, n_neighbors, query_is_train From 295be53e840c90e8aae3e62edb8cf41265cc25ec Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Fri, 24 Oct 2025 16:14:45 -0700 Subject: [PATCH 77/87] fix: spmd issues --- sklearnex/neighbors/common.py | 73 ++++++++++++++--------- sklearnex/neighbors/knn_classification.py | 45 +++++++------- sklearnex/neighbors/knn_regression.py | 69 ++++++++++----------- 3 files changed, 100 insertions(+), 87 deletions(-) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 840b43ee13..b84ea8616f 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -71,7 +71,9 @@ def _get_weights(self, dist, weights): # if user attempts to classify a point that was zero distance from one # or more training points, those training points are weighted as 1.0 # and the other points as 0.0 - if dist.dtype is xp.asarray(object).dtype: + # Check for object dtype - use string comparison for Array API compatibility + is_object_dtype = str(dist.dtype) == 'object' or (hasattr(dist.dtype, 'kind') and dist.dtype.kind == 'O') + if is_object_dtype: for point_dist_i, point_dist in enumerate(dist): # check if point_dist is iterable # (ex: RadiusNeighborClassifier.predict may set an element of @@ -138,7 +140,11 @@ def _compute_weighted_prediction(self, neigh_dist, neigh_ind, weights_param, y_t ) # Shape: (n_samples, n_neighbors, n_outputs) y_pred = xp.mean(gathered, axis=1) else: - y_pred = xp.empty((neigh_ind.shape[0], _y.shape[1]), dtype=xp.float64) + # Create y_pred with proper device/queue by using zeros_like pattern + # This ensures device compatibility in SPMD mode + y_pred_shape = (neigh_ind.shape[0], _y.shape[1]) + # Create on same device as neigh_ind to ensure queue compatibility + 
y_pred = xp.zeros(y_pred_shape, dtype=xp.float64, device=getattr(neigh_ind, 'device', None)) denom = xp.sum(weights, axis=1) for j in range(_y.shape[1]): @@ -316,6 +322,40 @@ def _validate_n_neighbors(self, n_neighbors): "enter integer value" % type(n_neighbors) ) + def _set_effective_metric(self): + """Set effective_metric_ and effective_metric_params_ without validation. + + Used when we need to set metrics but can't call _fit_validation + (e.g., in SPMD mode with use_raw_input=True where sklearn validation + would try to convert array API to numpy). + """ + if self.metric_params is not None and "p" in self.metric_params: + if self.p is not None: + warnings.warn( + "Parameter p is found in metric_params. " + "The corresponding parameter from __init__ " + "is ignored.", + SyntaxWarning, + stacklevel=2, + ) + self.effective_metric_params_ = self.metric_params.copy() + effective_p = self.metric_params["p"] + else: + self.effective_metric_params_ = {} + effective_p = self.p + + self.effective_metric_params_["p"] = effective_p + self.effective_metric_ = self.metric + # For minkowski distance, use more efficient methods where available + if self.metric == "minkowski": + p = self.effective_metric_params_["p"] + if p == 1: + self.effective_metric_ = "manhattan" + elif p == 2: + self.effective_metric_ = "euclidean" + elif p == np.inf: + self.effective_metric_ = "chebyshev" + def _validate_n_classes(self): """Validate that the classifier has at least 2 classes.""" length = 0 if self.classes_ is None else len(self.classes_) @@ -556,32 +596,9 @@ def _fit_validation(self, X, y=None): # check_feature_names(self, X, reset=True) # Validate n_neighbors parameter self._validate_n_neighbors(self.n_neighbors) - if self.metric_params is not None and "p" in self.metric_params: - if self.p is not None: - warnings.warn( - "Parameter p is found in metric_params. 
" - "The corresponding parameter from __init__ " - "is ignored.", - SyntaxWarning, - stacklevel=2, - ) - self.effective_metric_params_ = self.metric_params.copy() - effective_p = self.metric_params["p"] - else: - self.effective_metric_params_ = {} - effective_p = self.p - - self.effective_metric_params_["p"] = effective_p - self.effective_metric_ = self.metric - # For minkowski distance, use more efficient methods where available - if self.metric == "minkowski": - p = self.effective_metric_params_["p"] - if p == 1: - self.effective_metric_ = "manhattan" - elif p == 2: - self.effective_metric_ = "euclidean" - elif p == np.inf: - self.effective_metric_ = "chebyshev" + + # Set effective metric and parameters + self._set_effective_metric() if not isinstance(X, (KDTree, BallTree, _sklearn_NeighborsBase)): # Use _check_array like main branch, but with array API dtype support diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index f1a0e28226..82e2fcf840 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -24,6 +24,7 @@ from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import sklearn_check_version from daal4py.sklearn.utils.validation import get_requires_y_tag +from onedal._device_offload import _transfer_to_host from onedal.neighbors import KNeighborsClassifier as onedal_KNeighborsClassifier from .._config import get_config @@ -150,12 +151,8 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): def _onedal_fit(self, X, y, queue=None): xp, _ = get_namespace(X) - # When use_raw_input=True, dispatch bypasses _onedal_supported() which calls _fit_validation() - # We need to call it here to set effective_metric_ and effective_metric_params_ - use_raw_input = get_config()["use_raw_input"] - if use_raw_input: - self._fit_validation(X, y) - else: + # Validation step (follows PCA pattern) + if not get_config()["use_raw_input"]: X, y = validate_data( self, X, @@ -163,10 +160,16 @@ def _onedal_fit(self, X, y, queue=None): dtype=[xp.float64, xp.float32], accept_sparse="csr", ) + # Set effective metric after validation + self._set_effective_metric() + else: + # SPMD mode: skip validation but still set effective metric + self._set_effective_metric() - # Process classification targets in sklearnex before passing to onedal - # When use_raw_input=True, y is raw array API (dpctl/dpnp), skip sklearn validation - self._process_classification_targets(y, skip_validation=use_raw_input) + # Process classification targets before passing to onedal + self._process_classification_targets(y, skip_validation=get_config()["use_raw_input"]) + + # Call onedal backend onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, @@ -179,16 +182,14 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ - - # Pass both original and processed targets to onedal - # onedal needs the processed classes_ and _y attributes that we just set self._onedal_estimator.classes_ = self.classes_ self._onedal_estimator._y = self._y self._onedal_estimator.outputs_2d_ = self.outputs_2d_ - self._onedal_estimator._shape = self._shape # Pass shape from sklearnex + self._onedal_estimator._shape = self._shape - # Pass original y to onedal - it will use the pre-set classes_ and _y attributes we 
just assigned self._onedal_estimator.fit(X, y, queue=queue) + + # Post-processing self._save_attributes() def _onedal_predict(self, X, queue=None): @@ -238,14 +239,14 @@ def _onedal_kneighbors( ) def _onedal_score(self, X, y, sample_weight=None, queue=None): - # Convert array API to numpy for sklearn's accuracy_score - # Note: validate_data does NOT convert array API to numpy, so we do it explicitly - y = np.asarray(y) - if sample_weight is not None: - sample_weight = np.asarray(sample_weight) - return accuracy_score( - y, self._onedal_predict(X, queue=queue), sample_weight=sample_weight - ) + # Get predictions + y_pred = self._onedal_predict(X, queue=queue) + + # Convert array API to numpy for sklearn's accuracy_score using _transfer_to_host + # This properly handles Array API arrays that don't allow implicit conversion + _, (y, y_pred, sample_weight) = _transfer_to_host(y, y_pred, sample_weight) + + return accuracy_score(y, y_pred, sample_weight=sample_weight) def _save_attributes(self): self.classes_ = self._onedal_estimator.classes_ diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 98d9123dff..ac321e393b 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -30,7 +30,8 @@ from ..utils._array_api import enable_array_api, get_namespace from ..utils.validation import check_feature_names, validate_data from .common import KNeighborsDispatchingBase - +from .._config import get_config +from onedal.utils import _sycl_queue_manager as QM @enable_array_api @control_n_jobs(decorated_methods=["fit", "predict", "kneighbors", "score"]) @@ -130,25 +131,29 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): return_distance=return_distance, ) - def _onedal_fit(self, X, y, queue=None): + def _onedal_fit(self, X, y, queue=None): xp, _ = get_namespace(X, y) - # Use validate_data with multi_output=True to preserve y shape - # (multi_output=False converts column vectors to 1D) - # Note: Don't use y_numeric=True with multi_output=True for array API - # (sklearn's _check_y tries to access dtype.kind which doesn't exist on array API dtypes) - X, y = validate_data( - self, - X, - y, - dtype=[xp.float64, xp.float32], - accept_sparse="csr", - multi_output=True, - y_numeric=True, - ) - # Process regression targets in sklearnex before passing to onedal - # This sets _shape and _y attributes + + # Validation step (follows PCA pattern) + if not get_config()["use_raw_input"]: + X, y = validate_data( + self, + X, + y, + dtype=[xp.float64, xp.float32], + accept_sparse="csr", + multi_output=True, + ) + # Set effective metric after validation + self._set_effective_metric() + else: + # SPMD mode: skip validation but still set effective metric + self._set_effective_metric() + + # Process regression targets before passing to onedal self._process_regression_targets(y) + # Call onedal backend onedal_params = { "n_neighbors": self.n_neighbors, "weights": self.weights, @@ -161,33 +166,23 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator.requires_y = get_requires_y_tag(self) self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ - - # Pass pre-processed shape and _y to onedal - # For GPU backend, reshape _y to (-1, 1) before passing to onedal - from onedal.utils import _sycl_queue_manager as QM - + self._onedal_estimator._shape = self._shape + + # Reshape _y for GPU backend queue_instance = QM.get_global_queue() 
gpu_device = queue_instance is not None and queue_instance.sycl_device.is_gpu - - self._onedal_estimator._shape = self._shape - # Reshape _y for GPU backend (needs column vector) if gpu_device: self._onedal_estimator._y = xp.reshape(self._y, (-1, 1)) else: self._onedal_estimator._y = self._y self._onedal_estimator.fit(X, y, queue=queue) + + # Post-processing: save attributes and reshape _y self._save_attributes() - - # Original onedal code (after fit): - # if y is not None and _is_regressor(self): - # _, xp, _ = _get_sycl_namespace(X) - # self._y = y if self._shape is None else xp.reshape(y, self._shape) - # Now doing this in sklearnex layer if y is not None: xp, _ = get_namespace(y) self._y = y if self._shape is None else xp.reshape(y, self._shape) - # Also update the onedal estimator's _y since that's what gets used in predict self._onedal_estimator._y = self._y def _onedal_predict(self, X, queue=None): @@ -233,8 +228,8 @@ def _predict_skl(self, X, queue=None): def _onedal_kneighbors( self, X=None, n_neighbors=None, return_distance=True, queue=None ): - # Validate X to convert array API/pandas to numpy and check feature names (only if X is not None) - if X is not None: + # Validation step + if X is not None and not get_config()["use_raw_input"]: xp, _ = get_namespace(X) X = validate_data( self, @@ -244,15 +239,15 @@ def _onedal_kneighbors( reset=False, ) - # Prepare inputs and handle query_is_train case + # Prepare inputs X, n_neighbors, query_is_train = self._prepare_kneighbors_inputs(X, n_neighbors) - # Get raw results from onedal backend + # Call onedal backend result = self._onedal_estimator.kneighbors( X, n_neighbors, return_distance, queue=queue ) - # Apply post-processing (kd_tree sorting, removing self from results) + # Post-processing return self._kneighbors_post_processing( X, n_neighbors, return_distance, result, query_is_train ) From db1e13068fbc8d896a268652cf1978916ab53368 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Fri, 24 Oct 2025 16:15:57 -0700 Subject: [PATCH 78/87] fix: format --- sklearnex/neighbors/common.py | 14 +++++++++----- sklearnex/neighbors/knn_classification.py | 12 +++++++----- sklearnex/neighbors/knn_regression.py | 15 ++++++++------- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index b84ea8616f..ac05d775ad 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -72,7 +72,9 @@ def _get_weights(self, dist, weights): # or more training points, those training points are weighted as 1.0 # and the other points as 0.0 # Check for object dtype - use string comparison for Array API compatibility - is_object_dtype = str(dist.dtype) == 'object' or (hasattr(dist.dtype, 'kind') and dist.dtype.kind == 'O') + is_object_dtype = str(dist.dtype) == "object" or ( + hasattr(dist.dtype, "kind") and dist.dtype.kind == "O" + ) if is_object_dtype: for point_dist_i, point_dist in enumerate(dist): # check if point_dist is iterable @@ -144,7 +146,9 @@ def _compute_weighted_prediction(self, neigh_dist, neigh_ind, weights_param, y_t # This ensures device compatibility in SPMD mode y_pred_shape = (neigh_ind.shape[0], _y.shape[1]) # Create on same device as neigh_ind to ensure queue compatibility - y_pred = xp.zeros(y_pred_shape, dtype=xp.float64, device=getattr(neigh_ind, 'device', None)) + y_pred = xp.zeros( + y_pred_shape, dtype=xp.float64, device=getattr(neigh_ind, "device", None) + ) denom = xp.sum(weights, axis=1) for j in range(_y.shape[1]): @@ -324,7 +328,7 @@ def 
_validate_n_neighbors(self, n_neighbors): def _set_effective_metric(self): """Set effective_metric_ and effective_metric_params_ without validation. - + Used when we need to set metrics but can't call _fit_validation (e.g., in SPMD mode with use_raw_input=True where sklearn validation would try to convert array API to numpy). @@ -343,7 +347,7 @@ def _set_effective_metric(self): else: self.effective_metric_params_ = {} effective_p = self.p - + self.effective_metric_params_["p"] = effective_p self.effective_metric_ = self.metric # For minkowski distance, use more efficient methods where available @@ -596,7 +600,7 @@ def _fit_validation(self, X, y=None): # check_feature_names(self, X, reset=True) # Validate n_neighbors parameter self._validate_n_neighbors(self.n_neighbors) - + # Set effective metric and parameters self._set_effective_metric() diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index 82e2fcf840..e86e7c433d 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -167,8 +167,10 @@ def _onedal_fit(self, X, y, queue=None): self._set_effective_metric() # Process classification targets before passing to onedal - self._process_classification_targets(y, skip_validation=get_config()["use_raw_input"]) - + self._process_classification_targets( + y, skip_validation=get_config()["use_raw_input"] + ) + # Call onedal backend onedal_params = { "n_neighbors": self.n_neighbors, @@ -188,7 +190,7 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator._shape = self._shape self._onedal_estimator.fit(X, y, queue=queue) - + # Post-processing self._save_attributes() @@ -241,11 +243,11 @@ def _onedal_kneighbors( def _onedal_score(self, X, y, sample_weight=None, queue=None): # Get predictions y_pred = self._onedal_predict(X, queue=queue) - + # Convert array API to numpy for sklearn's accuracy_score using _transfer_to_host # This properly handles Array API arrays that don't allow implicit conversion _, (y, y_pred, sample_weight) = _transfer_to_host(y, y_pred, sample_weight) - + return accuracy_score(y, y_pred, sample_weight=sample_weight) def _save_attributes(self): diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index ac321e393b..1313e28bba 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -25,13 +25,14 @@ from daal4py.sklearn.utils.validation import get_requires_y_tag from onedal._device_offload import _transfer_to_host from onedal.neighbors import KNeighborsRegressor as onedal_KNeighborsRegressor +from onedal.utils import _sycl_queue_manager as QM +from .._config import get_config from .._device_offload import dispatch, wrap_output_data from ..utils._array_api import enable_array_api, get_namespace from ..utils.validation import check_feature_names, validate_data from .common import KNeighborsDispatchingBase -from .._config import get_config -from onedal.utils import _sycl_queue_manager as QM + @enable_array_api @control_n_jobs(decorated_methods=["fit", "predict", "kneighbors", "score"]) @@ -131,9 +132,9 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): return_distance=return_distance, ) - def _onedal_fit(self, X, y, queue=None): + def _onedal_fit(self, X, y, queue=None): xp, _ = get_namespace(X, y) - + # Validation step (follows PCA pattern) if not get_config()["use_raw_input"]: X, y = validate_data( @@ -149,7 +150,7 @@ def _onedal_fit(self, X, y, queue=None): else: # SPMD mode: 
skip validation but still set effective metric self._set_effective_metric() - + # Process regression targets before passing to onedal self._process_regression_targets(y) @@ -167,7 +168,7 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator.effective_metric_ = self.effective_metric_ self._onedal_estimator.effective_metric_params_ = self.effective_metric_params_ self._onedal_estimator._shape = self._shape - + # Reshape _y for GPU backend queue_instance = QM.get_global_queue() gpu_device = queue_instance is not None and queue_instance.sycl_device.is_gpu @@ -177,7 +178,7 @@ def _onedal_fit(self, X, y, queue=None): self._onedal_estimator._y = self._y self._onedal_estimator.fit(X, y, queue=queue) - + # Post-processing: save attributes and reshape _y self._save_attributes() if y is not None: From 4077898aec4926aef20470541611fb5260bf31e1 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Sat, 25 Oct 2025 23:50:16 -0700 Subject: [PATCH 79/87] fix: fix metric value --- sklearnex/neighbors/common.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index ac05d775ad..561258cf82 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -350,6 +350,16 @@ def _set_effective_metric(self): self.effective_metric_params_["p"] = effective_p self.effective_metric_ = self.metric + + # Convert sklearn metric aliases to canonical names for oneDAL compatibility + metric_aliases = { + "cityblock": "manhattan", + "l1": "manhattan", + "l2": "euclidean", + } + if self.metric in metric_aliases: + self.effective_metric_ = metric_aliases[self.metric] + # For minkowski distance, use more efficient methods where available if self.metric == "minkowski": p = self.effective_metric_params_["p"] From 767bd210059a2886dd05ead93e3c0a872b2ac236 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Sun, 26 Oct 2025 18:19:39 -0700 Subject: [PATCH 80/87] fix: stability test --- sklearnex/neighbors/knn_regression.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 1313e28bba..35fc064c5c 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -143,7 +143,6 @@ def _onedal_fit(self, X, y, queue=None): y, dtype=[xp.float64, xp.float32], accept_sparse="csr", - multi_output=True, ) # Set effective metric after validation self._set_effective_metric() From 9dd8c001e2a52630cd6389f4fbdd2c238c067cda Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Sun, 26 Oct 2025 22:48:09 -0700 Subject: [PATCH 81/87] fix: test --- sklearnex/neighbors/common.py | 16 ++++++++++------ sklearnex/neighbors/knn_regression.py | 4 +++- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index 561258cf82..510096532a 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -29,6 +29,7 @@ from daal4py.sklearn._n_jobs_support import control_n_jobs from daal4py.sklearn._utils import sklearn_check_version from onedal._device_offload import _transfer_to_host +from onedal.utils._array_api import _is_numpy_namespace from onedal.utils.validation import ( _check_array, _check_classification_targets, @@ -142,13 +143,16 @@ def _compute_weighted_prediction(self, neigh_dist, neigh_ind, weights_param, y_t ) # Shape: (n_samples, n_neighbors, n_outputs) y_pred = xp.mean(gathered, axis=1) else: - # Create y_pred with proper device/queue by using zeros_like 
pattern - # This ensures device compatibility in SPMD mode + # Create y_pred array - matches original onedal implementation using empty() + # For Array API arrays (dpctl/dpnp), pass device parameter to match input device + # For numpy arrays, device parameter is not supported and not needed y_pred_shape = (neigh_ind.shape[0], _y.shape[1]) - # Create on same device as neigh_ind to ensure queue compatibility - y_pred = xp.zeros( - y_pred_shape, dtype=xp.float64, device=getattr(neigh_ind, "device", None) - ) + if not _is_numpy_namespace(xp): + # Array API: pass device to ensure same device as input + y_pred = xp.empty(y_pred_shape, dtype=xp.float64, device=neigh_ind.device) + else: + # Numpy: no device parameter + y_pred = xp.empty(y_pred_shape, dtype=xp.float64) denom = xp.sum(weights, axis=1) for j in range(_y.shape[1]): diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 35fc064c5c..a78491e971 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -135,7 +135,7 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): def _onedal_fit(self, X, y, queue=None): xp, _ = get_namespace(X, y) - # Validation step (follows PCA pattern) + # Validation step if not get_config()["use_raw_input"]: X, y = validate_data( self, @@ -143,6 +143,8 @@ def _onedal_fit(self, X, y, queue=None): y, dtype=[xp.float64, xp.float32], accept_sparse="csr", + multi_output=True, + y_numeric=True, ) # Set effective metric after validation self._set_effective_metric() From 11d560f769db27ce91ed0d108b6bd87808a22de9 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Sun, 26 Oct 2025 23:31:03 -0700 Subject: [PATCH 82/87] fix: fix patching error --- sklearnex/neighbors/knn_regression.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index a78491e971..e54063b1d2 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -144,7 +144,6 @@ def _onedal_fit(self, X, y, queue=None): dtype=[xp.float64, xp.float32], accept_sparse="csr", multi_output=True, - y_numeric=True, ) # Set effective metric after validation self._set_effective_metric() From 44fd2c37d85c005b21146687b1d675b43c8393cf Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 27 Oct 2025 12:30:05 -0700 Subject: [PATCH 83/87] fix: spmd preduct --- sklearnex/spmd/neighbors/neighbors.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sklearnex/spmd/neighbors/neighbors.py b/sklearnex/spmd/neighbors/neighbors.py index 485d48e955..d333f4530a 100644 --- a/sklearnex/spmd/neighbors/neighbors.py +++ b/sklearnex/spmd/neighbors/neighbors.py @@ -30,6 +30,17 @@ class KNeighborsClassifier(base_KNeighborsClassifier): class KNeighborsRegressor(base_KNeighborsRegressor): _onedal_estimator = staticmethod(onedal_KNeighborsRegressor) + def _onedal_predict(self, X, queue=None): + """Override to always use GPU path in SPMD mode. + + SPMD KNN regression always trains on GPU (creating regression.model), + so we must always use the GPU prediction path even with weights='distance'. + The parent class would dispatch to CPU/SKL path for weights='distance', + which would call infer_search() expecting search.model, causing type mismatch. 
+ """ + # Always use GPU path - call parent's _predict_gpu directly + return self._predict_gpu(X, queue=queue) + class NearestNeighbors(base_NearestNeighbors): _onedal_estimator = staticmethod(onedal_NearestNeighbors) From 2eb6cf82aaa03c3b577a507588b7187efa7d273a Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 27 Oct 2025 16:12:20 -0700 Subject: [PATCH 84/87] fix: validate y for regressor --- sklearnex/neighbors/knn_regression.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index e54063b1d2..3c37513bea 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -135,7 +135,7 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): def _onedal_fit(self, X, y, queue=None): xp, _ = get_namespace(X, y) - # Validation step + # Validation step - validates and converts dtypes to float32/float64 if not get_config()["use_raw_input"]: X, y = validate_data( self, @@ -144,6 +144,7 @@ def _onedal_fit(self, X, y, queue=None): dtype=[xp.float64, xp.float32], accept_sparse="csr", multi_output=True, + y_numeric=True, # Ensures y dtype conversion for regressors (int8/16, uint8/16, float16 -> float32/64) ) # Set effective metric after validation self._set_effective_metric() @@ -151,7 +152,7 @@ def _onedal_fit(self, X, y, queue=None): # SPMD mode: skip validation but still set effective metric self._set_effective_metric() - # Process regression targets before passing to onedal + # Process regression targets before passing to onedal (uses validated y) self._process_regression_targets(y) # Call onedal backend @@ -177,6 +178,7 @@ def _onedal_fit(self, X, y, queue=None): else: self._onedal_estimator._y = self._y + # Pass validated X and y to onedal (after validate_data converted dtypes) self._onedal_estimator.fit(X, y, queue=queue) # Post-processing: save attributes and reshape _y From 0eb8229f89171ce1f7db6be67614b08a661acd5f Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Mon, 27 Oct 2025 23:52:30 -0700 Subject: [PATCH 85/87] test: try regressor without ynumric but verify it ouside validate dat --- sklearnex/neighbors/knn_regression.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 3c37513bea..f2fa69bdf6 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -144,8 +144,16 @@ def _onedal_fit(self, X, y, queue=None): dtype=[xp.float64, xp.float32], accept_sparse="csr", multi_output=True, - y_numeric=True, # Ensures y dtype conversion for regressors (int8/16, uint8/16, float16 -> float32/64) + # Note: y_numeric=True causes issues with Array API (no dtype.kind attribute) + # We handle y dtype conversion manually below ) + + # Convert y dtype if needed (handles int8/16, uint8/16, float16 -> float32/64) + # This is needed for regressors to ensure y is in the correct dtype + target_dtypes = [xp.float64, xp.float32] + if y.dtype not in target_dtypes: + y = xp.asarray(y, dtype=target_dtypes[0]) + # Set effective metric after validation self._set_effective_metric() else: From d0751d947034a17e6fc8f11d1a0bd76a2ca030ec Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 28 Oct 2025 12:43:08 -0700 Subject: [PATCH 86/87] fix: foloow ridge patten ensure y numberic requrie ksnearln >=1.5 --- sklearnex/neighbors/knn_regression.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git 
a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index f2fa69bdf6..1a1760af9d 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -34,7 +34,7 @@ from .common import KNeighborsDispatchingBase -@enable_array_api +@enable_array_api("1.5") # validate_data y_numeric requires sklearn >=1.5 @control_n_jobs(decorated_methods=["fit", "predict", "kneighbors", "score"]) class KNeighborsRegressor(KNeighborsDispatchingBase, _sklearn_KNeighborsRegressor): __doc__ = _sklearn_KNeighborsRegressor.__doc__ @@ -144,16 +144,9 @@ def _onedal_fit(self, X, y, queue=None): dtype=[xp.float64, xp.float32], accept_sparse="csr", multi_output=True, - # Note: y_numeric=True causes issues with Array API (no dtype.kind attribute) - # We handle y dtype conversion manually below + y_numeric=True, ) - # Convert y dtype if needed (handles int8/16, uint8/16, float16 -> float32/64) - # This is needed for regressors to ensure y is in the correct dtype - target_dtypes = [xp.float64, xp.float32] - if y.dtype not in target_dtypes: - y = xp.asarray(y, dtype=target_dtypes[0]) - # Set effective metric after validation self._set_effective_metric() else: From 7966c97ac58f29b69e7495b279860c90cba26a72 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Tue, 28 Oct 2025 17:29:40 -0700 Subject: [PATCH 87/87] fix: test without manual convertion --- sklearnex/neighbors/knn_regression.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 1a1760af9d..7df1c184ce 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -151,6 +151,7 @@ def _onedal_fit(self, X, y, queue=None): self._set_effective_metric() else: # SPMD mode: skip validation but still set effective metric + # Note: SPMD tests provide data in correct dtype, no conversion needed self._set_effective_metric() # Process regression targets before passing to onedal (uses validated y)
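
The attribute checks changed throughout this series hinge on the difference between hasattr() and an instance __dict__ lookup once _onedal_estimator can also exist as a class-level staticmethod, as in the SPMD subclasses added in PATCH 72. The following standalone sketch is not part of the patch series; it uses hypothetical class names and a fake backend purely to show why only the __dict__ check distinguishes "fitted" from "not fitted" for the SPMD variants.

# Minimal sketch (hypothetical names, not part of the patches): hasattr() also
# finds class-level attributes, so it would treat an unfitted SPMD estimator as
# already trained, while the __dict__ check only sees the attribute that fit()
# assigns on the instance.


class _FakeOnedalEstimator:
    """Stand-in for the oneDAL backend estimator class."""


class BatchKNN:
    def fit(self):
        # The trained backend model is stored as an *instance* attribute.
        self._onedal_estimator = _FakeOnedalEstimator()
        return self

    def is_trained_via_hasattr(self):
        # Misfires for SPMD subclasses: True even before fit().
        return hasattr(self, "_onedal_estimator")

    def is_trained_via_dict(self):
        # Only instance attributes live in self.__dict__, so this is accurate.
        return "_onedal_estimator" in self.__dict__


class SpmdKNN(BatchKNN):
    # SPMD variant binds the backend class at class level, as in PATCH 72.
    _onedal_estimator = staticmethod(_FakeOnedalEstimator)


if __name__ == "__main__":
    est = SpmdKNN()
    print(est.is_trained_via_hasattr())  # True  -> wrongly reads as "trained"
    print(est.is_trained_via_dict())     # False -> correctly "not trained yet"
    est.fit()
    print(est.is_trained_via_dict())     # True after fit()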