Skip to content

Commit 9dda937

Browse files
committed
fix: first round of refactor: move preprocessing functions to sklearnex
1 parent 0e8b4c6 commit 9dda937

File tree

6 files changed

+263
-213
lines changed

6 files changed

+263
-213
lines changed

onedal/neighbors/neighbors.py

Lines changed: 23 additions & 204 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
# ==============================================================================
1616

1717
from abc import ABCMeta, abstractmethod
18-
from numbers import Integral
1918

2019
import numpy as np
2120

@@ -28,14 +27,7 @@
2827
from ..common._mixin import ClassifierMixin, RegressorMixin
2928
from ..datatypes import from_table, to_table
3029
from ..utils._array_api import _get_sycl_namespace
31-
from ..utils.validation import (
32-
_check_array,
33-
_check_classification_targets,
34-
_check_n_features,
35-
_check_X_y,
36-
_column_or_1d,
37-
_num_samples,
38-
)
30+
from ..utils.validation import _num_samples
3931

4032

4133
class NeighborsCommonBase(metaclass=ABCMeta):
@@ -50,23 +42,6 @@ def __init__(self):
5042
self.effective_metric_params_ = None
5143
self._onedal_model = None
5244

53-
def _parse_auto_method(self, method, n_samples, n_features):
54-
result_method = method
55-
56-
if method in ["auto", "ball_tree"]:
57-
condition = (
58-
self.n_neighbors is not None and self.n_neighbors >= n_samples // 2
59-
)
60-
if self.metric == "precomputed" or n_features > 15 or condition:
61-
result_method = "brute"
62-
else:
63-
if self.metric == "euclidean":
64-
result_method = "kd_tree"
65-
else:
66-
result_method = "brute"
67-
68-
return result_method
69-
7045
@abstractmethod
7146
def train(self, *args, **kwargs): ...
7247

@@ -76,66 +51,6 @@ def infer(self, *args, **kwargs): ...
7651
@abstractmethod
7752
def _onedal_fit(self, X, y): ...
7853

79-
def _validate_data(
80-
self, X, y=None, reset=True, validate_separately=None, **check_params
81-
):
82-
if y is None:
83-
if self.requires_y:
84-
raise ValueError(
85-
f"This {self.__class__.__name__} estimator "
86-
f"requires y to be passed, but the target y is None."
87-
)
88-
X = _check_array(X, **check_params)
89-
out = X, y
90-
else:
91-
if validate_separately:
92-
# We need this because some estimators validate X and y
93-
# separately, and in general, separately calling _check_array()
94-
# on X and y isn't equivalent to just calling _check_X_y()
95-
# :(
96-
check_X_params, check_y_params = validate_separately
97-
X = _check_array(X, **check_X_params)
98-
y = _check_array(y, **check_y_params)
99-
else:
100-
X, y = _check_X_y(X, y, **check_params)
101-
out = X, y
102-
103-
if check_params.get("ensure_2d", True):
104-
_check_n_features(self, X, reset=reset)
105-
106-
return out
107-
108-
def _get_weights(self, dist, weights):
109-
if weights in (None, "uniform"):
110-
return None
111-
if weights == "distance":
112-
# if user attempts to classify a point that was zero distance from one
113-
# or more training points, those training points are weighted as 1.0
114-
# and the other points as 0.0
115-
if dist.dtype is np.dtype(object):
116-
for point_dist_i, point_dist in enumerate(dist):
117-
# check if point_dist is iterable
118-
# (ex: RadiusNeighborClassifier.predict may set an element of
119-
# dist to 1e-6 to represent an 'outlier')
120-
if hasattr(point_dist, "__contains__") and 0.0 in point_dist:
121-
dist[point_dist_i] = point_dist == 0.0
122-
else:
123-
dist[point_dist_i] = 1.0 / point_dist
124-
else:
125-
with np.errstate(divide="ignore"):
126-
dist = 1.0 / dist
127-
inf_mask = np.isinf(dist)
128-
inf_row = np.any(inf_mask, axis=1)
129-
dist[inf_row] = inf_mask[inf_row]
130-
return dist
131-
elif callable(weights):
132-
return weights(dist)
133-
else:
134-
raise ValueError(
135-
"weights not recognized: should be 'uniform', "
136-
"'distance', or a callable function"
137-
)
138-
13954
def _get_onedal_params(self, X, y=None, n_neighbors=None):
14055
class_count = 0 if self.classes_ is None else len(self.classes_)
14156
weights = getattr(self, "weights", "uniform")
@@ -145,8 +60,18 @@ def _get_onedal_params(self, X, y=None, n_neighbors=None):
14560
p = 2.0
14661
else:
14762
p = self.p
63+
64+
# Handle different input types for dtype
65+
try:
66+
fptype = X.dtype
67+
except AttributeError:
68+
# Inputs without a dtype attribute (e.g. pandas DataFrames) fall back to float64
69+
import numpy as np
70+
71+
fptype = np.float64
72+
14873
return {
149-
"fptype": X.dtype,
74+
"fptype": fptype,
15075
"vote_weights": "uniform" if weights == "uniform" else "distance",
15176
"method": self._fit_method,
15277
"radius": self.radius,
@@ -176,21 +101,6 @@ def __init__(
176101
self.p = p
177102
self.metric_params = metric_params
178103

179-
def _validate_targets(self, y, dtype):
180-
arr = _column_or_1d(y, warn=True)
181-
182-
try:
183-
return arr.astype(dtype, copy=False)
184-
except ValueError:
185-
return arr
186-
187-
def _validate_n_classes(self):
188-
length = 0 if self.classes_ is None else len(self.classes_)
189-
if length < 2:
190-
raise ValueError(
191-
f"The number of classes has to be greater than one; got {length}"
192-
)
193-
194104
def _fit(self, X, y):
195105
self._onedal_model = None
196106
self._tree = None
@@ -202,13 +112,8 @@ def _fit(self, X, y):
202112
)
203113

204114
_, xp, _ = _get_sycl_namespace(X)
205-
use_raw_input = _get_config().get("use_raw_input", False) is True
206115
if y is not None or self.requires_y:
207116
shape = getattr(y, "shape", None)
208-
if not use_raw_input:
209-
X, y = super()._validate_data(
210-
X, y, dtype=[np.float64, np.float32], accept_sparse="csr"
211-
)
212117
self._shape = shape if shape is not None else y.shape
213118

214119
if _is_classifier(self):
@@ -218,7 +123,6 @@ def _fit(self, X, y):
218123
else:
219124
self.outputs_2d_ = True
220125

221-
_check_classification_targets(y)
222126
self.classes_ = []
223127
self._y = np.empty(y.shape, dtype=int)
224128
for k in range(self._y.shape[1]):
@@ -228,36 +132,19 @@ def _fit(self, X, y):
228132
if not self.outputs_2d_:
229133
self.classes_ = self.classes_[0]
230134
self._y = self._y.ravel()
231-
232-
self._validate_n_classes()
233135
else:
234136
self._y = y
235-
elif not use_raw_input:
236-
X, _ = super()._validate_data(X, dtype=[np.float64, np.float32])
237137

238138
self.n_samples_fit_ = X.shape[0]
239139
self.n_features_in_ = X.shape[1]
240140
self._fit_X = X
241141

242-
if self.n_neighbors is not None:
243-
if self.n_neighbors <= 0:
244-
raise ValueError("Expected n_neighbors > 0. Got %d" % self.n_neighbors)
245-
if not isinstance(self.n_neighbors, Integral):
246-
raise TypeError(
247-
"n_neighbors does not take %s value, "
248-
"enter integer value" % type(self.n_neighbors)
249-
)
250-
251-
self._fit_method = super()._parse_auto_method(
252-
self.algorithm, self.n_samples_fit_, self.n_features_in_
253-
)
254-
255142
_fit_y = None
256143
queue = QM.get_global_queue()
257144
gpu_device = queue is not None and queue.sycl_device.is_gpu
258145

259146
if _is_classifier(self) or (_is_regressor(self) and gpu_device):
260-
_fit_y = self._validate_targets(self._y, X.dtype).reshape((-1, 1))
147+
_fit_y = y.astype(X.dtype).reshape((-1, 1)) if y is not None else None
261148
result = self._onedal_fit(X, _fit_y)
262149

263150
if y is not None and _is_regressor(self):
@@ -269,38 +156,22 @@ def _fit(self, X, y):
269156
return result
270157

271158
def _kneighbors(self, X=None, n_neighbors=None, return_distance=True):
272-
n_features = getattr(self, "n_features_in_", None)
273-
shape = getattr(X, "shape", None)
274-
if n_features and shape and len(shape) > 1 and shape[1] != n_features:
275-
raise ValueError(
276-
(
277-
f"X has {X.shape[1]} features, "
278-
f"but kneighbors is expecting "
279-
f"{n_features} features as input"
280-
)
281-
)
282-
283159
_check_is_fitted(self)
284160

285161
if n_neighbors is None:
286162
n_neighbors = self.n_neighbors
287-
elif n_neighbors <= 0:
288-
raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors)
289-
else:
290-
if not isinstance(n_neighbors, Integral):
291-
raise TypeError(
292-
"n_neighbors does not take %s value, "
293-
"enter integer value" % type(n_neighbors)
294-
)
295163

296164
if X is not None:
297165
query_is_train = False
166+
<<<<<<< HEAD
298167
<<<<<<< HEAD
299168
if not use_raw_input:
300169
X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32])
301170
=======
302171
X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32])
303172
>>>>>>> e003b37f (fix: try it again)
173+
=======
174+
>>>>>>> 8cd6f2b2 (fix: first round of refactor move preprocssing function to sklearnex)
304175
else:
305176
query_is_train = True
306177
X = self._fit_X
@@ -309,24 +180,12 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True):
309180
n_neighbors += 1
310181

311182
n_samples_fit = self.n_samples_fit_
312-
if n_neighbors > n_samples_fit:
313-
if query_is_train:
314-
n_neighbors -= 1 # ok to modify inplace because an error is raised
315-
inequality_str = "n_neighbors < n_samples_fit"
316-
else:
317-
inequality_str = "n_neighbors <= n_samples_fit"
318-
raise ValueError(
319-
f"Expected {inequality_str}, but "
320-
f"n_neighbors = {n_neighbors}, n_samples_fit = {n_samples_fit}, "
321-
f"n_samples = {X.shape[0]}" # include n_samples for common tests
322-
)
323183

324184
chunked_results = None
325-
method = self._parse_auto_method(
326-
self._fit_method, self.n_samples_fit_, n_features
327-
)
185+
# Use the fit method determined at sklearnex level
186+
method = getattr(self, "_fit_method", "brute")
328187

329-
params = super()._get_onedal_params(X, n_neighbors=n_neighbors)
188+
params = self._get_onedal_params(X, n_neighbors=n_neighbors)
330189
prediction_results = self._onedal_predict(self._onedal_model, X, params)
331190
distances = from_table(prediction_results.distances)
332191
indices = from_table(prediction_results.indices)
@@ -434,30 +293,9 @@ def fit(self, X, y, queue=None):
434293

435294
@supports_queue
436295
def predict(self, X, queue=None):
437-
use_raw_input = _get_config().get("use_raw_input", False) is True
438-
if not use_raw_input:
439-
X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32])
440296
onedal_model = getattr(self, "_onedal_model", None)
441-
n_features = getattr(self, "n_features_in_", None)
442-
n_samples_fit_ = getattr(self, "n_samples_fit_", None)
443-
shape = getattr(X, "shape", None)
444-
if n_features and shape and len(shape) > 1 and shape[1] != n_features:
445-
raise ValueError(
446-
(
447-
f"X has {X.shape[1]} features, "
448-
f"but KNNClassifier is expecting "
449-
f"{n_features} features as input"
450-
)
451-
)
452-
453297
_check_is_fitted(self)
454298

455-
self._fit_method = self._parse_auto_method(
456-
self.algorithm, n_samples_fit_, n_features
457-
)
458-
459-
self._validate_n_classes()
460-
461299
params = self._get_onedal_params(X)
462300
prediction_result = self._onedal_predict(onedal_model, X, params)
463301
responses = from_table(prediction_result.responses)
@@ -477,9 +315,8 @@ def predict_proba(self, X, queue=None):
477315

478316
n_queries = _num_samples(X)
479317

480-
weights = self._get_weights(neigh_dist, self.weights)
481-
if weights is None:
482-
weights = np.ones_like(neigh_ind)
318+
# NOTE: uniform weights only for now; self.weights is not applied here. Weight calculation should be done at the sklearnex level
319+
weights = np.ones_like(neigh_ind)
483320

484321
all_rows = np.arange(n_queries)
485322
probabilities = []
@@ -580,28 +417,9 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None)
580417
return self._kneighbors(X, n_neighbors, return_distance)
581418

582419
def _predict_gpu(self, X):
583-
use_raw_input = _get_config().get("use_raw_input", False) is True
584-
if not use_raw_input:
585-
X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32])
586420
onedal_model = getattr(self, "_onedal_model", None)
587-
n_features = getattr(self, "n_features_in_", None)
588-
n_samples_fit_ = getattr(self, "n_samples_fit_", None)
589-
shape = getattr(X, "shape", None)
590-
if n_features and shape and len(shape) > 1 and shape[1] != n_features:
591-
raise ValueError(
592-
(
593-
f"X has {X.shape[1]} features, "
594-
f"but KNNClassifier is expecting "
595-
f"{n_features} features as input"
596-
)
597-
)
598-
599421
_check_is_fitted(self)
600422

601-
self._fit_method = self._parse_auto_method(
602-
self.algorithm, n_samples_fit_, n_features
603-
)
604-
605423
params = self._get_onedal_params(X)
606424

607425
prediction_result = self._onedal_predict(onedal_model, X, params)
@@ -613,7 +431,8 @@ def _predict_gpu(self, X):
613431
def _predict_skl(self, X):
614432
neigh_dist, neigh_ind = self.kneighbors(X)
615433

616-
weights = self._get_weights(neigh_dist, self.weights)
434+
# NOTE: uniform weights only for now; self.weights is not applied here. Weight calculation should be done at the sklearnex level
435+
weights = None
617436

618437
_y = self._y
619438
if _y.ndim == 1:

sklearnex/neighbors/_lof.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,18 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True):
152152
check_is_fitted(self)
153153
if X is not None:
154154
check_feature_names(self, X, reset=False)
155+
# Perform preprocessing at sklearnex level
156+
import numpy as np
157+
158+
from onedal.utils.validation import _check_array
159+
160+
X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32])
161+
self._validate_feature_count(X, "kneighbors")
162+
163+
# Validate n_neighbors
164+
if n_neighbors is not None:
165+
self._validate_n_neighbors(n_neighbors)
166+
155167
return dispatch(
156168
self,
157169
"kneighbors",
@@ -186,4 +198,4 @@ def score_samples(self, X):
186198
return -np.mean(lrd_ratios_array, axis=1)
187199

188200
fit.__doc__ = _sklearn_LocalOutlierFactor.fit.__doc__
189-
kneighbors.__doc__ = _sklearn_LocalOutlierFactor.kneighbors.__doc__
201+
kneighbors.__doc__ = _sklearn_LocalOutlierFactor.kneighbors.__doc__

0 commit comments

Comments
 (0)