Skip to content

Commit 8cd6f2b

Browse files
committed
fix: first round of refactor move preprocessing function to sklearnex
1 parent 6ceb3f5 commit 8cd6f2b

File tree

6 files changed

+277
-211
lines changed

6 files changed

+277
-211
lines changed

onedal/neighbors/neighbors.py

Lines changed: 20 additions & 205 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
# ==============================================================================
1616

1717
from abc import ABCMeta, abstractmethod
18-
from numbers import Integral
1918

2019
import numpy as np
2120

@@ -28,14 +27,7 @@
2827
from ..common._mixin import ClassifierMixin, RegressorMixin
2928
from ..datatypes import from_table, to_table
3029
from ..utils._array_api import _get_sycl_namespace
31-
from ..utils.validation import (
32-
_check_array,
33-
_check_classification_targets,
34-
_check_n_features,
35-
_check_X_y,
36-
_column_or_1d,
37-
_num_samples,
38-
)
30+
from ..utils.validation import _num_samples
3931

4032

4133
class NeighborsCommonBase(metaclass=ABCMeta):
@@ -50,23 +42,6 @@ def __init__(self):
5042
self.effective_metric_params_ = None
5143
self._onedal_model = None
5244

53-
def _parse_auto_method(self, method, n_samples, n_features):
54-
result_method = method
55-
56-
if method in ["auto", "ball_tree"]:
57-
condition = (
58-
self.n_neighbors is not None and self.n_neighbors >= n_samples // 2
59-
)
60-
if self.metric == "precomputed" or n_features > 15 or condition:
61-
result_method = "brute"
62-
else:
63-
if self.metric == "euclidean":
64-
result_method = "kd_tree"
65-
else:
66-
result_method = "brute"
67-
68-
return result_method
69-
7045
@abstractmethod
7146
def train(self, *args, **kwargs): ...
7247

@@ -76,66 +51,6 @@ def infer(self, *args, **kwargs): ...
7651
@abstractmethod
7752
def _onedal_fit(self, X, y): ...
7853

79-
def _validate_data(
80-
self, X, y=None, reset=True, validate_separately=None, **check_params
81-
):
82-
if y is None:
83-
if self.requires_y:
84-
raise ValueError(
85-
f"This {self.__class__.__name__} estimator "
86-
f"requires y to be passed, but the target y is None."
87-
)
88-
X = _check_array(X, **check_params)
89-
out = X, y
90-
else:
91-
if validate_separately:
92-
# We need this because some estimators validate X and y
93-
# separately, and in general, separately calling _check_array()
94-
# on X and y isn't equivalent to just calling _check_X_y()
95-
# :(
96-
check_X_params, check_y_params = validate_separately
97-
X = _check_array(X, **check_X_params)
98-
y = _check_array(y, **check_y_params)
99-
else:
100-
X, y = _check_X_y(X, y, **check_params)
101-
out = X, y
102-
103-
if check_params.get("ensure_2d", True):
104-
_check_n_features(self, X, reset=reset)
105-
106-
return out
107-
108-
def _get_weights(self, dist, weights):
109-
if weights in (None, "uniform"):
110-
return None
111-
if weights == "distance":
112-
# if user attempts to classify a point that was zero distance from one
113-
# or more training points, those training points are weighted as 1.0
114-
# and the other points as 0.0
115-
if dist.dtype is np.dtype(object):
116-
for point_dist_i, point_dist in enumerate(dist):
117-
# check if point_dist is iterable
118-
# (ex: RadiusNeighborClassifier.predict may set an element of
119-
# dist to 1e-6 to represent an 'outlier')
120-
if hasattr(point_dist, "__contains__") and 0.0 in point_dist:
121-
dist[point_dist_i] = point_dist == 0.0
122-
else:
123-
dist[point_dist_i] = 1.0 / point_dist
124-
else:
125-
with np.errstate(divide="ignore"):
126-
dist = 1.0 / dist
127-
inf_mask = np.isinf(dist)
128-
inf_row = np.any(inf_mask, axis=1)
129-
dist[inf_row] = inf_mask[inf_row]
130-
return dist
131-
elif callable(weights):
132-
return weights(dist)
133-
else:
134-
raise ValueError(
135-
"weights not recognized: should be 'uniform', "
136-
"'distance', or a callable function"
137-
)
138-
13954
def _get_onedal_params(self, X, y=None, n_neighbors=None):
14055
class_count = 0 if self.classes_ is None else len(self.classes_)
14156
weights = getattr(self, "weights", "uniform")
@@ -145,8 +60,18 @@ def _get_onedal_params(self, X, y=None, n_neighbors=None):
14560
p = 2.0
14661
else:
14762
p = self.p
63+
64+
# Handle different input types for dtype
65+
try:
66+
fptype = X.dtype
67+
except AttributeError:
68+
# For pandas DataFrames or other types without dtype attribute
69+
import numpy as np
70+
71+
fptype = np.float64
72+
14873
return {
149-
"fptype": X.dtype,
74+
"fptype": fptype,
15075
"vote_weights": "uniform" if weights == "uniform" else "distance",
15176
"method": self._fit_method,
15277
"radius": self.radius,
@@ -176,21 +101,6 @@ def __init__(
176101
self.p = p
177102
self.metric_params = metric_params
178103

179-
def _validate_targets(self, y, dtype):
180-
arr = _column_or_1d(y, warn=True)
181-
182-
try:
183-
return arr.astype(dtype, copy=False)
184-
except ValueError:
185-
return arr
186-
187-
def _validate_n_classes(self):
188-
length = 0 if self.classes_ is None else len(self.classes_)
189-
if length < 2:
190-
raise ValueError(
191-
f"The number of classes has to be greater than one; got {length}"
192-
)
193-
194104
def _fit(self, X, y):
195105
self._onedal_model = None
196106
self._tree = None
@@ -202,13 +112,8 @@ def _fit(self, X, y):
202112
)
203113

204114
_, xp, _ = _get_sycl_namespace(X)
205-
use_raw_input = _get_config().get("use_raw_input", False) is True
206115
if y is not None or self.requires_y:
207116
shape = getattr(y, "shape", None)
208-
if not use_raw_input:
209-
X, y = super()._validate_data(
210-
X, y, dtype=[np.float64, np.float32], accept_sparse="csr"
211-
)
212117
self._shape = shape if shape is not None else y.shape
213118

214119
if _is_classifier(self):
@@ -218,7 +123,6 @@ def _fit(self, X, y):
218123
else:
219124
self.outputs_2d_ = True
220125

221-
_check_classification_targets(y)
222126
self.classes_ = []
223127
self._y = np.empty(y.shape, dtype=int)
224128
for k in range(self._y.shape[1]):
@@ -228,36 +132,19 @@ def _fit(self, X, y):
228132
if not self.outputs_2d_:
229133
self.classes_ = self.classes_[0]
230134
self._y = self._y.ravel()
231-
232-
self._validate_n_classes()
233135
else:
234136
self._y = y
235-
elif not use_raw_input:
236-
X, _ = super()._validate_data(X, dtype=[np.float64, np.float32])
237137

238138
self.n_samples_fit_ = X.shape[0]
239139
self.n_features_in_ = X.shape[1]
240140
self._fit_X = X
241141

242-
if self.n_neighbors is not None:
243-
if self.n_neighbors <= 0:
244-
raise ValueError("Expected n_neighbors > 0. Got %d" % self.n_neighbors)
245-
if not isinstance(self.n_neighbors, Integral):
246-
raise TypeError(
247-
"n_neighbors does not take %s value, "
248-
"enter integer value" % type(self.n_neighbors)
249-
)
250-
251-
self._fit_method = super()._parse_auto_method(
252-
self.algorithm, self.n_samples_fit_, self.n_features_in_
253-
)
254-
255142
_fit_y = None
256143
queue = QM.get_global_queue()
257144
gpu_device = queue is not None and queue.sycl_device.is_gpu
258145

259146
if _is_classifier(self) or (_is_regressor(self) and gpu_device):
260-
_fit_y = self._validate_targets(self._y, X.dtype).reshape((-1, 1))
147+
_fit_y = y.astype(X.dtype).reshape((-1, 1)) if y is not None else None
261148
result = self._onedal_fit(X, _fit_y)
262149

263150
if y is not None and _is_regressor(self):
@@ -269,33 +156,13 @@ def _fit(self, X, y):
269156
return result
270157

271158
def _kneighbors(self, X=None, n_neighbors=None, return_distance=True):
272-
n_features = getattr(self, "n_features_in_", None)
273-
shape = getattr(X, "shape", None)
274-
if n_features and shape and len(shape) > 1 and shape[1] != n_features:
275-
raise ValueError(
276-
(
277-
f"X has {X.shape[1]} features, "
278-
f"but kneighbors is expecting "
279-
f"{n_features} features as input"
280-
)
281-
)
282-
283159
_check_is_fitted(self)
284160

285161
if n_neighbors is None:
286162
n_neighbors = self.n_neighbors
287-
elif n_neighbors <= 0:
288-
raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors)
289-
else:
290-
if not isinstance(n_neighbors, Integral):
291-
raise TypeError(
292-
"n_neighbors does not take %s value, "
293-
"enter integer value" % type(n_neighbors)
294-
)
295163

296164
if X is not None:
297165
query_is_train = False
298-
X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32])
299166
else:
300167
query_is_train = True
301168
X = self._fit_X
@@ -304,24 +171,12 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True):
304171
n_neighbors += 1
305172

306173
n_samples_fit = self.n_samples_fit_
307-
if n_neighbors > n_samples_fit:
308-
if query_is_train:
309-
n_neighbors -= 1 # ok to modify inplace because an error is raised
310-
inequality_str = "n_neighbors < n_samples_fit"
311-
else:
312-
inequality_str = "n_neighbors <= n_samples_fit"
313-
raise ValueError(
314-
f"Expected {inequality_str}, but "
315-
f"n_neighbors = {n_neighbors}, n_samples_fit = {n_samples_fit}, "
316-
f"n_samples = {X.shape[0]}" # include n_samples for common tests
317-
)
318174

319175
chunked_results = None
320-
method = self._parse_auto_method(
321-
self._fit_method, self.n_samples_fit_, n_features
322-
)
176+
# Use the fit method determined at sklearnex level
177+
method = getattr(self, "_fit_method", "brute")
323178

324-
params = super()._get_onedal_params(X, n_neighbors=n_neighbors)
179+
params = self._get_onedal_params(X, n_neighbors=n_neighbors)
325180
prediction_results = self._onedal_predict(self._onedal_model, X, params)
326181
distances = from_table(prediction_results.distances)
327182
indices = from_table(prediction_results.indices)
@@ -429,30 +284,9 @@ def fit(self, X, y, queue=None):
429284

430285
@supports_queue
431286
def predict(self, X, queue=None):
432-
use_raw_input = _get_config().get("use_raw_input", False) is True
433-
if not use_raw_input:
434-
X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32])
435287
onedal_model = getattr(self, "_onedal_model", None)
436-
n_features = getattr(self, "n_features_in_", None)
437-
n_samples_fit_ = getattr(self, "n_samples_fit_", None)
438-
shape = getattr(X, "shape", None)
439-
if n_features and shape and len(shape) > 1 and shape[1] != n_features:
440-
raise ValueError(
441-
(
442-
f"X has {X.shape[1]} features, "
443-
f"but KNNClassifier is expecting "
444-
f"{n_features} features as input"
445-
)
446-
)
447-
448288
_check_is_fitted(self)
449289

450-
self._fit_method = self._parse_auto_method(
451-
self.algorithm, n_samples_fit_, n_features
452-
)
453-
454-
self._validate_n_classes()
455-
456290
params = self._get_onedal_params(X)
457291
prediction_result = self._onedal_predict(onedal_model, X, params)
458292
responses = from_table(prediction_result.responses)
@@ -472,9 +306,8 @@ def predict_proba(self, X, queue=None):
472306

473307
n_queries = _num_samples(X)
474308

475-
weights = self._get_weights(neigh_dist, self.weights)
476-
if weights is None:
477-
weights = np.ones_like(neigh_ind)
309+
# Use uniform weights for now - weights calculation should be done at sklearnex level
310+
weights = np.ones_like(neigh_ind)
478311

479312
all_rows = np.arange(n_queries)
480313
probabilities = []
@@ -575,28 +408,9 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None)
575408
return self._kneighbors(X, n_neighbors, return_distance)
576409

577410
def _predict_gpu(self, X):
578-
use_raw_input = _get_config().get("use_raw_input", False) is True
579-
if not use_raw_input:
580-
X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32])
581411
onedal_model = getattr(self, "_onedal_model", None)
582-
n_features = getattr(self, "n_features_in_", None)
583-
n_samples_fit_ = getattr(self, "n_samples_fit_", None)
584-
shape = getattr(X, "shape", None)
585-
if n_features and shape and len(shape) > 1 and shape[1] != n_features:
586-
raise ValueError(
587-
(
588-
f"X has {X.shape[1]} features, "
589-
f"but KNNClassifier is expecting "
590-
f"{n_features} features as input"
591-
)
592-
)
593-
594412
_check_is_fitted(self)
595413

596-
self._fit_method = self._parse_auto_method(
597-
self.algorithm, n_samples_fit_, n_features
598-
)
599-
600414
params = self._get_onedal_params(X)
601415

602416
prediction_result = self._onedal_predict(onedal_model, X, params)
@@ -608,7 +422,8 @@ def _predict_gpu(self, X):
608422
def _predict_skl(self, X):
609423
neigh_dist, neigh_ind = self.kneighbors(X)
610424

611-
weights = self._get_weights(neigh_dist, self.weights)
425+
# Use uniform weights for now - weights calculation should be done at sklearnex level
426+
weights = None
612427

613428
_y = self._y
614429
if _y.ndim == 1:

sklearnex/neighbors/_lof.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,18 @@ def _kneighbors(self, X=None, n_neighbors=None, return_distance=True):
152152
check_is_fitted(self)
153153
if X is not None:
154154
check_feature_names(self, X, reset=False)
155+
# Perform preprocessing at sklearnex level
156+
import numpy as np
157+
158+
from onedal.utils.validation import _check_array
159+
160+
X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32])
161+
self._validate_feature_count(X, "kneighbors")
162+
163+
# Validate n_neighbors
164+
if n_neighbors is not None:
165+
self._validate_n_neighbors(n_neighbors)
166+
155167
return dispatch(
156168
self,
157169
"kneighbors",
@@ -186,4 +198,4 @@ def score_samples(self, X):
186198
return -np.mean(lrd_ratios_array, axis=1)
187199

188200
fit.__doc__ = _sklearn_LocalOutlierFactor.fit.__doc__
189-
kneighbors.__doc__ = _sklearn_LocalOutlierFactor.kneighbors.__doc__
201+
kneighbors.__doc__ = _sklearn_LocalOutlierFactor.kneighbors.__doc__

0 commit comments

Comments
 (0)