Skip to content

Commit b85fbfa

Browse files
committed
Add non-time-based anomaly detection models; initial release with Isolation Forest and OneClassSVM
1 parent 916a7f5 commit b85fbfa

File tree

9 files changed

+234
-22
lines changed

9 files changed

+234
-22
lines changed

ads/opctl/operator/lowcode/anomaly/const.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,14 @@ class SupportedModels(str, metaclass=ExtendedEnumMeta):
1616
Auto = "auto"
1717
# TODS = "tods"
1818

19+
class NonTimeADSupportedModels(str, metaclass=ExtendedEnumMeta):
    """Anomaly detection models that do not rely on a time dimension."""

    OneClassSVM = "oneclasssvm"
    IsolationForest = "isolationforest"
    # TODO: add DBScan (value "dbscan") once its operator model exists.
26+
1927

2028
class TODSSubModels(str, metaclass=ExtendedEnumMeta):
2129
"""Supported TODS sub models."""

ads/opctl/operator/lowcode/anomaly/model/anomaly_dataset.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,17 +84,21 @@ def get_inliers_by_cat(self, category: str, data: pd.DataFrame):
8484
scores = self.get_scores_by_cat(category)
8585
inlier_indices = anomaly.index[anomaly[OutputColumns.ANOMALY_COL] == 0]
8686
inliers = data.iloc[inlier_indices]
87-
if scores is not None and not scores.empty:
87+
if scores is not None and not scores.empty and self.date_column != "index":
8888
inliers = pd.merge(inliers, scores, on=self.date_column, how="inner")
89+
else:
90+
inliers = pd.merge(inliers, anomaly, left_index=True, right_index=True, how="inner")
8991
return inliers
9092

9193
def get_outliers_by_cat(self, category: str, data: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of ``data`` flagged as outliers for ``category``.

    Args:
        category: Key used to look up this category's anomaly and score frames.
        data: Input rows for the category; rows are selected positionally, so
            ``data`` is expected to be row-aligned with the anomaly frame.

    Returns:
        pd.DataFrame: The outlier rows. When ``date_column`` is ``"index"``
        they are joined with the anomaly frame on the row index; otherwise
        they are joined with the scores on ``date_column`` when scores are
        available, and returned un-merged when they are not.
    """
    anomaly = self.get_anomalies_by_cat(category)
    scores = self.get_scores_by_cat(category)
    outlier_positions = anomaly.index[anomaly[OutputColumns.ANOMALY_COL] == 1]
    outliers = data.iloc[outlier_positions]

    if self.date_column == "index":
        # No datetime column to join on: align on the positional row index.
        return pd.merge(
            outliers, anomaly, left_index=True, right_index=True, how="inner"
        )

    if scores is not None and not scores.empty:
        return pd.merge(outliers, scores, on=self.date_column, how="inner")

    # Datetime-based data without scores: return the rows as-is. Falling
    # through to the index merge here would join `anomaly` onto frames that
    # already carry the date column and duplicate it with _x/_y suffixes.
    # NOTE(review): get_inliers_by_cat has the same branch structure and
    # should receive the same guard.
    return outliers
99103

100104
def get_inliers(self, datasets):

ads/opctl/operator/lowcode/anomaly/model/base_model.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
write_data,
2626
)
2727
from .anomaly_dataset import AnomalyDatasets, AnomalyOutput, TestData
28-
from ..const import SupportedModels
28+
from ..const import NonTimeADSupportedModels, SupportedModels
2929
from ..operator_config import AnomalyOperatorConfig, AnomalyOperatorSpec
3030

3131

@@ -61,7 +61,8 @@ def generate_report(self):
6161
try:
6262
anomaly_output = self._build_model()
6363
except Exception as e:
64-
anomaly_output = self._fallback_build_model()
64+
if self.spec.datetime_column:
65+
anomaly_output = self._fallback_build_model()
6566

6667
elapsed_time = time.time() - start_time
6768

@@ -79,7 +80,9 @@ def generate_report(self):
7980
for col, df in self.datasets.full_data_dict.items()
8081
]
8182
data_table = rc.Select(blocks=table_blocks)
82-
date_column = self.spec.datetime_column.name
83+
date_column = (
84+
self.spec.datetime_column.name if self.spec.datetime_column else "index"
85+
)
8386

8487
blocks = []
8588
for target, df in self.datasets.full_data_dict.items():
@@ -114,7 +117,7 @@ def generate_report(self):
114117
rc.Text(f"You selected the **`{self.spec.model}`** model."),
115118
rc.Text(
116119
"Based on your dataset, you could have also selected "
117-
f"any of the models: `{'`, `'.join(SupportedModels.keys())}`."
120+
f"any of the models: `{'`, `'.join(SupportedModels.keys() if self.spec.datetime_column else NonTimeADSupportedModels.keys())}`."
118121
),
119122
rc.Metric(
120123
heading="Analysis was completed in ",
@@ -320,7 +323,9 @@ def _fallback_build_model(self):
320323
y_pred = np.vectorize(self.outlier_map.get)(
321324
est.predict(df[self.spec.target_column].fillna(0).values.reshape(-1, 1))
322325
)
323-
scores = est.score_samples(df[self.spec.target_column].fillna(0).values.reshape(-1, 1))
326+
scores = est.score_samples(
327+
df[self.spec.target_column].fillna(0).values.reshape(-1, 1)
328+
)
324329

325330
anomaly = pd.DataFrame(
326331
{date_column: df[date_column], OutputColumns.ANOMALY_COL: y_pred}

ads/opctl/operator/lowcode/anomaly/model/factory.py

Lines changed: 36 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@
44
# Copyright (c) 2023 Oracle and/or its affiliates.
55
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
66

7-
from ..const import SupportedModels
7+
from ..const import SupportedModels, NonTimeADSupportedModels
88
from ..operator_config import AnomalyOperatorConfig
99
from .automlx import AutoMLXOperatorModel
1010
from .autots import AutoTSOperatorModel
11+
from .oneclasssvm import OneClassSVMOperatorModel
12+
from .isolationforest import IsolationForestOperatorModel
1113
from ads.opctl.operator.lowcode.anomaly.utils import select_auto_model
1214

1315
# from .tods import TODSOperatorModel
@@ -16,11 +18,24 @@
1618

1719

1820
class UnSupportedModelError(Exception):
    """Exception raised when the requested model is not supported.

    The supported models quoted in the message depend on the operator spec:
    the time-series models when a ``datetime_column`` is configured, the
    non-time-based models otherwise.

    Args:
        operator_config (AnomalyOperatorConfig): The operator configuration.
        model_type (str): The type of the unsupported model.
    """

    def __init__(self, operator_config: AnomalyOperatorConfig, model_type: str):
        model_enum = (
            SupportedModels
            if operator_config.spec.datetime_column
            else NonTimeADSupportedModels
        )
        supported_models = model_enum.values
        # The sibling accessor is invoked as `keys()` elsewhere in this
        # package, so `values` is most likely a method too. Call it when it is
        # callable so the message lists model names, not a bound-method repr.
        if callable(supported_models):
            supported_models = supported_models()

        super().__init__(
            f"Model: `{model_type}` is not supported. "
            f"Supported models: {supported_models}"
        )
2439

2540

2641
class AnomalyOperatorModelFactory:
@@ -34,6 +49,13 @@ class AnomalyOperatorModelFactory:
3449
SupportedModels.AutoTS: AutoTSOperatorModel,
3550
}
3651

52+
_NonTime_MAP = {
53+
NonTimeADSupportedModels.OneClassSVM: OneClassSVMOperatorModel,
54+
NonTimeADSupportedModels.IsolationForest: IsolationForestOperatorModel,
55+
# TODO: Add DBScan model for non time based anomaly
56+
# NonTimeADSupportedModels.DBScan: DBScanOperatorModel,
57+
}
58+
3759
@classmethod
3860
def get_model(
3961
cls, operator_config: AnomalyOperatorConfig, datasets: AnomalyDatasets
@@ -62,6 +84,12 @@ def get_model(
6284
model_type = operator_config.spec.model
6385
if model_type == "auto":
6486
model_type = select_auto_model(datasets, operator_config)
65-
if model_type not in cls._MAP:
66-
raise UnSupportedModelError(model_type)
67-
return cls._MAP[model_type](config=operator_config, datasets=datasets)
87+
88+
model_map = (
89+
cls._MAP if operator_config.spec.datetime_column else cls._NonTime_MAP
90+
)
91+
92+
if model_type not in model_map:
93+
raise UnSupportedModelError(operator_config, model_type)
94+
95+
return model_map[model_type](config=operator_config, datasets=datasets)
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*--
3+
4+
# Copyright (c) 2023, 2024 Oracle and/or its affiliates.
5+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
6+
7+
import numpy as np
8+
import pandas as pd
9+
10+
from ads.common.decorator.runtime_dependency import runtime_dependency
11+
12+
from .base_model import AnomalyOperatorBaseModel
13+
from .anomaly_dataset import AnomalyOutput
14+
from ads.opctl.operator.lowcode.anomaly.const import OutputColumns
15+
16+
17+
class IsolationForestOperatorModel(AnomalyOperatorBaseModel):
    """Class representing IsolationForest Anomaly Detection operator model."""

    @runtime_dependency(
        module="sklearn",
        err_msg=(
            "Please run `pip3 install scikit-learn` to "
            "install the required dependencies for IsolationForest."
        ),
    )
    def _build_model(self) -> AnomalyOutput:
        """Fit one Isolation Forest per dataset and collect its predictions.

        Returns:
            AnomalyOutput: For every target, a frame of anomaly flags
            (1: outlier, 0: inlier) and a frame of ``score_samples`` values.
        """
        from sklearn.ensemble import IsolationForest

        # Guard against model_kwargs being None/unset so ** unpacking cannot fail.
        model_kwargs = self.spec.model_kwargs or {}
        # map the output as per anomaly dataset class, 1: outlier, 0: inlier
        self.outlier_map = {1: 0, -1: 1}

        anomaly_output = AnomalyOutput(date_column="index")

        for target, df in self.datasets.full_data_dict.items():
            # scikit-learn estimators take a 2-D array; build it once and
            # reuse it for fit, predict and score_samples.
            features = df[self.spec.target_column].values.reshape(-1, 1)

            model = IsolationForest(**model_kwargs)
            model.fit(features)
            y_pred = np.vectorize(self.outlier_map.get)(model.predict(features))
            scores = model.score_samples(features)

            # The first column of the frame is treated as the row identifier.
            index_col = df.columns[0]

            anomaly = pd.DataFrame(
                {index_col: df[index_col], OutputColumns.ANOMALY_COL: y_pred}
            ).reset_index(drop=True)
            score = pd.DataFrame(
                {"index": df[index_col], OutputColumns.SCORE_COL: scores}
            ).reset_index(drop=True)

            anomaly_output.add_output(target, anomaly, score)

        return anomaly_output

    def _generate_report(self):
        """Build the model-specific report sections.

        Returns:
            tuple: ``(model_description, other_sections)`` where
            ``model_description`` is a text block describing the model and
            ``other_sections`` is a list of additional report blocks.
        """
        import report_creator as rc

        other_sections = [
            rc.Heading("Selected Models Overview", level=2),
            rc.Text(
                "The following tables provide information regarding the chosen model."
            ),
        ]

        model_description = rc.Text(
            "The Isolation Forest is an ensemble of “Isolation Trees” that “isolate” observations by recursive random partitioning"
            " which can be represented by a tree structure. The number of splittings required to isolate a sample is lower for outliers and higher for inliers."
        )

        return (
            model_description,
            other_sections,
        )
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*--
3+
4+
# Copyright (c) 2023, 2024 Oracle and/or its affiliates.
5+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
6+
7+
import numpy as np
8+
import pandas as pd
9+
10+
from ads.common.decorator.runtime_dependency import runtime_dependency
11+
12+
from .base_model import AnomalyOperatorBaseModel
13+
from .anomaly_dataset import AnomalyOutput
14+
from ads.opctl.operator.lowcode.anomaly.const import OutputColumns
15+
16+
17+
class OneClassSVMOperatorModel(AnomalyOperatorBaseModel):
    """Class representing OneClassSVM Anomaly Detection operator model."""

    @runtime_dependency(
        module="sklearn",
        err_msg=(
            "Please run `pip3 install scikit-learn` to "
            "install the required dependencies for OneClassSVM."
        ),
    )
    def _build_model(self) -> AnomalyOutput:
        """Fit one One-Class SVM per dataset and collect its predictions.

        Returns:
            AnomalyOutput: For every target, a frame of anomaly flags
            (1: outlier, 0: inlier) and a frame of ``score_samples`` values.
        """
        from sklearn.svm import OneClassSVM

        # Guard against model_kwargs being None/unset so ** unpacking cannot fail.
        model_kwargs = self.spec.model_kwargs or {}
        # map the output as per anomaly dataset class, 1: outlier, 0: inlier
        self.outlier_map = {1: 0, -1: 1}

        anomaly_output = AnomalyOutput(date_column="index")

        for target, df in self.datasets.full_data_dict.items():
            # scikit-learn estimators take a 2-D array; build it once and
            # reuse it for fit, predict and score_samples.
            features = df[self.spec.target_column].values.reshape(-1, 1)

            model = OneClassSVM(**model_kwargs)
            model.fit(features)
            y_pred = np.vectorize(self.outlier_map.get)(model.predict(features))
            scores = model.score_samples(features)

            # The first column of the frame is treated as the row identifier.
            index_col = df.columns[0]

            anomaly = pd.DataFrame(
                {index_col: df[index_col], OutputColumns.ANOMALY_COL: y_pred}
            ).reset_index(drop=True)
            score = pd.DataFrame(
                {"index": df[index_col], OutputColumns.SCORE_COL: scores}
            ).reset_index(drop=True)

            anomaly_output.add_output(target, anomaly, score)

        return anomaly_output

    def _generate_report(self):
        """Build the model-specific report sections.

        Returns:
            tuple: ``(model_description, other_sections)`` where
            ``model_description`` is a text block describing the model and
            ``other_sections`` is a list of additional report blocks.
        """
        import report_creator as rc

        other_sections = [
            rc.Heading("Selected Models Overview", level=2),
            rc.Text(
                "The following tables provide information regarding the chosen model."
            ),
        ]

        # The previous text described the model as a "full-stack automated
        # machine learning system", which does not describe a One-Class SVM.
        model_description = rc.Text(
            "The One-Class SVM is an unsupervised outlier detection model that estimates the support of the data distribution "
            "and flags samples falling outside the learned boundary as anomalies. "
            "It is best suited for novelty detection when the training set is not contaminated by outliers."
        )

        return (
            model_description,
            other_sections,
        )

ads/opctl/operator/lowcode/anomaly/schema.yaml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ spec:
2929
input_data:
3030
required: true
3131
type: dict
32-
default: {"url": "data.csv"}
32+
default: { "url": "data.csv" }
3333
meta:
3434
description: "The payload that the detector should evaluate."
3535
schema:
@@ -134,6 +134,9 @@ spec:
134134
datetime_column:
135135
type: dict
136136
required: true
137+
default: {"name": "uid"}
138+
meta:
139+
description: "`datetime_column` is required for time series anomaly detection; only non-time-based anomaly detection models can be run without `datetime_column`."
137140
schema:
138141
name:
139142
type: string
@@ -353,6 +356,8 @@ spec:
353356
allowed:
354357
- autots
355358
- auto
359+
- oneclasssvm
360+
- isolationforest
356361
meta:
357362
description: "The model to be used for anomaly detection"
358363

ads/opctl/operator/lowcode/common/data.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ def __init__(self, spec: dict, name="input_data"):
2525
self.data = None
2626
self._data_dict = dict()
2727
self.name = name
28+
self.spec = spec
2829
self.load_transform_ingest_data(spec)
2930

3031
def get_raw_data_by_cat(self, category):
@@ -36,7 +37,7 @@ def get_raw_data_by_cat(self, category):
3637
for col, val in mapping[category].items():
3738
condition &= (self.raw_data[col] == val)
3839
data_by_cat = self.raw_data[condition].reset_index(drop=True)
39-
data_by_cat = self._data_transformer._format_datetime_col(data_by_cat)
40+
data_by_cat = self._data_transformer._format_datetime_col(data_by_cat) if self.spec.datetime_column else data_by_cat
4041
return data_by_cat
4142

4243

0 commit comments

Comments
 (0)