Add non-time-based anomaly detection models; initial release with Isolation Forest and OneClassSVM

codeloop · codeloop · commit b85fbfa039a0 · 2024-06-30T16:42:59.000Z
diff --git a/ads/opctl/operator/lowcode/anomaly/const.py b/ads/opctl/operator/lowcode/anomaly/const.py
@@ -16,6 +16,14 @@ class SupportedModels(str, metaclass=ExtendedEnumMeta):
     Auto = "auto"
     # TODS = "tods"
 
+class NonTimeADSupportedModels(str, metaclass=ExtendedEnumMeta):
+    """Supported non time-based anomaly detection models."""
+
+    # Values are the model names a user selects via `spec.model`.
+    OneClassSVM = "oneclasssvm"
+    IsolationForest = "isolationforest"
+    # TODO: add DBScan (`DBScan = "dbscan"`) once its operator model exists.
+
 
 class TODSSubModels(str, metaclass=ExtendedEnumMeta):
     """Supported TODS sub models."""
diff --git a/ads/opctl/operator/lowcode/anomaly/model/anomaly_dataset.py b/ads/opctl/operator/lowcode/anomaly/model/anomaly_dataset.py
@@ -84,17 +84,21 @@ def get_inliers_by_cat(self, category: str, data: pd.DataFrame):
         scores = self.get_scores_by_cat(category)
         inlier_indices = anomaly.index[anomaly[OutputColumns.ANOMALY_COL] == 0]
         inliers = data.iloc[inlier_indices]
-        if scores is not None and not scores.empty:
+        if scores is not None and not scores.empty and self.date_column != "index":
             inliers = pd.merge(inliers, scores, on=self.date_column, how="inner")
+        else:
+            inliers = pd.merge(inliers, anomaly, left_index=True, right_index=True, how="inner")
         return inliers
 
     def get_outliers_by_cat(self, category: str, data: pd.DataFrame):
         anomaly = self.get_anomalies_by_cat(category)
         scores = self.get_scores_by_cat(category)
         outliers_indices = anomaly.index[anomaly[OutputColumns.ANOMALY_COL] == 1]
         outliers = data.iloc[outliers_indices]
-        if scores is not None and not scores.empty:
+        if scores is not None and not scores.empty and self.date_column != "index":
             outliers = pd.merge(outliers, scores, on=self.date_column, how="inner")
+        else:
+            outliers = pd.merge(outliers, anomaly, left_index=True, right_index=True, how="inner")
         return outliers
 
     def get_inliers(self, datasets):
diff --git a/ads/opctl/operator/lowcode/anomaly/model/base_model.py b/ads/opctl/operator/lowcode/anomaly/model/base_model.py
@@ -25,7 +25,7 @@
     write_data,
 )
 from .anomaly_dataset import AnomalyDatasets, AnomalyOutput, TestData
-from ..const import SupportedModels
+from ..const import NonTimeADSupportedModels, SupportedModels
 from ..operator_config import AnomalyOperatorConfig, AnomalyOperatorSpec
 
 
@@ -61,7 +61,8 @@ def generate_report(self):
         try:
             anomaly_output = self._build_model()
         except Exception as e:
-            anomaly_output = self._fallback_build_model()
+            if self.spec.datetime_column:
+                anomaly_output = self._fallback_build_model()
 
         elapsed_time = time.time() - start_time
 
@@ -79,7 +80,9 @@ def generate_report(self):
             for col, df in self.datasets.full_data_dict.items()
         ]
         data_table = rc.Select(blocks=table_blocks)
-        date_column = self.spec.datetime_column.name
+        date_column = (
+            self.spec.datetime_column.name if self.spec.datetime_column else "index"
+        )
 
         blocks = []
         for target, df in self.datasets.full_data_dict.items():
@@ -114,7 +117,7 @@ def generate_report(self):
                 rc.Text(f"You selected the **`{self.spec.model}`** model."),
                 rc.Text(
                     "Based on your dataset, you could have also selected "
-                    f"any of the models: `{'`, `'.join(SupportedModels.keys())}`."
+                    f"any of the models: `{'`, `'.join(SupportedModels.keys() if self.spec.datetime_column else NonTimeADSupportedModels.keys())}`."
                 ),
                 rc.Metric(
                     heading="Analysis was completed in ",
@@ -320,7 +323,9 @@ def _fallback_build_model(self):
             y_pred = np.vectorize(self.outlier_map.get)(
                 est.predict(df[self.spec.target_column].fillna(0).values.reshape(-1, 1))
             )
-            scores = est.score_samples(df[self.spec.target_column].fillna(0).values.reshape(-1, 1))
+            scores = est.score_samples(
+                df[self.spec.target_column].fillna(0).values.reshape(-1, 1)
+            )
 
             anomaly = pd.DataFrame(
                 {date_column: df[date_column], OutputColumns.ANOMALY_COL: y_pred}
diff --git a/ads/opctl/operator/lowcode/anomaly/model/factory.py b/ads/opctl/operator/lowcode/anomaly/model/factory.py
@@ -4,10 +4,12 @@
 # Copyright (c) 2023 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
-from ..const import SupportedModels
+from ..const import SupportedModels, NonTimeADSupportedModels
 from ..operator_config import AnomalyOperatorConfig
 from .automlx import AutoMLXOperatorModel
 from .autots import AutoTSOperatorModel
+from .oneclasssvm import OneClassSVMOperatorModel
+from .isolationforest import IsolationForestOperatorModel
 from ads.opctl.operator.lowcode.anomaly.utils import select_auto_model
 
 # from .tods import TODSOperatorModel
@@ -16,11 +18,24 @@
 
 
 class UnSupportedModelError(Exception):
-    def __init__(self, model_type: str):
-        super().__init__(
-            f"Model: `{model_type}` "
-            f"is not supported. Supported models: {SupportedModels.values}"
+    """Exception raised when the model is not supported.
+
+    Attributes:
+        operator_config (AnomalyOperatorConfig): The operator configuration.
+        model_type (str): The type of the unsupported model.
+    """
+
+    def __init__(self, operator_config: AnomalyOperatorConfig, model_type: str):
+        supported_models = (
+            SupportedModels.values
+            if operator_config.spec.datetime_column
+            else NonTimeADSupportedModels.values
         )
+        message = (
+            f"Model: `{model_type}` is not supported. "
+            f"Supported models: {supported_models}"
+        )
+        super().__init__(message)
 
 
 class AnomalyOperatorModelFactory:
@@ -34,6 +49,13 @@ class AnomalyOperatorModelFactory:
         SupportedModels.AutoTS: AutoTSOperatorModel,
     }
 
+    _NonTime_MAP = {
+        NonTimeADSupportedModels.OneClassSVM: OneClassSVMOperatorModel,
+        NonTimeADSupportedModels.IsolationForest: IsolationForestOperatorModel,
+        # TODO: Add DBScan model for non time based anomaly
+        # NonTimeADSupportedModels.DBScan: DBScanOperatorModel,
+    }
+
     @classmethod
     def get_model(
         cls, operator_config: AnomalyOperatorConfig, datasets: AnomalyDatasets
@@ -62,6 +84,12 @@ def get_model(
         model_type = operator_config.spec.model
         if model_type == "auto":
             model_type = select_auto_model(datasets, operator_config)
-        if model_type not in cls._MAP:
-            raise UnSupportedModelError(model_type)
-        return cls._MAP[model_type](config=operator_config, datasets=datasets)
+
+        model_map = (
+            cls._MAP if operator_config.spec.datetime_column else cls._NonTime_MAP
+        )
+
+        if model_type not in model_map:
+            raise UnSupportedModelError(operator_config, model_type)
+
+        return model_map[model_type](config=operator_config, datasets=datasets)
diff --git a/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py b/ads/opctl/operator/lowcode/anomaly/model/isolationforest.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*--
+
+# Copyright (c) 2023, 2024 Oracle and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+
+import numpy as np
+import pandas as pd
+
+from ads.common.decorator.runtime_dependency import runtime_dependency
+
+from .base_model import AnomalyOperatorBaseModel
+from .anomaly_dataset import AnomalyOutput
+from ads.opctl.operator.lowcode.anomaly.const import OutputColumns
+
+
+class IsolationForestOperatorModel(AnomalyOperatorBaseModel):
+    """Class representing Isolation Forest Anomaly Detection operator model."""
+
+    @runtime_dependency(
+        module="sklearn",
+        err_msg=(
+            "Please run `pip3 install scikit-learn` to "
+            "install the required dependencies for IsolationForest."
+        ),
+    )
+    def _build_model(self) -> AnomalyOutput:
+        """Fit one IsolationForest per dataset and collect labels and scores.
+
+        Returns:
+            AnomalyOutput: per-target anomaly flags (1: outlier, 0: inlier)
+            and the corresponding ``score_samples`` values.
+        """
+        from sklearn.ensemble import IsolationForest
+
+        model_kwargs = self.spec.model_kwargs
+        # map the output as per anomaly dataset class, 1: outlier, 0: inlier
+        self.outlier_map = {1: 0, -1: 1}
+
+        anomaly_output = AnomalyOutput(date_column="index")
+
+        for target, df in self.datasets.full_data_dict.items():
+            # Build the single-feature matrix once; fit/predict/score share it.
+            features = df[self.spec.target_column].values.reshape(-1, 1)
+            model = IsolationForest(**model_kwargs)
+            model.fit(features)
+            y_pred = np.vectorize(self.outlier_map.get)(model.predict(features))
+            scores = model.score_samples(features)
+
+            index_col = df.columns[0]
+            anomaly = pd.DataFrame(
+                {index_col: df[index_col], OutputColumns.ANOMALY_COL: y_pred}
+            ).reset_index(drop=True)
+            # NOTE(review): scores are keyed "index", anomalies by `index_col` - confirm.
+            score = pd.DataFrame(
+                {"index": df[index_col], OutputColumns.SCORE_COL: scores}
+            ).reset_index(drop=True)
+
+            anomaly_output.add_output(target, anomaly, score)
+
+        return anomaly_output
+
+    def _generate_report(self):
+        """Return the model description and extra report sections as a tuple."""
+        import report_creator as rc
+
+        other_sections = [
+            rc.Heading("Selected Models Overview", level=2),
+            rc.Text(
+                "The following tables provide information regarding the chosen model."
+            ),
+        ]
+
+        model_description = rc.Text(
+            "The Isolation Forest is an ensemble of “Isolation Trees” that “isolate” observations by recursive random partitioning"
+            " which can be represented by a tree structure. The number of splittings required to isolate a sample is lower for outliers and higher for inliers."
+        )
+
+        return model_description, other_sections
diff --git a/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py b/ads/opctl/operator/lowcode/anomaly/model/oneclasssvm.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*--
+
+# Copyright (c) 2023, 2024 Oracle and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+
+import numpy as np
+import pandas as pd
+
+from ads.common.decorator.runtime_dependency import runtime_dependency
+
+from .base_model import AnomalyOperatorBaseModel
+from .anomaly_dataset import AnomalyOutput
+from ads.opctl.operator.lowcode.anomaly.const import OutputColumns
+
+
+class OneClassSVMOperatorModel(AnomalyOperatorBaseModel):
+    """Class representing OneClassSVM Anomaly Detection operator model."""
+
+    @runtime_dependency(
+        module="sklearn",
+        err_msg=(
+            "Please run `pip3 install scikit-learn` to "
+            "install the required dependencies for OneClassSVM."
+        ),
+    )
+    def _build_model(self) -> AnomalyOutput:
+        """Fit one OneClassSVM per dataset and collect labels and scores.
+
+        Returns:
+            AnomalyOutput: per-target anomaly flags (1: outlier, 0: inlier)
+            and the corresponding ``score_samples`` values.
+        """
+        from sklearn.svm import OneClassSVM
+
+        model_kwargs = self.spec.model_kwargs
+        # map the output as per anomaly dataset class, 1: outlier, 0: inlier
+        self.outlier_map = {1: 0, -1: 1}
+
+        anomaly_output = AnomalyOutput(date_column="index")
+
+        for target, df in self.datasets.full_data_dict.items():
+            # Build the single-feature matrix once; fit/predict/score share it.
+            features = df[self.spec.target_column].values.reshape(-1, 1)
+            model = OneClassSVM(**model_kwargs)
+            model.fit(features)
+            y_pred = np.vectorize(self.outlier_map.get)(model.predict(features))
+            scores = model.score_samples(features)
+
+            index_col = df.columns[0]
+            anomaly = pd.DataFrame(
+                {index_col: df[index_col], OutputColumns.ANOMALY_COL: y_pred}
+            ).reset_index(drop=True)
+            # NOTE(review): scores are keyed "index", anomalies by `index_col` - confirm.
+            score = pd.DataFrame(
+                {"index": df[index_col], OutputColumns.SCORE_COL: scores}
+            ).reset_index(drop=True)
+
+            anomaly_output.add_output(target, anomaly, score)
+
+        return anomaly_output
+
+    def _generate_report(self):
+        """Return the model description and extra report sections as a tuple."""
+        import report_creator as rc
+
+        other_sections = [
+            rc.Heading("Selected Models Overview", level=2),
+            rc.Text(
+                "The following tables provide information regarding the chosen model."
+            ),
+        ]
+
+        model_description = rc.Text(
+            "The OneClassSVM model is an unsupervised outlier detection method that estimates the support of the training data. "
+            "It is best suited for novelty detection when the training set is not contaminated by outliers."
+        )
+
+        return model_description, other_sections
diff --git a/ads/opctl/operator/lowcode/anomaly/schema.yaml b/ads/opctl/operator/lowcode/anomaly/schema.yaml
@@ -29,7 +29,7 @@ spec:
     input_data:
       required: true
       type: dict
-      default: {"url": "data.csv"}
+      default: { "url": "data.csv" }
       meta:
         description: "The payload that the detector should evaluate."
       schema:
@@ -134,6 +134,9 @@ spec:
     datetime_column:
       type: dict
       required: true
+      default: {"name": "uid"}
+      meta:
+        description: "`datetime_column` is required for time series anomaly detection; only non time-based anomaly detection models can be run without `datetime_column`."
       schema:
         name:
           type: string
@@ -353,6 +356,8 @@ spec:
       allowed:
         - autots
         - auto
+        - oneclasssvm
+        - isolationforest
       meta:
         description: "The model to be used for anomaly detection"
 
diff --git a/ads/opctl/operator/lowcode/common/data.py b/ads/opctl/operator/lowcode/common/data.py
@@ -25,6 +25,7 @@ def __init__(self, spec: dict, name="input_data"):
         self.data = None
         self._data_dict = dict()
         self.name = name
+        self.spec = spec
         self.load_transform_ingest_data(spec)
 
     def get_raw_data_by_cat(self, category):
@@ -36,7 +37,7 @@ def get_raw_data_by_cat(self, category):
             for col, val in mapping[category].items():
                 condition &= (self.raw_data[col] == val)
         data_by_cat = self.raw_data[condition].reset_index(drop=True)
-        data_by_cat = self._data_transformer._format_datetime_col(data_by_cat)
+        data_by_cat = self._data_transformer._format_datetime_col(data_by_cat) if self.spec.datetime_column else data_by_cat
         return data_by_cat
 
 
diff --git a/ads/opctl/operator/lowcode/common/transformations.py b/ads/opctl/operator/lowcode/common/transformations.py