Skip to content

Commit acae51e

Browse files
Initial commit: Anomaly detection service with Isolation Forest and Random Cut Forest
0 parents  commit acae51e

File tree

15 files changed

+1540
-0
lines changed

15 files changed

+1540
-0
lines changed

.env.example

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# AWS credentials (required for Random Cut Forest)
2+
AWS_ACCESS_KEY_ID=your_access_key_id
3+
AWS_SECRET_ACCESS_KEY=your_secret_access_key
4+
AWS_REGION=us-west-2
5+
6+
# API settings
7+
API_HOST=0.0.0.0
8+
API_PORT=8000
9+
DEBUG=True
10+
11+
# Model settings
12+
DEFAULT_MODEL=isolation_forest
13+
DEFAULT_CONTAMINATION=0.1

.gitignore

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# Python
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
*.so
6+
.Python
7+
build/
8+
develop-eggs/
9+
dist/
10+
downloads/
11+
eggs/
12+
.eggs/
13+
lib/
14+
lib64/
15+
parts/
16+
sdist/
17+
var/
18+
wheels/
19+
*.egg-info/
20+
.installed.cfg
21+
*.egg
22+
23+
# Virtual Environment
24+
.venv/
25+
venv/
26+
ENV/
27+
28+
# IDE
29+
.idea/
30+
.vscode/
31+
*.swp
32+
*.swo
33+
34+
# Environment variables
35+
.env
36+
37+
# Data
38+
data/*.csv
39+
!data/synthetic_data.csv
40+
41+
# Logs
42+
*.log
43+
44+
# Coverage
45+
.coverage
46+
htmlcov/
47+
48+
# Jupyter Notebook
49+
.ipynb_checkpoints
50+
51+
# pytest
52+
.pytest_cache/

README.md

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# Anomaly Detection Service
2+
3+
A Python-based service for detecting anomalies in time series data using Random Cut Forest and Isolation Forest algorithms.
4+
5+
## Features
6+
7+
- REST API endpoints for training and inference
8+
- Support for multiple anomaly detection algorithms:
9+
  - Random Cut Forest (AWS)
10+
  - Isolation Forest
11+
- Integration with NAB (Numenta Anomaly Benchmark) datasets
12+
- Easy-to-use training and prediction interfaces
13+
14+
## Project Structure
15+
16+
```bash
17+
anomaly_detection/
18+
├── api/ # FastAPI endpoints
19+
├── core/ # Core anomaly detection logic
20+
├── data/ # Data processing and loading
21+
├── models/ # Model implementations
22+
├── utils/ # Utility functions
23+
└── tests/ # Test suite
24+
```
25+
26+
## Setup
27+
28+
1. Create a virtual environment:
29+
30+
```bash
31+
python -m venv ./.venv
32+
source ./.venv/bin/activate # On Windows: .venv\Scripts\activate
33+
```
34+
35+
2. Install dependencies:
36+
37+
```bash
38+
pip install -r requirements.txt
39+
```
40+
41+
3. Set up environment variables:
42+
43+
```bash
44+
cp .env.example .env
45+
# Edit .env with your AWS credentials if using RCF
46+
```
47+
48+
## Usage
49+
50+
1. Start the API server:
51+
52+
```bash
53+
uvicorn anomaly_detection.api.main:app --reload
54+
```
55+
56+
2. Train a model:
57+
58+
```bash
59+
curl -X POST "http://localhost:8000/train" -H "Content-Type: application/json" -d '{"algorithm": "isolation_forest", "data_path": "path/to/data.csv"}'
60+
```
61+
62+
3. Make predictions:
63+
64+
```bash
65+
curl -X POST "http://localhost:8000/predict" -H "Content-Type: application/json" -d '{"algorithm": "isolation_forest", "data": [1.2, 2.3, 3.4]}'
66+
```
67+
68+
## Testing
69+
70+
Run tests with coverage:
71+
72+
```bash
73+
pytest --cov=anomaly_detection tests/
74+
```
75+
76+
## License
77+
78+
MIT

anomaly_detection/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
"""
2+
Anomaly Detection Service
3+
4+
A Python-based service for detecting anomalies in time series data using
5+
Random Cut Forest and Isolation Forest algorithms.
6+
"""
7+
8+
__version__ = "0.1.0"

anomaly_detection/api/main.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
from fastapi import FastAPI, HTTPException
2+
from pydantic import BaseModel
3+
from typing import List, Optional
4+
import numpy as np
5+
6+
# ASGI application object for the service. The keyword arguments below are the
# metadata handed to FastAPI (title, description, version).
# NOTE(review): the package __init__ declares __version__ = "0.1.0" while the
# API advertises "1.0.0" -- confirm which one is intended.
app = FastAPI(
    title="Anomaly Detection Service",
    description="A service for detecting anomalies in time series data",
    version="1.0.0",
)
11+
12+
class TrainingRequest(BaseModel):
    # Request body accepted by the POST /train endpoint.
    # (Documented with comments rather than a docstring so the generated
    # schema description stays exactly as it was.)

    # Algorithm identifier, e.g. "isolation_forest" in the README example.
    algorithm: str
    # Location of the training data, e.g. a CSV path in the README example.
    data_path: str
    # Optional free-form settings; presumably forwarded to the model
    # constructor once training is implemented -- TODO confirm.
    parameters: Optional[dict] = None
16+
17+
class PredictionRequest(BaseModel):
    # Request body accepted by the POST /predict endpoint.
    # (Documented with comments rather than a docstring so the generated
    # schema description stays exactly as it was.)

    # Algorithm identifier, e.g. "isolation_forest" in the README example.
    algorithm: str
    # Flat list of numeric observations to score.
    data: List[float]
    # Optional identifier of a trained model; presumably the "model_id"
    # returned by /train -- TODO confirm once prediction is implemented.
    model_id: Optional[str] = None
21+
22+
class PredictionResponse(BaseModel):
    # Response body returned by the POST /predict endpoint.
    # (Documented with comments rather than a docstring so the generated
    # schema description stays exactly as it was.)

    # Verdict for the submitted data.
    is_anomaly: bool
    # Anomaly score reported alongside the verdict.
    score: float
    # Threshold reported alongside the score.
    threshold: float
26+
27+
@app.post("/train")
async def train_model(request: TrainingRequest):
    """
    Train an anomaly detection model using the specified algorithm and data.
    """
    # Training is still a stub: the request is accepted but not used yet, and
    # a fixed placeholder model id is returned.
    try:
        # TODO: Implement training logic
        return {"status": "success", "model_id": "model_123"}
    except HTTPException:
        # Deliberate HTTP errors (for example a future 4xx for bad input) must
        # keep their own status code instead of being rewrapped as a 500.
        raise
    except Exception as e:
        # Anything else is reported as an internal server error; chain the
        # cause so the original traceback is preserved in logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
37+
38+
@app.post("/predict")
async def predict(request: PredictionRequest) -> PredictionResponse:
    """
    Make predictions using a trained anomaly detection model.
    """
    # Prediction is still a stub: the request is accepted but not used yet,
    # and a fixed placeholder response is returned.
    try:
        # TODO: Implement prediction logic
        return PredictionResponse(
            is_anomaly=False,
            score=0.5,
            threshold=0.7,
        )
    except HTTPException:
        # Deliberate HTTP errors (for example a future 404 for an unknown
        # model) must keep their own status code instead of becoming a 500.
        raise
    except Exception as e:
        # Anything else is reported as an internal server error; chain the
        # cause so the original traceback is preserved in logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
52+
53+
@app.get("/health")
async def health_check():
    """
    Health check endpoint.
    """
    # Static payload: nothing is probed, the handler only confirms that the
    # application is able to answer requests.
    status_payload = {"status": "healthy"}
    return status_payload
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import pandas as pd
2+
import numpy as np
3+
from typing import Tuple, Optional
4+
import os
5+
6+
class NABLoader:
    """Loader for NAB (Numenta Anomaly Benchmark) datasets.

    Datasets are CSV files stored directly in ``data_dir``. Optional label
    files are looked up in the ``labels`` sub-directory of ``data_dir``.
    """

    def __init__(self, data_dir: str = "data"):
        """Remember the directory that holds the dataset CSV files."""
        self.data_dir = data_dir

    def load_dataset(self, dataset_name: str) -> Tuple[np.ndarray, Optional[np.ndarray]]:
        """
        Load a NAB dataset and return features and labels.

        Args:
            dataset_name: File name of the dataset inside ``data_dir``, as
                returned by ``get_available_datasets`` (e.g. ``"foo.csv"``).

        Returns:
            Tuple containing:
            - features: numpy array built from every column after the first
              (the first column is assumed to be the timestamp)
            - labels: numpy array taken from the second column of the labels
              file, or None if no labels file exists

        Raises:
            FileNotFoundError: If the dataset file does not exist.
        """
        dataset_path = os.path.join(self.data_dir, dataset_name)

        if not os.path.exists(dataset_path):
            raise FileNotFoundError(f"Dataset {dataset_name} not found in {self.data_dir}")

        df = pd.read_csv(dataset_path)

        # Extract features (assuming first column is timestamp)
        features = df.iloc[:, 1:].values

        # Labels are optional and live in a separate file.
        labels = None
        labels_path = self._find_labels_path(dataset_name)
        if labels_path is not None:
            labels_df = pd.read_csv(labels_path)
            labels = labels_df.iloc[:, 1].values

        return features, labels

    def _find_labels_path(self, dataset_name: str) -> Optional[str]:
        """Return the path of the labels file for a dataset, or None."""
        labels_dir = os.path.join(self.data_dir, "labels")

        # The historical name keeps the dataset's extension
        # ("foo.csv_labels.csv") and is tried first for backward
        # compatibility; the extension-less form ("foo_labels.csv") is the
        # fallback so conventionally named label files are found as well.
        candidates = [f"{dataset_name}_labels.csv"]
        stem = os.path.splitext(dataset_name)[0]
        if stem != dataset_name:
            candidates.append(f"{stem}_labels.csv")

        for candidate in candidates:
            candidate_path = os.path.join(labels_dir, candidate)
            if os.path.exists(candidate_path):
                return candidate_path
        return None

    def get_available_datasets(self) -> list:
        """Get the sorted list of CSV file names in the data directory."""
        # os.listdir returns entries in arbitrary order; sort so callers get
        # a deterministic listing.
        return sorted(f for f in os.listdir(self.data_dir) if f.endswith('.csv'))

anomaly_detection/models/base.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
from abc import ABC, abstractmethod
2+
import numpy as np
3+
from typing import Dict, Any, Optional
4+
5+
class AnomalyDetector(ABC):
    """Base class for anomaly detection models.

    Subclasses implement ``fit`` and ``predict``. ``predict`` returns one
    anomaly score per sample, and ``is_anomaly`` flags the samples whose
    score is strictly greater than ``threshold``.
    """

    def __init__(self, **kwargs):
        """Store the constructor keyword arguments as the model parameters."""
        # Underlying estimator; left for subclasses to assign.
        self.model = None
        # Decision threshold; unset until set_threshold() is called or a
        # subclass assigns it.
        self.threshold = None
        self.parameters = kwargs

    @abstractmethod
    def fit(self, X: np.ndarray) -> None:
        """Fit the model to the training data."""
        pass

    @abstractmethod
    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict anomaly scores for the input data."""
        pass

    def is_anomaly(self, X: np.ndarray) -> np.ndarray:
        """Determine if samples are anomalies based on the threshold.

        Raises:
            ValueError: If no threshold has been set yet.
        """
        # Fail with a clear message instead of the opaque comparison error
        # that comparing scores against None would produce.
        if self.threshold is None:
            raise ValueError(
                "Anomaly threshold is not set; call fit() or set_threshold() first"
            )
        scores = self.predict(X)
        return scores > self.threshold

    def set_threshold(self, threshold: float) -> None:
        """Set the anomaly detection threshold."""
        self.threshold = threshold

    def get_parameters(self) -> Dict[str, Any]:
        """Get the model parameters."""
        return self.parameters
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from typing import Dict, Type
2+
from .base import AnomalyDetector
3+
from .isolation_forest import IsolationForestDetector
4+
from .random_cut_forest import RandomCutForestDetector
5+
6+
class ModelFactory:
    """Registry-backed factory that builds anomaly detectors by name."""

    # Maps each public model-type string to the detector class behind it.
    _models: Dict[str, Type[AnomalyDetector]] = {
        "isolation_forest": IsolationForestDetector,
        "random_cut_forest": RandomCutForestDetector,
    }

    @classmethod
    def create_model(cls, model_type: str, **kwargs) -> AnomalyDetector:
        """
        Build a detector of the requested type.

        Args:
            model_type: Key identifying the detector class in the registry
            **kwargs: Keyword arguments handed to the detector constructor

        Returns:
            A newly constructed detector instance

        Raises:
            ValueError: If ``model_type`` is not a registered key
        """
        detector_cls = cls._models.get(model_type)
        if detector_cls is None:
            raise ValueError(f"Unsupported model type: {model_type}")

        return detector_cls(**kwargs)

    @classmethod
    def get_supported_models(cls) -> list:
        """Return the registered model-type keys as a list."""
        return [model_name for model_name in cls._models]
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from sklearn.ensemble import IsolationForest
2+
import numpy as np
3+
from .base import AnomalyDetector
4+
5+
class IsolationForestDetector(AnomalyDetector):
    """Isolation Forest based anomaly detector.

    Scores returned by ``predict`` are the negated ``score_samples`` values
    of the underlying estimator, so a higher score means a more abnormal
    sample. The threshold set by ``fit`` lives on that same scale.
    """

    def __init__(self, contamination: float = 0.1, **kwargs):
        """
        Build the underlying IsolationForest estimator.

        Args:
            contamination: Expected proportion of anomalies in the data.
                The threshold computation in ``fit`` assumes a numeric value.
            **kwargs: Extra keyword arguments forwarded to IsolationForest.
        """
        super().__init__(**kwargs)
        self.contamination = contamination
        # random_state defaults to 42 but can be overridden by the caller;
        # passing it both explicitly and through **kwargs would raise a
        # duplicate-keyword TypeError.
        forest_kwargs = {"random_state": 42, **kwargs}
        self.model = IsolationForest(
            contamination=contamination,
            **forest_kwargs
        )

    def fit(self, X: np.ndarray) -> None:
        """Fit the Isolation Forest model and derive the decision threshold."""
        self.model.fit(X)
        # The threshold must be on the same scale as predict() (negated
        # scores, higher = more abnormal) because is_anomaly() compares
        # predict(X) > threshold. Roughly the top `contamination` fraction of
        # the training scores lies above the (1 - contamination) percentile.
        scores = self.predict(X)
        self.threshold = np.percentile(scores, 100.0 - 100.0 * self.contamination)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict anomaly scores (higher means more abnormal)."""
        return -self.model.score_samples(X)  # Convert to positive scores

0 commit comments

Comments
 (0)