From 5c2eded23f1ef5fff65339745791d590d35ee0c4 Mon Sep 17 00:00:00 2001 From: Robert - Localstack Date: Wed, 3 Jul 2024 04:18:08 +0300 Subject: [PATCH 01/16] Add functioning boiler plate --- mwaa-etl-processor/Makefile | 38 +++++++++++++++++++ .../airflow-bucket/dags/etl_dag.py | 26 +++++++++++++ .../airflow-bucket/requirements.txt | 1 + mwaa-etl-processor/run.sh | 23 +++++++++++ 4 files changed, 88 insertions(+) create mode 100644 mwaa-etl-processor/Makefile create mode 100644 mwaa-etl-processor/airflow-bucket/dags/etl_dag.py create mode 100644 mwaa-etl-processor/airflow-bucket/requirements.txt create mode 100755 mwaa-etl-processor/run.sh diff --git a/mwaa-etl-processor/Makefile b/mwaa-etl-processor/Makefile new file mode 100644 index 0000000..979a9e5 --- /dev/null +++ b/mwaa-etl-processor/Makefile @@ -0,0 +1,38 @@ +export DEBUG=1 + +SHELL := /bin/bash + +usage: ## Show this help + @fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##//' + +install: ## Install dependencies + @which localstack || pip install localstack + @which awslocal || pip install awscli-local + +run: ## Run MWAA ETL Processor + ./run.sh + +start: ## Start LocalStack + @if [ -z `docker network ls --filter name=localstack -q` ]; then \ + echo "Creating localstack network"; \ + docker network create --attachable localstack; \ + fi + DOCKER_FLAGS='--network localstack --name=localhost.localstack.cloud' localstack start -d + +stop: ## Stop LocalStack + @echo + MAIN_CONTAINER_NAME=localhost.localstack.cloud localstack stop + docker rm -f $$(docker network inspect localstack | jq -r '.[0].Containers | keys[0]') + +ready: ## Wait for LocalStack to be ready + @echo Waiting on the LocalStack container... + @localstack wait -t 30 && echo Localstack is ready to use! || (echo Gave up waiting on LocalStack, exiting. 
&& exit 1) + +logs: ## Retrieve logs from LocalStack + @localstack logs > logs.txt + +test-ci: ## Run CI test + make start install ready run; return_code=`echo $$?`;\ + make logs; make stop; exit $$return_code; + +.PHONY: usage install start run stop ready logs test-ci diff --git a/mwaa-etl-processor/airflow-bucket/dags/etl_dag.py b/mwaa-etl-processor/airflow-bucket/dags/etl_dag.py new file mode 100644 index 0000000..131bf6b --- /dev/null +++ b/mwaa-etl-processor/airflow-bucket/dags/etl_dag.py @@ -0,0 +1,26 @@ +from airflow.decorators import dag, task +from airflow.utils.dates import days_ago +from airflow.providers.amazon.aws.hooks.s3 import S3Hook + +@dag(schedule_interval=None, start_date=days_ago(1), tags=['example']) +def example_s3_list_dag(): + + @task + def list_environment_variables(): + import os + for key, value in os.environ.items(): + print(f"Found env var: {key}={value}") + return dict(os.environ.items()) + + @task + def list_s3_bucket(env_items): + hook = S3Hook(aws_conn_id='aws_default') + bucket_name = 'airflow' # Replace with your S3 bucket name + keys = hook.list_keys(bucket_name) + for key in keys: + print(f"Found key: {key}") + return keys + + list_s3_bucket(list_environment_variables()) + +dag = example_s3_list_dag() diff --git a/mwaa-etl-processor/airflow-bucket/requirements.txt b/mwaa-etl-processor/airflow-bucket/requirements.txt new file mode 100644 index 0000000..72b587d --- /dev/null +++ b/mwaa-etl-processor/airflow-bucket/requirements.txt @@ -0,0 +1 @@ +apache-airflow[amazon]==2.8.1 diff --git a/mwaa-etl-processor/run.sh b/mwaa-etl-processor/run.sh new file mode 100755 index 0000000..d657660 --- /dev/null +++ b/mwaa-etl-processor/run.sh @@ -0,0 +1,23 @@ +#!/bin/bash +set -euxo pipefail + +# Create default AWS connection for MWAA environment +awslocal secretsmanager create-secret \ + --name airflow/connections/aws_default \ + --secret-string '{"conn_type": "aws", "login": "test", "password": "test", "extra": {"region_name": "us-east-1", "endpoint_url": "https://localhost.localstack.cloud:4566"}}' + +# Create MWAA environment with default AWS connection +awslocal s3 mb s3://airflow +awslocal mwaa create-environment \ + --name my-mwaa-env \ + --airflow-version 2.8.1 \ + --dag-s3-path /dags \ + --airflow-configuration-options '{"secrets.backend": "airflow.providers.amazon.aws.secrets.secrets_manager.SecretsManagerBackend", "secrets.backend_kwargs": "{\"connections_prefix\": \"airflow/connections/\", \"variables_prefix\": \"airflow/variables/\", \"endpoint_url\": \"https://localhost.localstack.cloud:4566\", \"aws_access_key_id\": \"test\", \"aws_secret_access_key\": \"test\", \"region_name\": \"us-east-1\"}"}' \ + --execution-role-arn arn:aws:iam::000000000000:role/airflow-role \ + --source-bucket-arn arn:aws:s3:::airflow \ + --network-configuration {} \ + --endpoint-url http://localhost.localstack.cloud:4566 + +# Upload DAG to MWAA environment +awslocal s3 cp --recursive airflow-bucket s3://airflow + From 0839ce959b101430fbdfd18e20ac8e617324929d Mon Sep 17 00:00:00 2001 From: Robert - Localstack Date: Fri, 5 Jul 2024 05:15:45 +0300 Subject: [PATCH 02/16] WIP model training sample in MWAA --- mwaa-etl-processor/Makefile | 2 +- .../airflow-bucket/dags/etl_dag.py | 26 ---- .../airflow-bucket/dags/train_and_deploy.py | 129 ++++++++++++++++++ .../airflow-bucket/requirements.txt | 4 + mwaa-etl-processor/proxy.conf | 10 ++ mwaa-etl-processor/run.sh | 6 +- 6 files changed, 149 insertions(+), 28 deletions(-) delete mode 100644 
mwaa-etl-processor/airflow-bucket/dags/etl_dag.py create mode 100644 mwaa-etl-processor/airflow-bucket/dags/train_and_deploy.py create mode 100644 mwaa-etl-processor/proxy.conf diff --git a/mwaa-etl-processor/Makefile b/mwaa-etl-processor/Makefile index 979a9e5..99deb48 100644 --- a/mwaa-etl-processor/Makefile +++ b/mwaa-etl-processor/Makefile @@ -17,7 +17,7 @@ start: ## Start LocalStack echo "Creating localstack network"; \ docker network create --attachable localstack; \ fi - DOCKER_FLAGS='--network localstack --name=localhost.localstack.cloud' localstack start -d + DOCKER_FLAGS='--network localstack --name=localhost.localstack.cloud -e GATEWAY_SERVER=hypercorn' localstack start -d stop: ## Stop LocalStack @echo diff --git a/mwaa-etl-processor/airflow-bucket/dags/etl_dag.py b/mwaa-etl-processor/airflow-bucket/dags/etl_dag.py deleted file mode 100644 index 131bf6b..0000000 --- a/mwaa-etl-processor/airflow-bucket/dags/etl_dag.py +++ /dev/null @@ -1,26 +0,0 @@ -from airflow.decorators import dag, task -from airflow.utils.dates import days_ago -from airflow.providers.amazon.aws.hooks.s3 import S3Hook - -@dag(schedule_interval=None, start_date=days_ago(1), tags=['example']) -def example_s3_list_dag(): - - @task - def list_environment_variables(): - import os - for key, value in os.environ.items(): - print(f"Found env var: {key}={value}") - return dict(os.environ.items()) - - @task - def list_s3_bucket(env_items): - hook = S3Hook(aws_conn_id='aws_default') - bucket_name = 'airflow' # Replace with your S3 bucket name - keys = hook.list_keys(bucket_name) - for key in keys: - print(f"Found key: {key}") - return keys - - list_s3_bucket(list_environment_variables()) - -dag = example_s3_list_dag() diff --git a/mwaa-etl-processor/airflow-bucket/dags/train_and_deploy.py b/mwaa-etl-processor/airflow-bucket/dags/train_and_deploy.py new file mode 100644 index 0000000..5b6e88d --- /dev/null +++ b/mwaa-etl-processor/airflow-bucket/dags/train_and_deploy.py @@ -0,0 +1,129 @@ +from airflow.decorators import dag, task +from airflow.utils.dates import days_ago +from airflow.models import Variable + +from pydantic import BaseModel +from typing import List, Dict, Optional + +import hashlib +import pandas as pd + +from sklearn.preprocessing import LabelEncoder +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import train_test_split +from sklearn.neighbors import KNeighborsClassifier +from sklearn import svm +from sklearn import metrics +from sklearn.tree import DecisionTreeClassifier + +class DatasetSpec(BaseModel): + url: str + name: str + feature_columns: List[str] + target_column: str + +@dag(schedule_interval=None, start_date=days_ago(1), tags=['example']) +def train_and_deploy_classifier_model(): + @task + def retrieve_dataset(): + # Retrieve the dataset from the Airflow Variable. + dataset_spec = Variable.get( + "dataset_spec", + deserialize_json=True, + ) + if dataset_spec is None: + raise ValueError("Dataset URL is not defined") + + try: + DatasetSpec(**dataset_spec) + except Exception as e: + raise ValueError(f"Invalid dataset spec: {e}") + + return dataset_spec + + @task + def read_dataset(dataset_spec): + dataset = DatasetSpec(**dataset_spec) + + # Read the dataset from the specified URL. + df = pd.read_csv(dataset.url) + print(df.head()) + + # Compute dataset ID. + dataset_id = hashlib.sha256(pd.util.hash_pandas_object(df, index=True).values).hexdigest() + print(f"Dataset ID: {dataset_id}") + + # Return the dataset and its ID. 
+ return { + "dataset": df.to_json(), + "dataset_id": dataset_id, + } + + @task + def train_model(dataset_spec: dict, dataset: dict, algorithm: str): + df_json: Dict = dataset["dataset"] + dataset_id: str = dataset["dataset_id"] + + dataset_spec: DatasetSpec = DatasetSpec(**dataset_spec) + data: pd.DataFrame = pd.DataFrame.from_dict(df_json) + + # Split the dataset into training and testing sets. + X_train, Y_train, X_test, Y_test = train_test_split(data, test_size=0.2) + X_train = X_train[dataset_spec.feature_columns] + X_test = X_test[dataset_spec.feature_columns] + Y_train = Y_train[dataset_spec.target_column] + Y_test = Y_test[dataset_spec.target_column] + + # Encode the target column. + label_encoder = LabelEncoder() + Y_train_encoded = label_encoder.fit_transform(Y_train) + Y_test_encoded = label_encoder.transform(Y_test) + Y_train = Y_train_encoded + Y_test = Y_test_encoded + + # Print the dataset information. + print(f"Feature columns: {dataset_spec.feature_columns}") + print(f"Target column: {dataset_spec.target_column}") + print(f"Train size: {len(X_train)}") + print(f"Test size: {len(X_test)}") + print(f"Training model using {algorithm} algorithm") + + # Train the model using the specified algorithm. + if algorithm == "SVM": + model = svm.SVC() + elif algorithm == "LogisticRegression": + model = LogisticRegression() + elif algorithm == "DecisionTreeClassifier": + model = DecisionTreeClassifier() + else: + raise ValueError(f"Unsupported algorithm: {algorithm}") + + # Train the model. + model.fit(X_train, Y_train) + + # Predict the target values. + Y_pred = model.predict(X_test) + + # Compute the accuracy of the model. + accuracy = metrics.accuracy_score(Y_test, Y_pred) + precision = metrics.precision_score(Y_test, Y_pred) + recall = metrics.recall_score(Y_test, Y_pred) + f1 = metrics.f1_score(Y_test, Y_pred) + conf_matrix = metrics.confusion_matrix(Y_test, Y_pred) + + # Print or log the evaluation metrics + print(f"Accuracy: {accuracy}") + print(f"Precision: {precision}") + print(f"Recall: {recall}") + print(f"F1 Score: {f1}") + print(f"Confusion Matrix:\n{conf_matrix}") + + dataset_spec: Dict = retrieve_dataset() + dataset = read_dataset(dataset_spec) + + ml_algorithms = ["SVM", "LogisticRegression", "DecisionTreeClassifier"] + for algorithm in ml_algorithms: + train_model(dataset_spec, dataset, algorithm) + + +dag = train_and_deploy_classifier_model() diff --git a/mwaa-etl-processor/airflow-bucket/requirements.txt b/mwaa-etl-processor/airflow-bucket/requirements.txt index 72b587d..f68086b 100644 --- a/mwaa-etl-processor/airflow-bucket/requirements.txt +++ b/mwaa-etl-processor/airflow-bucket/requirements.txt @@ -1 +1,5 @@ apache-airflow[amazon]==2.8.1 +pandas +pydantic +scikit-learn +scipy diff --git a/mwaa-etl-processor/proxy.conf b/mwaa-etl-processor/proxy.conf new file mode 100644 index 0000000..a42aecf --- /dev/null +++ b/mwaa-etl-processor/proxy.conf @@ -0,0 +1,10 @@ +services: + secretsmanager: + resources: + # list of ARNs of secrets to proxy to real AWS + - 'arn:aws:secretsmanager:.+:secret:airflow/variables/.*' + operations: + # list of operation name regex patterns to include all operations + - '.*' + # optionally, specify if only read requests should be allowed; false allows all operations + read_only: true diff --git a/mwaa-etl-processor/run.sh b/mwaa-etl-processor/run.sh index d657660..542510e 100755 --- a/mwaa-etl-processor/run.sh +++ b/mwaa-etl-processor/run.sh @@ -6,6 +6,11 @@ awslocal secretsmanager create-secret \ --name airflow/connections/aws_default \ 
--secret-string '{"conn_type": "aws", "login": "test", "password": "test", "extra": {"region_name": "us-east-1", "endpoint_url": "https://localhost.localstack.cloud:4566"}}' +# Create variable that holds the URL to the Iris dataset +awslocal secretsmanager create-secret \ + --name airflow/variables/dataset_spec \ + --secret-string '{"url": "https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv", "name": "iris.data", "feature_columns": ["sepal_length", "sepal_width", "petal_length", "petal_width"], "target_column": "variety"}' + # Create MWAA environment with default AWS connection awslocal s3 mb s3://airflow awslocal mwaa create-environment \ @@ -20,4 +25,3 @@ awslocal mwaa create-environment \ # Upload DAG to MWAA environment awslocal s3 cp --recursive airflow-bucket s3://airflow - From b9876a3ebb7b2db39a954de069149800c59158f7 Mon Sep 17 00:00:00 2001 From: Robert - Localstack Date: Fri, 5 Jul 2024 17:40:16 +0300 Subject: [PATCH 03/16] WIP --- .../airflow-bucket/dags/train_and_deploy.py | 55 ++++++++++--------- mwaa-etl-processor/run.sh | 15 ++++- 2 files changed, 41 insertions(+), 29 deletions(-) diff --git a/mwaa-etl-processor/airflow-bucket/dags/train_and_deploy.py b/mwaa-etl-processor/airflow-bucket/dags/train_and_deploy.py index 5b6e88d..0606b5f 100644 --- a/mwaa-etl-processor/airflow-bucket/dags/train_and_deploy.py +++ b/mwaa-etl-processor/airflow-bucket/dags/train_and_deploy.py @@ -8,14 +8,6 @@ import hashlib import pandas as pd -from sklearn.preprocessing import LabelEncoder -from sklearn.linear_model import LogisticRegression -from sklearn.model_selection import train_test_split -from sklearn.neighbors import KNeighborsClassifier -from sklearn import svm -from sklearn import metrics -from sklearn.tree import DecisionTreeClassifier - class DatasetSpec(BaseModel): url: str name: str @@ -55,31 +47,40 @@ def read_dataset(dataset_spec): # Return the dataset and its ID. return { - "dataset": df.to_json(), + "dataset": df.to_dict(), "dataset_id": dataset_id, } @task def train_model(dataset_spec: dict, dataset: dict, algorithm: str): + from sklearn import svm + from sklearn import metrics + from sklearn.preprocessing import LabelEncoder + from sklearn.linear_model import LogisticRegression + from sklearn.model_selection import train_test_split + from sklearn.neighbors import KNeighborsClassifier + from sklearn.tree import DecisionTreeClassifier + df_json: Dict = dataset["dataset"] dataset_id: str = dataset["dataset_id"] - dataset_spec: DatasetSpec = DatasetSpec(**dataset_spec) - data: pd.DataFrame = pd.DataFrame.from_dict(df_json) + data: pd.DataFrame = pd.DataFrame().from_dict(df_json) + + print(data.head()) + + # Split the dataset into feature columns and target column. + X_data = data[dataset_spec.feature_columns] + Y_data = data[dataset_spec.target_column] # Split the dataset into training and testing sets. - X_train, Y_train, X_test, Y_test = train_test_split(data, test_size=0.2) - X_train = X_train[dataset_spec.feature_columns] - X_test = X_test[dataset_spec.feature_columns] - Y_train = Y_train[dataset_spec.target_column] - Y_test = Y_test[dataset_spec.target_column] + X_train, X_test, y_train, y_test = train_test_split(X_data, Y_data, test_size=0.2) # Encode the target column. 
label_encoder = LabelEncoder() - Y_train_encoded = label_encoder.fit_transform(Y_train) - Y_test_encoded = label_encoder.transform(Y_test) - Y_train = Y_train_encoded - Y_test = Y_test_encoded + y_train_encoded = label_encoder.fit_transform(y_train) + y_test_encoded = label_encoder.transform(y_test) + y_train = y_train_encoded + y_test = y_test_encoded # Print the dataset information. print(f"Feature columns: {dataset_spec.feature_columns}") @@ -99,17 +100,17 @@ def train_model(dataset_spec: dict, dataset: dict, algorithm: str): raise ValueError(f"Unsupported algorithm: {algorithm}") # Train the model. - model.fit(X_train, Y_train) + model.fit(X_train, y_train) # Predict the target values. - Y_pred = model.predict(X_test) + y_pred = model.predict(X_test) # Compute the accuracy of the model. - accuracy = metrics.accuracy_score(Y_test, Y_pred) - precision = metrics.precision_score(Y_test, Y_pred) - recall = metrics.recall_score(Y_test, Y_pred) - f1 = metrics.f1_score(Y_test, Y_pred) - conf_matrix = metrics.confusion_matrix(Y_test, Y_pred) + accuracy = metrics.accuracy_score(y_test, y_pred) + precision = metrics.precision_score(y_test, y_pred, average=None) + recall = metrics.recall_score(y_test, y_pred, average=None) + f1 = metrics.f1_score(y_test, y_pred, average=None) + conf_matrix = metrics.confusion_matrix(y_test, y_pred) # Print or log the evaluation metrics print(f"Accuracy: {accuracy}") diff --git a/mwaa-etl-processor/run.sh b/mwaa-etl-processor/run.sh index 542510e..b24f571 100755 --- a/mwaa-etl-processor/run.sh +++ b/mwaa-etl-processor/run.sh @@ -9,7 +9,12 @@ awslocal secretsmanager create-secret \ # Create variable that holds the URL to the Iris dataset awslocal secretsmanager create-secret \ --name airflow/variables/dataset_spec \ - --secret-string '{"url": "https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv", "name": "iris.data", "feature_columns": ["sepal_length", "sepal_width", "petal_length", "petal_width"], "target_column": "variety"}' + --secret-string '{ + "url": "https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv", + "name": "iris.data", + "feature_columns": ["sepal.length", "sepal.width", "petal.length", "petal.width"], + "target_column": "variety" + }' # Create MWAA environment with default AWS connection awslocal s3 mb s3://airflow @@ -17,7 +22,13 @@ awslocal mwaa create-environment \ --name my-mwaa-env \ --airflow-version 2.8.1 \ --dag-s3-path /dags \ - --airflow-configuration-options '{"secrets.backend": "airflow.providers.amazon.aws.secrets.secrets_manager.SecretsManagerBackend", "secrets.backend_kwargs": "{\"connections_prefix\": \"airflow/connections/\", \"variables_prefix\": \"airflow/variables/\", \"endpoint_url\": \"https://localhost.localstack.cloud:4566\", \"aws_access_key_id\": \"test\", \"aws_secret_access_key\": \"test\", \"region_name\": \"us-east-1\"}"}' \ + --airflow-configuration-options '{ + "core.dags_are_paused_at_creation": "False", + "scheduler.min_file_process_interval": "0", + "scheduler.dag_dir_list_interval": "10", + "secrets.backend": "airflow.providers.amazon.aws.secrets.secrets_manager.SecretsManagerBackend", + "secrets.backend_kwargs": "{\"connections_prefix\": \"airflow/connections/\", \"variables_prefix\": \"airflow/variables/\", \"endpoint_url\": \"https://localhost.localstack.cloud:4566\", \"aws_access_key_id\": \"test\", \"aws_secret_access_key\": \"test\", \"region_name\": \"us-east-1\"}" + }' \ --execution-role-arn 
arn:aws:iam::000000000000:role/airflow-role \ --source-bucket-arn arn:aws:s3:::airflow \ --network-configuration {} \ From 97b42bc3d49afeadab62481fb3270476e148a215 Mon Sep 17 00:00:00 2001 From: Robert - Localstack Date: Mon, 8 Jul 2024 19:07:51 +0300 Subject: [PATCH 04/16] WIP --- .../airflow-bucket/dags/train_and_deploy.py | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/mwaa-etl-processor/airflow-bucket/dags/train_and_deploy.py b/mwaa-etl-processor/airflow-bucket/dags/train_and_deploy.py index 0606b5f..bc3bdc9 100644 --- a/mwaa-etl-processor/airflow-bucket/dags/train_and_deploy.py +++ b/mwaa-etl-processor/airflow-bucket/dags/train_and_deploy.py @@ -107,24 +107,36 @@ def train_model(dataset_spec: dict, dataset: dict, algorithm: str): # Compute the accuracy of the model. accuracy = metrics.accuracy_score(y_test, y_pred) - precision = metrics.precision_score(y_test, y_pred, average=None) - recall = metrics.recall_score(y_test, y_pred, average=None) - f1 = metrics.f1_score(y_test, y_pred, average=None) + precision = metrics.precision_score(y_test, y_pred, average="weighted") + recall = metrics.recall_score(y_test, y_pred, average="weighted") + f1 = metrics.f1_score(y_test, y_pred, average="weighted") conf_matrix = metrics.confusion_matrix(y_test, y_pred) + # Save the model and label encoder classes. + model.classes_names = label_encoder.classes_ + # Print or log the evaluation metrics print(f"Accuracy: {accuracy}") print(f"Precision: {precision}") print(f"Recall: {recall}") print(f"F1 Score: {f1}") print(f"Confusion Matrix:\n{conf_matrix}") + + return accuracy + + @task + def deploy_model(accuracies: List[float]): + print(f"Model accuracies: {accuracies}") dataset_spec: Dict = retrieve_dataset() dataset = read_dataset(dataset_spec) ml_algorithms = ["SVM", "LogisticRegression", "DecisionTreeClassifier"] + accuracies = [] for algorithm in ml_algorithms: - train_model(dataset_spec, dataset, algorithm) + accuracies += [train_model(dataset_spec, dataset, algorithm)] + + deploy_model(accuracies) dag = train_and_deploy_classifier_model() From f90454b38e95a22257df272e0105afe53ea7897b Mon Sep 17 00:00:00 2001 From: Robert - Localstack Date: Mon, 8 Jul 2024 19:22:53 +0300 Subject: [PATCH 05/16] Multi-line configs --- mwaa-etl-processor/Makefile | 2 +- mwaa-etl-processor/run.sh | 36 +++++++++++++++++++++++------------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/mwaa-etl-processor/Makefile b/mwaa-etl-processor/Makefile index 99deb48..d19285e 100644 --- a/mwaa-etl-processor/Makefile +++ b/mwaa-etl-processor/Makefile @@ -17,7 +17,7 @@ start: ## Start LocalStack echo "Creating localstack network"; \ docker network create --attachable localstack; \ fi - DOCKER_FLAGS='--network localstack --name=localhost.localstack.cloud -e GATEWAY_SERVER=hypercorn' localstack start -d + DOCKER_FLAGS='-e LS_LOG=trace --network localstack --name=localhost.localstack.cloud -e GATEWAY_SERVER=hypercorn' localstack start -d stop: ## Stop LocalStack @echo diff --git a/mwaa-etl-processor/run.sh b/mwaa-etl-processor/run.sh index b24f571..889ffae 100755 --- a/mwaa-etl-processor/run.sh +++ b/mwaa-etl-processor/run.sh @@ -1,6 +1,27 @@ #!/bin/bash set -euxo pipefail +AIRFLOW_CONFIG=$(cat < Date: Tue, 9 Jul 2024 02:20:11 +0300 Subject: [PATCH 06/16] More WIP --- .gitignore | 2 + .../airflow-bucket/dags/train_and_deploy.py | 60 ++++++++++++++++++- mwaa-etl-processor/lambda/main.py | 37 ++++++++++++ mwaa-etl-processor/run.sh | 15 +++++ 4 files changed, 112 insertions(+), 2 
deletions(-) create mode 100644 mwaa-etl-processor/lambda/main.py diff --git a/.gitignore b/.gitignore index ae41d4d..22542c2 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,5 @@ glacier-s3-select/*result.csv azure-functions/.python_packages/ */logs.txt +mwaa-etl-processor/.lambda-env +mwaa-etl-processor/.pkgs diff --git a/mwaa-etl-processor/airflow-bucket/dags/train_and_deploy.py b/mwaa-etl-processor/airflow-bucket/dags/train_and_deploy.py index bc3bdc9..aa2cb03 100644 --- a/mwaa-etl-processor/airflow-bucket/dags/train_and_deploy.py +++ b/mwaa-etl-processor/airflow-bucket/dags/train_and_deploy.py @@ -1,6 +1,12 @@ from airflow.decorators import dag, task from airflow.utils.dates import days_ago from airflow.models import Variable +from airflow.providers.amazon.aws.hooks.s3 import S3Hook +from airflow.providers.amazon.aws.hooks.lambda_function import LambdaHook +from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook + +import pickle +import io from pydantic import BaseModel from typing import List, Dict, Optional @@ -114,6 +120,18 @@ def train_model(dataset_spec: dict, dataset: dict, algorithm: str): # Save the model and label encoder classes. model.classes_names = label_encoder.classes_ + + # Dump the model and label encoder to S3. + s3_hook = S3Hook(aws_conn_id="aws_default") + s3_hook.create_bucket(bucket_name="models") + model_bytes = pickle.dumps(model) + model_buffer = io.BytesIO(model_bytes) + s3_hook.load_bytes( + bytes_data=model_buffer.getvalue(), + key=f"models/{dataset_id}/{algorithm}.pkl", + bucket_name="models", + replace=True, + ) # Print or log the evaluation metrics print(f"Accuracy: {accuracy}") @@ -125,8 +143,46 @@ def train_model(dataset_spec: dict, dataset: dict, algorithm: str): return accuracy @task - def deploy_model(accuracies: List[float]): + def deploy_model(ml_algorithms: List[str], accuracies: List[float], dataset: dict): print(f"Model accuracies: {accuracies}") + print(f"ML algorithms: {ml_algorithms}") + + dataset_id = dataset["dataset_id"] + best_model_index = accuracies.index(max(accuracies)) + best_ml_algorithm = ml_algorithms[best_model_index] + + print(f"Location of best model: s3://models/models/{dataset_id}/{best_ml_algorithm}.pkl") + lambda_hook = LambdaHook(aws_conn_id="aws_default") + lambda_client = lambda_hook.get_client_type() + + try: + lambda_hook.create_lambda( + function_name=f"ml-model-{best_ml_algorithm}-{dataset_id}"[:64], + runtime="python3.9", + role="arn:aws:iam::000000000000:role/lambda-role", + handler="main.lambda_handler", + code={ + "S3Bucket": "lambda", + "S3Key": "deploy_lambda.zip", + }, + environment={ + "Variables": { + "MODEL_BUCKET_NAME": "models", + "MODEL_OBJECT_KEY": f"models/{dataset_id}/{best_ml_algorithm}.pkl", + }, + }, + ) + except Exception as e: + print(f"Error creating the function: {e}") + + try: + lambda_client.create_function_url_config( + FunctionName=f"ml-model-{best_ml_algorithm}-{dataset_id}"[:64], + AuthType="NONE", + InvokeMode="BUFFERED", + ) + except Exception as e: + print(f"Error creating the function URL config: {e}") dataset_spec: Dict = retrieve_dataset() dataset = read_dataset(dataset_spec) @@ -136,7 +192,7 @@ def deploy_model(accuracies: List[float]): for algorithm in ml_algorithms: accuracies += [train_model(dataset_spec, dataset, algorithm)] - deploy_model(accuracies) + deploy_model(ml_algorithms, accuracies, dataset) dag = train_and_deploy_classifier_model() diff --git a/mwaa-etl-processor/lambda/main.py b/mwaa-etl-processor/lambda/main.py new file mode 100644 index 
0000000..2e78b87 --- /dev/null +++ b/mwaa-etl-processor/lambda/main.py @@ -0,0 +1,37 @@ +import os +import boto3 +import pickle +import json + +def lambda_handler(event, context): + # Access the JSON payload + sample = event.get("sample") + + # Specify the S3 bucket and object key + bucket_name = os.environ["MODEL_BUCKET_NAME"] + object_key = os.environ["MODEL_OBJECT_KEY"] + + # Create an S3 client + s3 = boto3.client("s3") + + # Download the file from S3 + response = s3.get_object(Bucket=bucket_name, Key=object_key) + model_data = response["Body"].read() + + # Load the model from the downloaded data + model = pickle.loads(model_data) + + # Run inference. + print(f"Running inference on input data: {sample}") + index_prediction = int(model.predict(sample)[0]) + print(f"Prediction index: {index_prediction}") + prediction = model.classes_names[index_prediction] + print(f"Prediction: {prediction}") + + return { + "statusCode": 200, + "headers": { + "Content-Type": "application/json" + }, + "body": json.dumps({"prediction": prediction}) + } diff --git a/mwaa-etl-processor/run.sh b/mwaa-etl-processor/run.sh index 889ffae..f8f5e6e 100755 --- a/mwaa-etl-processor/run.sh +++ b/mwaa-etl-processor/run.sh @@ -32,6 +32,21 @@ awslocal secretsmanager create-secret \ --name airflow/variables/dataset_spec \ --secret-string "$DATASET_CONFIG" +# Create the Lambda zip files +rm -rf .pkgs lambda/deploy_lambda.zip +docker rm -f mwaa-etl-processor-build-pip-pkgs || true +docker run -it --name mwaa-etl-processor-build-pip-pkgs \ + --entrypoint bash \ + localstack/lambda-python:3.9 \ + -c "mkdir -p /var/tmp/.pkgs && pip install scikit-learn==1.3.2 --target /var/tmp/.pkgs" +docker cp mwaa-etl-processor-build-pip-pkgs:/var/tmp/.pkgs/ . +cd .pkgs && zip -r9 ../lambda/deploy_lambda.zip . && cd .. +cd lambda && zip -g deploy_lambda.zip * && cd .. 
+ +# And upload it to S3 +awslocal s3 mb s3://lambda +awslocal s3 cp lambda/deploy_lambda.zip s3://lambda + # Create MWAA environment with default AWS connection awslocal s3 mb s3://airflow awslocal mwaa create-environment \ From 515200b735885d1f51668c8bd0929f65fcd3fa56 Mon Sep 17 00:00:00 2001 From: Robert - Localstack Date: Tue, 9 Jul 2024 03:11:22 +0300 Subject: [PATCH 07/16] Fixes --- .../airflow-bucket/dags/train_and_deploy.py | 28 +++++++++---- mwaa-etl-processor/lambda/main.py | 9 +++-- mwaa-etl-processor/run.sh | 40 ++++++++++++++----- 3 files changed, 56 insertions(+), 21 deletions(-) diff --git a/mwaa-etl-processor/airflow-bucket/dags/train_and_deploy.py b/mwaa-etl-processor/airflow-bucket/dags/train_and_deploy.py index aa2cb03..b6f745b 100644 --- a/mwaa-etl-processor/airflow-bucket/dags/train_and_deploy.py +++ b/mwaa-etl-processor/airflow-bucket/dags/train_and_deploy.py @@ -1,15 +1,14 @@ +import pickle +import io +from datetime import datetime, timedelta + from airflow.decorators import dag, task -from airflow.utils.dates import days_ago from airflow.models import Variable from airflow.providers.amazon.aws.hooks.s3 import S3Hook from airflow.providers.amazon.aws.hooks.lambda_function import LambdaHook -from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook - -import pickle -import io from pydantic import BaseModel -from typing import List, Dict, Optional +from typing import List, Dict import hashlib import pandas as pd @@ -20,7 +19,22 @@ class DatasetSpec(BaseModel): feature_columns: List[str] target_column: str -@dag(schedule_interval=None, start_date=days_ago(1), tags=['example']) +default_args = { + 'owner': 'airflow', + 'depends_on_past': False, + 'email_on_failure': False, + 'email_on_retry': False, + 'retries': 1, + 'retry_delay': timedelta(minutes=5), +} + +@dag( + default_args=default_args, + schedule_interval="@once", + start_date=datetime(2021, 1, 1), + catchup=False, + tags=["ml-classifier"] +) def train_and_deploy_classifier_model(): @task def retrieve_dataset(): diff --git a/mwaa-etl-processor/lambda/main.py b/mwaa-etl-processor/lambda/main.py index 2e78b87..f9218ec 100644 --- a/mwaa-etl-processor/lambda/main.py +++ b/mwaa-etl-processor/lambda/main.py @@ -4,8 +4,11 @@ import json def lambda_handler(event, context): - # Access the JSON payload - sample = event.get("sample") + print(f"Received event: {json.dumps(event)}") + + # Retrieve JSON from body + payload = json.loads(event["body"]) + sample = payload.get("sample") # Specify the S3 bucket and object key bucket_name = os.environ["MODEL_BUCKET_NAME"] @@ -22,7 +25,7 @@ def lambda_handler(event, context): model = pickle.loads(model_data) # Run inference. 
- print(f"Running inference on input data: {sample}") + print(f"Running inference on sample: {sample}") index_prediction = int(model.predict(sample)[0]) print(f"Prediction index: {index_prediction}") prediction = model.classes_names[index_prediction] diff --git a/mwaa-etl-processor/run.sh b/mwaa-etl-processor/run.sh index f8f5e6e..de75388 100755 --- a/mwaa-etl-processor/run.sh +++ b/mwaa-etl-processor/run.sh @@ -1,6 +1,16 @@ #!/bin/bash set -euxo pipefail +DATASET_CONFIG=$(cat < Date: Tue, 9 Jul 2024 03:24:26 +0300 Subject: [PATCH 08/16] Change name of sample app --- .gitignore | 3 +-- .../Makefile | 0 .../airflow-bucket/dags/train_and_deploy.py | 0 .../airflow-bucket/requirements.txt | 0 .../lambda/main.py | 0 .../proxy.conf | 0 .../run.sh | 8 ++++---- 7 files changed, 5 insertions(+), 6 deletions(-) rename {mwaa-etl-processor => mwaa-ml-classifier-deployment}/Makefile (100%) rename {mwaa-etl-processor => mwaa-ml-classifier-deployment}/airflow-bucket/dags/train_and_deploy.py (100%) rename {mwaa-etl-processor => mwaa-ml-classifier-deployment}/airflow-bucket/requirements.txt (100%) rename {mwaa-etl-processor => mwaa-ml-classifier-deployment}/lambda/main.py (100%) rename {mwaa-etl-processor => mwaa-ml-classifier-deployment}/proxy.conf (100%) rename {mwaa-etl-processor => mwaa-ml-classifier-deployment}/run.sh (92%) diff --git a/.gitignore b/.gitignore index 22542c2..e84db24 100644 --- a/.gitignore +++ b/.gitignore @@ -29,5 +29,4 @@ glacier-s3-select/*result.csv azure-functions/.python_packages/ */logs.txt -mwaa-etl-processor/.lambda-env -mwaa-etl-processor/.pkgs +mwaa-ml-classifier-deployment/.pkgs diff --git a/mwaa-etl-processor/Makefile b/mwaa-ml-classifier-deployment/Makefile similarity index 100% rename from mwaa-etl-processor/Makefile rename to mwaa-ml-classifier-deployment/Makefile diff --git a/mwaa-etl-processor/airflow-bucket/dags/train_and_deploy.py b/mwaa-ml-classifier-deployment/airflow-bucket/dags/train_and_deploy.py similarity index 100% rename from mwaa-etl-processor/airflow-bucket/dags/train_and_deploy.py rename to mwaa-ml-classifier-deployment/airflow-bucket/dags/train_and_deploy.py diff --git a/mwaa-etl-processor/airflow-bucket/requirements.txt b/mwaa-ml-classifier-deployment/airflow-bucket/requirements.txt similarity index 100% rename from mwaa-etl-processor/airflow-bucket/requirements.txt rename to mwaa-ml-classifier-deployment/airflow-bucket/requirements.txt diff --git a/mwaa-etl-processor/lambda/main.py b/mwaa-ml-classifier-deployment/lambda/main.py similarity index 100% rename from mwaa-etl-processor/lambda/main.py rename to mwaa-ml-classifier-deployment/lambda/main.py diff --git a/mwaa-etl-processor/proxy.conf b/mwaa-ml-classifier-deployment/proxy.conf similarity index 100% rename from mwaa-etl-processor/proxy.conf rename to mwaa-ml-classifier-deployment/proxy.conf diff --git a/mwaa-etl-processor/run.sh b/mwaa-ml-classifier-deployment/run.sh similarity index 92% rename from mwaa-etl-processor/run.sh rename to mwaa-ml-classifier-deployment/run.sh index de75388..ffd044d 100755 --- a/mwaa-etl-processor/run.sh +++ b/mwaa-ml-classifier-deployment/run.sh @@ -34,12 +34,12 @@ awslocal secretsmanager create-secret \ # Create the Lambda zip files rm -rf .pkgs lambda/deploy_lambda.zip -docker rm -f mwaa-etl-processor-build-pip-pkgs -docker run -it --name mwaa-etl-processor-build-pip-pkgs \ +docker rm -f mwaa-ml-classifier-deployment-build-pip-pkgs +docker run -it --name mwaa-ml-classifier-deployment-build-pip-pkgs \ --entrypoint bash \ localstack/lambda-python:3.9 \ -c "mkdir -p 
/var/tmp/.pkgs && pip install scikit-learn==1.3.2 --target /var/tmp/.pkgs" -docker cp mwaa-etl-processor-build-pip-pkgs:/var/tmp/.pkgs/ . +docker cp mwaa-ml-classifier-deployment-build-pip-pkgs:/var/tmp/.pkgs/ . cd .pkgs && zip -r9 ../lambda/deploy_lambda.zip . && cd .. cd lambda && zip -g deploy_lambda.zip * && cd .. @@ -64,7 +64,7 @@ awslocal s3 cp --recursive airflow-bucket s3://airflow # Clean up rm -rf .pkgs lambda/deploy_lambda.zip -docker rm -f mwaa-etl-processor-build-pip-pkgs +docker rm -f mwaa-ml-classifier-deployment-build-pip-pkgs # Wait for the DAG to be processed # Wait until a Lambda function with the "ml-model-" prefix is created From b4dbe3722073419febe0d54fccbe7604c58c7ebe Mon Sep 17 00:00:00 2001 From: Robert - Localstack Date: Tue, 9 Jul 2024 03:46:45 +0300 Subject: [PATCH 09/16] Lock scikit-learn for Airflow --- mwaa-ml-classifier-deployment/airflow-bucket/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mwaa-ml-classifier-deployment/airflow-bucket/requirements.txt b/mwaa-ml-classifier-deployment/airflow-bucket/requirements.txt index f68086b..193a330 100644 --- a/mwaa-ml-classifier-deployment/airflow-bucket/requirements.txt +++ b/mwaa-ml-classifier-deployment/airflow-bucket/requirements.txt @@ -1,5 +1,5 @@ apache-airflow[amazon]==2.8.1 pandas pydantic -scikit-learn +scikit-learn==1.3.2 scipy From 1c3e2dd56aa71f8fdded886803aa6cb7f5fa87f2 Mon Sep 17 00:00:00 2001 From: Robert - Localstack Date: Thu, 11 Jul 2024 21:13:23 +0300 Subject: [PATCH 10/16] Add README.md file --- mwaa-ml-classifier-deployment/README.md | 55 +++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 mwaa-ml-classifier-deployment/README.md diff --git a/mwaa-ml-classifier-deployment/README.md b/mwaa-ml-classifier-deployment/README.md new file mode 100644 index 0000000..7cd92af --- /dev/null +++ b/mwaa-ml-classifier-deployment/README.md @@ -0,0 +1,55 @@ +# LocalStack Demo: Training and deploying an ML classifier with MWAA + +App that creates a DAG inside MWAA that takes a dataset, and builds a classifier model based on the feature columns and target column. A classifier is trained with each of three algorithms (SVM, Logistic Regression, and Decision Tree), and the one with the best accuracy is selected. Finally, the model is deployed as a Lambda function. + +To keep it simple, no external dependencies (custom Docker images) were added, and the training happens locally in Airflow. Following that, the model gets deployed as a Lambda function. While not ideal, as usually all workloads are supposed to be off-loaded (i.e. with SageMaker, or EC2 / AWS Batch jobs), but easily trained models can still technically be run with the local executor. + +The only input the dag has is a `airflow/variables/dataset_spec` secret in `SecretsManager` service, like the following one: + +```json +{ + "url": "https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv", + "name": "iris.data", + "feature_columns": ["sepal.length", "sepal.width", "petal.length", "petal.width"], + "target_column": "variety" +} +``` + +## Prerequisites + +* LocalStack +* Docker +* Python 3.8+ / Python Pip +* `make` +* `jq` +* `curl` +* `awslocal` + +## Installing + +To install the dependencies: + +```shell +make install +``` + +## Starting LocalStack + +Make sure that LocalStack is started: + +```shell +LOCALSTACK_AUTH_TOKEN=... 
make start +``` + +## Running + +Run the sample demo script: + +```shell +make run +``` + +## License + +This code is available under the Apache 2.0 license. + From da8448ee13b7a824f6e86cc50871ac91055fd6e0 Mon Sep 17 00:00:00 2001 From: Robert - Localstack Date: Thu, 11 Jul 2024 21:14:33 +0300 Subject: [PATCH 11/16] Capitalize dag word --- mwaa-ml-classifier-deployment/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mwaa-ml-classifier-deployment/README.md b/mwaa-ml-classifier-deployment/README.md index 7cd92af..4282e60 100644 --- a/mwaa-ml-classifier-deployment/README.md +++ b/mwaa-ml-classifier-deployment/README.md @@ -4,7 +4,7 @@ App that creates a DAG inside MWAA that takes a dataset, and builds a classifier To keep it simple, no external dependencies (custom Docker images) were added, and the training happens locally in Airflow. Following that, the model gets deployed as a Lambda function. While not ideal, as usually all workloads are supposed to be off-loaded (i.e. with SageMaker, or EC2 / AWS Batch jobs), but easily trained models can still technically be run with the local executor. -The only input the dag has is a `airflow/variables/dataset_spec` secret in `SecretsManager` service, like the following one: +The only input the DAG has is a `airflow/variables/dataset_spec` secret in `SecretsManager` service, like the following one: ```json { From 00c3de280d9c50684a14e92f0c3959650bd8ecb8 Mon Sep 17 00:00:00 2001 From: Robert - Localstack Date: Sat, 13 Jul 2024 15:54:10 +0300 Subject: [PATCH 12/16] Add missing env var to makefile target --- mwaa-ml-classifier-deployment/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mwaa-ml-classifier-deployment/Makefile b/mwaa-ml-classifier-deployment/Makefile index d19285e..eb049a8 100644 --- a/mwaa-ml-classifier-deployment/Makefile +++ b/mwaa-ml-classifier-deployment/Makefile @@ -17,7 +17,7 @@ start: ## Start LocalStack echo "Creating localstack network"; \ docker network create --attachable localstack; \ fi - DOCKER_FLAGS='-e LS_LOG=trace --network localstack --name=localhost.localstack.cloud -e GATEWAY_SERVER=hypercorn' localstack start -d + EXTRA_CORS_ALLOWED_ORIGINS='*' DOCKER_FLAGS='-e LS_LOG=trace --network localstack --name=localhost.localstack.cloud -e GATEWAY_SERVER=hypercorn' localstack start -d stop: ## Stop LocalStack @echo From 87b038340e030d7d262a1fcd42a6c52fdc3a2b48 Mon Sep 17 00:00:00 2001 From: Robert - Localstack Date: Mon, 12 Aug 2024 15:12:48 +0300 Subject: [PATCH 13/16] Fix make start command when using sh --- mwaa-ml-classifier-deployment/Makefile | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/mwaa-ml-classifier-deployment/Makefile b/mwaa-ml-classifier-deployment/Makefile index eb049a8..c6e02d1 100644 --- a/mwaa-ml-classifier-deployment/Makefile +++ b/mwaa-ml-classifier-deployment/Makefile @@ -13,10 +13,14 @@ run: ## Run MWAA ETL Processor ./run.sh start: ## Start LocalStack - @if [ -z `docker network ls --filter name=localstack -q` ]; then \ - echo "Creating localstack network"; \ - docker network create --attachable localstack; \ - fi + @echo "Checking existing networks:" + @docker network ls --filter name=localstack -q | xargs + @if [ -z "$$(docker network ls --filter name=localstack -q | xargs)" ]; then \ + echo "Creating localstack network"; \ + docker network create --attachable localstack; \ + else \ + echo "LocalStack network already exists"; \ + fi EXTRA_CORS_ALLOWED_ORIGINS='*' DOCKER_FLAGS='-e LS_LOG=trace --network localstack 
--name=localhost.localstack.cloud -e GATEWAY_SERVER=hypercorn' localstack start -d stop: ## Stop LocalStack From b728cade011be015e22b624d7aa86023c07f4a22 Mon Sep 17 00:00:00 2001 From: Robert - Localstack Date: Mon, 12 Aug 2024 17:11:15 +0300 Subject: [PATCH 14/16] Improve readme doc --- mwaa-ml-classifier-deployment/README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mwaa-ml-classifier-deployment/README.md b/mwaa-ml-classifier-deployment/README.md index 4282e60..07c5489 100644 --- a/mwaa-ml-classifier-deployment/README.md +++ b/mwaa-ml-classifier-deployment/README.md @@ -49,6 +49,14 @@ Run the sample demo script: make run ``` +## Proxying Secrets + +To proxy Airflow variables to upstream AWS, you can use the [proxy.conf](proxy.conf) config file to only use upstream AWS secrets as the Airflow variables. That's because we're sourcing the Airflow variables from the AWS Secrets backend. This assumes you have the `localstack-extension-aws-replicator` extension installed onto the LocalStack instance: https://pypi.org/project/localstack-extension-aws-replicator/. + +```shell +localstack aws proxy -c proxy.conf --container +``` + ## License This code is available under the Apache 2.0 license. From b8b350eea947abad1272fb493aa7c6f265df771c Mon Sep 17 00:00:00 2001 From: Robert - Localstack Date: Tue, 13 Aug 2024 13:41:04 +0300 Subject: [PATCH 15/16] Use tab indentation --- mwaa-ml-classifier-deployment/Makefile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mwaa-ml-classifier-deployment/Makefile b/mwaa-ml-classifier-deployment/Makefile index c6e02d1..3e3a169 100644 --- a/mwaa-ml-classifier-deployment/Makefile +++ b/mwaa-ml-classifier-deployment/Makefile @@ -14,13 +14,13 @@ run: ## Run MWAA ETL Processor start: ## Start LocalStack @echo "Checking existing networks:" - @docker network ls --filter name=localstack -q | xargs - @if [ -z "$$(docker network ls --filter name=localstack -q | xargs)" ]; then \ - echo "Creating localstack network"; \ - docker network create --attachable localstack; \ - else \ - echo "LocalStack network already exists"; \ - fi + @docker network ls --filter name=localstack -q | xargs + @if [ -z "$$(docker network ls --filter name=localstack -q | xargs)" ]; then \ + echo "Creating localstack network"; \ + docker network create --attachable localstack; \ + else \ + echo "LocalStack network already exists"; \ + fi EXTRA_CORS_ALLOWED_ORIGINS='*' DOCKER_FLAGS='-e LS_LOG=trace --network localstack --name=localhost.localstack.cloud -e GATEWAY_SERVER=hypercorn' localstack start -d stop: ## Stop LocalStack From 6a629fb6ed66fd984e8bb0068e4f9deee348da16 Mon Sep 17 00:00:00 2001 From: Robert - Localstack Date: Tue, 13 Aug 2024 14:08:52 +0300 Subject: [PATCH 16/16] Remove if branch --- mwaa-ml-classifier-deployment/Makefile | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/mwaa-ml-classifier-deployment/Makefile b/mwaa-ml-classifier-deployment/Makefile index 3e3a169..21e38e7 100644 --- a/mwaa-ml-classifier-deployment/Makefile +++ b/mwaa-ml-classifier-deployment/Makefile @@ -13,14 +13,9 @@ run: ## Run MWAA ETL Processor ./run.sh start: ## Start LocalStack - @echo "Checking existing networks:" - @docker network ls --filter name=localstack -q | xargs - @if [ -z "$$(docker network ls --filter name=localstack -q | xargs)" ]; then \ - echo "Creating localstack network"; \ - docker network create --attachable localstack; \ - else \ - echo "LocalStack network already exists"; \ - fi + @echo "Attempting to create localstack 
network (if it doesn't already exist):" + @docker network create --attachable localstack || true + @echo "Starting LocalStack:" EXTRA_CORS_ALLOWED_ORIGINS='*' DOCKER_FLAGS='-e LS_LOG=trace --network localstack --name=localhost.localstack.cloud -e GATEWAY_SERVER=hypercorn' localstack start -d stop: ## Stop LocalStack