From 67825a0c0b442d93b8af43d23bd82afcad7fa549 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 17 Oct 2025 22:05:34 +0800 Subject: [PATCH 1/8] expand pvc size & fix inference-pool selector error Signed-off-by: JaredforReal --- deploy/kubernetes/README.md | 32 +++++++----- .../inference-pool/inference-pool.yaml | 2 +- deploy/kubernetes/pvc.yaml | 2 +- website/docs/installation/kubernetes.md | 49 +++++++++++-------- 4 files changed, 52 insertions(+), 33 deletions(-) diff --git a/deploy/kubernetes/README.md b/deploy/kubernetes/README.md index 175763cdc..bfce72a8e 100644 --- a/deploy/kubernetes/README.md +++ b/deploy/kubernetes/README.md @@ -7,7 +7,7 @@ This directory contains Kubernetes manifests for deploying the Semantic Router u The deployment consists of: - **ConfigMap**: Contains `config.yaml` and `tools_db.json` configuration files -- **PersistentVolumeClaim**: 10Gi storage for model files +- **PersistentVolumeClaim**: 30Gi storage for model files (adjust based on models you enable) - **Deployment**: - **Init Container**: Downloads/copies model files to persistent volume - **Main Container**: Runs the semantic router service @@ -29,11 +29,11 @@ The deployment consists of: kubectl apply -k deploy/kubernetes/ # Check deployment status -kubectl get pods -l app=semantic-router -n semantic-router -kubectl get services -l app=semantic-router -n semantic-router +kubectl get pods -l app=semantic-router -n vllm-semantic-router-system +kubectl get services -l app=semantic-router -n vllm-semantic-router-system # View logs -kubectl logs -l app=semantic-router -n semantic-router -f +kubectl logs -l app=semantic-router -n vllm-semantic-router-system -f ``` ### Kind (Kubernetes in Docker) Deployment @@ -86,20 +86,20 @@ kubectl wait --for=condition=Ready nodes --all --timeout=300s kubectl apply -k deploy/kubernetes/ # Wait for deployment to be ready -kubectl wait --for=condition=Available deployment/semantic-router -n semantic-router --timeout=600s +kubectl wait --for=condition=Available deployment/semantic-router -n vllm-semantic-router-system --timeout=600s ``` **Step 3: Check deployment status** ```bash # Check pods -kubectl get pods -n semantic-router -o wide +kubectl get pods -n vllm-semantic-router-system -o wide # Check services -kubectl get services -n semantic-router +kubectl get services -n vllm-semantic-router-system # View logs -kubectl logs -l app=semantic-router -n semantic-router -f +kubectl logs -l app=semantic-router -n vllm-semantic-router-system -f ``` #### Resource Requirements for Kind @@ -137,13 +137,13 @@ Or using kubectl directly: ```bash # Access Classification API (HTTP REST) -kubectl port-forward -n semantic-router svc/semantic-router 8080:8080 +kubectl port-forward -n vllm-semantic-router-system svc/semantic-router 8080:8080 # Access gRPC API -kubectl port-forward -n semantic-router svc/semantic-router 50051:50051 +kubectl port-forward -n vllm-semantic-router-system svc/semantic-router 50051:50051 # Access metrics -kubectl port-forward -n semantic-router svc/semantic-router-metrics 9190:9190 +kubectl port-forward -n vllm-semantic-router-system svc/semantic-router-metrics 9190:9190 ``` #### Testing the Deployment @@ -195,6 +195,11 @@ kubectl delete -k deploy/kubernetes/ kind delete cluster --name semantic-router-cluster ``` +## Notes on dependencies + +- Gateway API Inference Extension CRDs are required only when using the Envoy AI Gateway integration in `deploy/kubernetes/ai-gateway/`. 
Follow the installation steps in `website/docs/installation/kubernetes.md` if you plan to use the gateway path. +- The core kustomize deployment in this folder does not install Envoy Gateway or AI Gateway; those are optional components documented separately. + ## Make Commands Reference The project provides comprehensive make targets for managing kind clusters and deployments: @@ -293,6 +298,11 @@ kubectl top pods -n semantic-router # Adjust resource limits in deployment.yaml if needed ``` +### Storage sizing + +- The default PVC is 30Gi. If the enabled models are small, you can reduce it; otherwise reserve at least 2–3x the total model size. +- If your cluster's default StorageClass isn't named `standard`, change `storageClassName` in `pvc.yaml` accordingly or remove the field to use the default class. + ### Resource Optimization For different environments, you can adjust resource requirements: diff --git a/deploy/kubernetes/ai-gateway/inference-pool/inference-pool.yaml b/deploy/kubernetes/ai-gateway/inference-pool/inference-pool.yaml index 64afc6f93..7b52e07b1 100644 --- a/deploy/kubernetes/ai-gateway/inference-pool/inference-pool.yaml +++ b/deploy/kubernetes/ai-gateway/inference-pool/inference-pool.yaml @@ -11,7 +11,7 @@ spec: - number: 50051 selector: matchLabels: - app: vllm-semantic-router + app: semantic-router endpointPickerRef: name: semantic-router port: diff --git a/deploy/kubernetes/pvc.yaml b/deploy/kubernetes/pvc.yaml index 089293069..43b66eb95 100644 --- a/deploy/kubernetes/pvc.yaml +++ b/deploy/kubernetes/pvc.yaml @@ -9,5 +9,5 @@ spec: - ReadWriteOnce resources: requests: - storage: 10Gi + storage: 30Gi storageClassName: standard diff --git a/website/docs/installation/kubernetes.md b/website/docs/installation/kubernetes.md index 80821ad9c..792ff935b 100644 --- a/website/docs/installation/kubernetes.md +++ b/website/docs/installation/kubernetes.md @@ -37,6 +37,13 @@ kubectl wait --for=condition=Ready nodes --all --timeout=300s Configure the semantic router by editing `deploy/kubernetes/config.yaml`. This file contains the vLLM configuration, including model config, endpoints, and policies. +Important notes before you apply manifests: + +- `vllm_endpoints.address` must be an IP address (not hostname) reachable from inside the cluster. If your LLM backends run as K8s Services, use the ClusterIP (for example `10.96.0.10`) and set `port` accordingly. Do not include protocol or path. +- The PVC in `deploy/kubernetes/pvc.yaml` uses `storageClassName: standard`. On some clouds or local clusters, the default StorageClass name may differ (e.g., `standard-rwo`, `gp2`, or a provisioner like local-path). Adjust as needed. +- Default PVC size is 30Gi. Size it to at least 2–3x of your total model footprint to leave room for indexes and updates. +- The initContainer downloads several models from Hugging Face on first run and writes them into the PVC. Ensure outbound egress to Hugging Face is allowed and there is at least ~6–8 GiB free space for the models specified. 
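+
+As a quick pre-flight check on the notes above, you can resolve the ClusterIP to use in `vllm_endpoints` and confirm that the StorageClass referenced by `pvc.yaml` exists. This is a minimal sketch; the Service name `my-vllm` and namespace `my-llm-ns` are placeholders for your own backend, not names shipped with this repo:
+
+```bash
+# Resolve the ClusterIP of your LLM backend Service (replace the placeholder names)
+kubectl get svc my-vllm -n my-llm-ns -o jsonpath='{.spec.clusterIP}{"\n"}'
+
+# List StorageClasses; adjust storageClassName in pvc.yaml if "standard" is missing
+kubectl get storageclass
+```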
+ Deploy the semantic router service with all required components: ```bash @@ -135,26 +142,28 @@ Expected output should show the inference pool in `Accepted` state: ```yaml status: parent: - - conditions: - - lastTransitionTime: "2025-09-27T09:27:32Z" - message: 'InferencePool has been Accepted by controller ai-gateway-controller: - InferencePool reconciled successfully' - observedGeneration: 1 - reason: Accepted - status: "True" - type: Accepted - - lastTransitionTime: "2025-09-27T09:27:32Z" - message: 'Reference resolution by controller ai-gateway-controller: All references - resolved successfully' - observedGeneration: 1 - reason: ResolvedRefs - status: "True" - type: ResolvedRefs - parentRef: - group: gateway.networking.k8s.io - kind: Gateway - name: vllm-semantic-router - namespace: vllm-semantic-router-system + - conditions: + - lastTransitionTime: "2025-09-27T09:27:32Z" + message: + "InferencePool has been Accepted by controller ai-gateway-controller: + InferencePool reconciled successfully" + observedGeneration: 1 + reason: Accepted + status: "True" + type: Accepted + - lastTransitionTime: "2025-09-27T09:27:32Z" + message: + "Reference resolution by controller ai-gateway-controller: All references + resolved successfully" + observedGeneration: 1 + reason: ResolvedRefs + status: "True" + type: ResolvedRefs + parentRef: + group: gateway.networking.k8s.io + kind: Gateway + name: vllm-semantic-router + namespace: vllm-semantic-router-system ``` ## Testing the Deployment From b9d8cb3275a720f3cdcf8cb5294893871caa4514 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 17 Oct 2025 22:11:52 +0800 Subject: [PATCH 2/8] add llm-katan to k8s Signed-off-by: JaredforReal --- deploy/kubernetes/README.md | 18 ++ deploy/kubernetes/config.yaml | 57 +++--- .../kubernetes/deployment.with-llm-katan.yaml | 169 ++++++++++++++++++ deploy/kubernetes/kustomization.yaml | 20 +-- 4 files changed, 226 insertions(+), 38 deletions(-) create mode 100644 deploy/kubernetes/deployment.with-llm-katan.yaml diff --git a/deploy/kubernetes/README.md b/deploy/kubernetes/README.md index bfce72a8e..3c45a37c2 100644 --- a/deploy/kubernetes/README.md +++ b/deploy/kubernetes/README.md @@ -2,6 +2,8 @@ This directory contains Kubernetes manifests for deploying the Semantic Router using Kustomize. +By default, the base kustomization deploys a Pod with an `llm-katan` sidecar so that the default config (qwen3 on 127.0.0.1:8002) works out-of-the-box. If you prefer to run without the sidecar, replace `deployment.with-llm-katan.yaml` with `deployment.yaml` in `kustomization.yaml`. + ## Architecture The deployment consists of: @@ -318,6 +320,7 @@ Edit the `resources` section in `deployment.yaml` accordingly. ### Kubernetes Manifests (`deploy/kubernetes/`) - `deployment.yaml` - Main application deployment with optimized resource settings +- `deployment.with-llm-katan.yaml` - Optional variant including an llm-katan sidecar listening on 8002 (works with default config pointing to qwen3 at 127.0.0.1:8002) - `service.yaml` - Services for gRPC, HTTP API, and metrics - `pvc.yaml` - Persistent volume claim for model storage - `namespace.yaml` - Dedicated namespace for the application @@ -327,6 +330,21 @@ Edit the `resources` section in `deployment.yaml` accordingly. ### Development Tools +## Optional: run with llm-katan sidecar + +To mimic the docker-compose default setup, you can deploy a variant that runs an `llm-katan` sidecar inside the same Pod. 
The provided `deployment.with-llm-katan.yaml` exposes llm-katan on `0.0.0.0:8002` and serves the model name `qwen3`. + +Notes: + +- Ensure the Qwen model content is available at `/app/models/Qwen/Qwen3-0.6B` in the PVC. You can pre-populate the PV or customize the init container to fetch from an internal source. +- The default Kubernetes `config.yaml` has been aligned to use `qwen3` and endpoint `127.0.0.1:8002`, so it will work out-of-the-box with this sidecar. + +Apply the sidecar variant instead of the default deployment: + +```bash +kubectl apply -n vllm-semantic-router-system -f deploy/kubernetes/deployment.with-llm-katan.yaml +``` + - `tools/kind/kind-config.yaml` - Kind cluster configuration for local development - `tools/make/kube.mk` - Make targets for Kubernetes operations - `Makefile` - Root makefile including all make targets diff --git a/deploy/kubernetes/config.yaml b/deploy/kubernetes/config.yaml index 5bc40cbbe..777600130 100644 --- a/deploy/kubernetes/config.yaml +++ b/deploy/kubernetes/config.yaml @@ -1,15 +1,15 @@ bert_model: - model_id: sentence-transformers/all-MiniLM-L12-v2 + model_id: models/all-MiniLM-L12-v2 threshold: 0.6 use_cpu: true semantic_cache: enabled: true - backend_type: "memory" # Options: "memory" or "milvus" + backend_type: "memory" # Options: "memory" or "milvus" similarity_threshold: 0.8 - max_entries: 1000 # Only applies to memory backend + max_entries: 1000 # Only applies to memory backend ttl_seconds: 3600 - eviction_policy: "fifo" + eviction_policy: "fifo" tools: enabled: true @@ -32,13 +32,13 @@ prompt_guard: # NOT supported: domain names (example.com), protocol prefixes (http://), paths (/api), ports in address (use 'port' field) vllm_endpoints: - name: "endpoint1" - address: "127.0.0.1" # IPv4 address - REQUIRED format - port: 8000 + address: "127.0.0.1" # llm-katan sidecar or local backend + port: 8002 weight: 1 model_config: - "openai/gpt-oss-20b": - reasoning_family: "gpt-oss" # This model uses GPT-OSS reasoning syntax + "qwen3": + reasoning_family: "qwen3" # Match docker-compose default model name preferred_endpoints: ["endpoint1"] pii_policy: allow_by_default: true @@ -62,76 +62,76 @@ classifier: categories: - name: business model_scores: - - model: openai/gpt-oss-20b + - model: qwen3 score: 0.7 - use_reasoning: false # Business performs better without reasoning + use_reasoning: false # Business performs better without reasoning - name: law model_scores: - - model: openai/gpt-oss-20b + - model: qwen3 score: 0.4 use_reasoning: false - name: psychology model_scores: - - model: openai/gpt-oss-20b + - model: qwen3 score: 0.6 use_reasoning: false - name: biology model_scores: - - model: openai/gpt-oss-20b + - model: qwen3 score: 0.9 use_reasoning: false - name: chemistry model_scores: - - model: openai/gpt-oss-20b + - model: qwen3 score: 0.6 - use_reasoning: true # Enable reasoning for complex chemistry + use_reasoning: true # Enable reasoning for complex chemistry - name: history model_scores: - - model: openai/gpt-oss-20b + - model: qwen3 score: 0.7 use_reasoning: false - name: other model_scores: - - model: openai/gpt-oss-20b + - model: qwen3 score: 0.7 use_reasoning: false - name: health model_scores: - - model: openai/gpt-oss-20b + - model: qwen3 score: 0.5 use_reasoning: false - name: economics model_scores: - - model: openai/gpt-oss-20b + - model: qwen3 score: 1.0 use_reasoning: false - name: math model_scores: - - model: openai/gpt-oss-20b + - model: qwen3 score: 1.0 - use_reasoning: true # Enable reasoning for complex math + 
use_reasoning: true # Enable reasoning for complex math - name: physics model_scores: - - model: openai/gpt-oss-20b + - model: qwen3 score: 0.7 - use_reasoning: true # Enable reasoning for physics + use_reasoning: true # Enable reasoning for physics - name: computer science model_scores: - - model: openai/gpt-oss-20b + - model: qwen3 score: 0.6 use_reasoning: false - name: philosophy model_scores: - - model: openai/gpt-oss-20b + - model: qwen3 score: 0.5 use_reasoning: false - name: engineering model_scores: - - model: openai/gpt-oss-20b + - model: qwen3 score: 0.7 use_reasoning: false -default_model: openai/gpt-oss-20b +default_model: qwen3 # Reasoning family configurations reasoning_families: @@ -164,5 +164,6 @@ api: detailed_goroutine_tracking: true high_resolution_timing: false sample_rate: 1.0 - duration_buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30] + duration_buckets: + [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30] size_buckets: [1, 2, 5, 10, 20, 50, 100, 200] diff --git a/deploy/kubernetes/deployment.with-llm-katan.yaml b/deploy/kubernetes/deployment.with-llm-katan.yaml new file mode 100644 index 000000000..acf45e1e4 --- /dev/null +++ b/deploy/kubernetes/deployment.with-llm-katan.yaml @@ -0,0 +1,169 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: semantic-router + namespace: vllm-semantic-router-system + labels: + app: semantic-router +spec: + replicas: 1 + selector: + matchLabels: + app: semantic-router + template: + metadata: + labels: + app: semantic-router + spec: + initContainers: + - name: model-downloader + image: python:3.11-slim + securityContext: + runAsNonRoot: false + allowPrivilegeEscalation: false + command: ["/bin/bash", "-c"] + args: + - | + set -e + echo "Installing Hugging Face CLI..." + pip install --no-cache-dir huggingface_hub[cli] + + echo "Downloading classifier models to persistent volume..." + cd /app/models + + # Download category classifier model + if [ ! -d "category_classifier_modernbert-base_model" ]; then + echo "Downloading category classifier model..." + huggingface-cli download LLM-Semantic-Router/category_classifier_modernbert-base_model --local-dir category_classifier_modernbert-base_model + else + echo "Category classifier model already exists, skipping..." + fi + + # Download PII classifier model + if [ ! -d "pii_classifier_modernbert-base_model" ]; then + echo "Downloading PII classifier model..." + huggingface-cli download LLM-Semantic-Router/pii_classifier_modernbert-base_model --local-dir pii_classifier_modernbert-base_model + else + echo "PII classifier model already exists, skipping..." + fi + + # Download jailbreak classifier model + if [ ! -d "jailbreak_classifier_modernbert-base_model" ]; then + echo "Downloading jailbreak classifier model..." + huggingface-cli download LLM-Semantic-Router/jailbreak_classifier_modernbert-base_model --local-dir jailbreak_classifier_modernbert-base_model + else + echo "Jailbreak classifier model already exists, skipping..." + fi + + # Download PII token classifier model + if [ ! -d "pii_classifier_modernbert-base_presidio_token_model" ]; then + echo "Downloading PII token classifier model..." + huggingface-cli download LLM-Semantic-Router/pii_classifier_modernbert-base_presidio_token_model --local-dir pii_classifier_modernbert-base_presidio_token_model + else + echo "PII token classifier model already exists, skipping..." 
+          fi
+
+          # Optional: Prepare Qwen model directory for llm-katan sidecar
+          # NOTE: Provide the model content under /app/models/Qwen/Qwen3-0.6B via pre-populated PV
+          # or customize the following block to fetch from your internal artifact store.
+          if [ ! -d "Qwen/Qwen3-0.6B" ]; then
+            echo "Qwen3-0.6B directory not found. Please pre-populate /app/models/Qwen/Qwen3-0.6B in the PVC or customize init script to download it."
+          fi
+
+          echo "Model directory listing:" && ls -la /app/models/
+        env:
+        - name: HF_HUB_CACHE
+          value: /tmp/hf_cache
+        resources:
+          requests:
+            memory: "512Mi"
+            cpu: "250m"
+          limits:
+            memory: "1Gi"
+            cpu: "500m"
+        volumeMounts:
+        - name: models-volume
+          mountPath: /app/models
+      containers:
+      - name: semantic-router
+        image: ghcr.io/vllm-project/semantic-router/extproc:latest
+        args: ["--secure=true"]
+        securityContext:
+          runAsNonRoot: false
+          allowPrivilegeEscalation: false
+        ports:
+        - containerPort: 50051
+          name: grpc
+          protocol: TCP
+        - containerPort: 9190
+          name: metrics
+          protocol: TCP
+        - containerPort: 8080
+          name: classify-api
+          protocol: TCP
+        env:
+        - name: LD_LIBRARY_PATH
+          value: "/app/lib"
+        volumeMounts:
+        - name: config-volume
+          mountPath: /app/config
+          readOnly: true
+        - name: models-volume
+          mountPath: /app/models
+        livenessProbe:
+          tcpSocket:
+            port: 50051
+          initialDelaySeconds: 60
+          periodSeconds: 30
+          timeoutSeconds: 10
+          failureThreshold: 3
+        readinessProbe:
+          tcpSocket:
+            port: 50051
+          initialDelaySeconds: 90
+          periodSeconds: 30
+          timeoutSeconds: 10
+          failureThreshold: 3
+        resources:
+          requests:
+            memory: "3Gi"
+            cpu: "1"
+          limits:
+            memory: "6Gi"
+            cpu: "2"
+      - name: llm-katan
+        image: ghcr.io/vllm-project/semantic-router/llm-katan:latest
+        imagePullPolicy: IfNotPresent
+        args:
+          [
+            "llm-katan",
+            "--model",
+            "/app/models/Qwen/Qwen3-0.6B",
+            "--served-model-name",
+            "qwen3",
+            "--host",
+            "0.0.0.0",
+            "--port",
+            "8002",
+          ]
+        ports:
+        - containerPort: 8002
+          name: katan
+          protocol: TCP
+        volumeMounts:
+        - name: models-volume
+          mountPath: /app/models
+        resources:
+          requests:
+            memory: "1Gi"
+            cpu: "500m"
+          limits:
+            memory: "2Gi"
+            cpu: "1"
+      volumes:
+      - name: config-volume
+        configMap:
+          name: semantic-router-config
+      - name: models-volume
+        persistentVolumeClaim:
+          claimName: semantic-router-models
diff --git a/deploy/kubernetes/kustomization.yaml b/deploy/kubernetes/kustomization.yaml
index 3eae4ac99..f33d8c547 100644
--- a/deploy/kubernetes/kustomization.yaml
+++ b/deploy/kubernetes/kustomization.yaml
@@ -5,21 +5,21 @@ metadata:
   name: semantic-router

 resources:
-- namespace.yaml
-- pvc.yaml
-- deployment.yaml
-- service.yaml
+  - namespace.yaml
+  - pvc.yaml
+  - deployment.with-llm-katan.yaml
+  - service.yaml

 # Generate ConfigMap
 configMapGenerator:
-- name: semantic-router-config
-  files:
-  - config.yaml
-  - tools_db.json
+  - name: semantic-router-config
+    files:
+      - config.yaml
+      - tools_db.json

 # Namespace for all resources
 namespace: vllm-semantic-router-system

 images:
-- name: ghcr.io/vllm-project/semantic-router/extproc
-  newTag: latest
+  - name: ghcr.io/vllm-project/semantic-router/extproc
+    newTag: latest

From 3316c404cf56767d9ae19deccab57d32e36946f2 Mon Sep 17 00:00:00 2001
From: JaredforReal
Date: Fri, 17 Oct 2025 22:27:18 +0800
Subject: [PATCH 3/8] separate core and llm-katan

Signed-off-by: JaredforReal
---
 deploy/kubernetes/README.md | 44 ++--
 deploy/kubernetes/base/kustomization.yaml | 19 ++
 ...h-llm-katan.yaml => deployment.katan.yaml} | 14 +-
 deploy/kubernetes/deployment.yaml | 227 +++++++++---------
deploy/kubernetes/kustomization.yaml | 23 +- .../overlays/core/kustomization.yaml | 6 + .../overlays/llm-katan/kustomization.yaml | 6 + 7 files changed, 192 insertions(+), 147 deletions(-) create mode 100644 deploy/kubernetes/base/kustomization.yaml rename deploy/kubernetes/{deployment.with-llm-katan.yaml => deployment.katan.yaml} (88%) create mode 100644 deploy/kubernetes/overlays/core/kustomization.yaml create mode 100644 deploy/kubernetes/overlays/llm-katan/kustomization.yaml diff --git a/deploy/kubernetes/README.md b/deploy/kubernetes/README.md index 3c45a37c2..ab225a43b 100644 --- a/deploy/kubernetes/README.md +++ b/deploy/kubernetes/README.md @@ -1,8 +1,9 @@ # Semantic Router Kubernetes Deployment -This directory contains Kubernetes manifests for deploying the Semantic Router using Kustomize. +This directory contains Kubernetes manifests for deploying the Semantic Router using Kustomize. It provides two modes similar to docker-compose profiles: -By default, the base kustomization deploys a Pod with an `llm-katan` sidecar so that the default config (qwen3 on 127.0.0.1:8002) works out-of-the-box. If you prefer to run without the sidecar, replace `deployment.with-llm-katan.yaml` with `deployment.yaml` in `kustomization.yaml`. +- core: only the semantic-router (no llm-katan) +- llm-katan: semantic-router plus an llm-katan sidecar listening on 8002 (served model name `qwen3`) ## Architecture @@ -319,31 +320,42 @@ Edit the `resources` section in `deployment.yaml` accordingly. ### Kubernetes Manifests (`deploy/kubernetes/`) -- `deployment.yaml` - Main application deployment with optimized resource settings -- `deployment.with-llm-katan.yaml` - Optional variant including an llm-katan sidecar listening on 8002 (works with default config pointing to qwen3 at 127.0.0.1:8002) -- `service.yaml` - Services for gRPC, HTTP API, and metrics +- `base/` - Shared resources (Namespace, PVC, Service, ConfigMap) +- `overlays/core/` - Core deployment (no llm-katan) +- `overlays/llm-katan/` - Deployment with llm-katan sidecar +- `deployment.yaml` - Plain deployment (used by core overlay) +- `deployment.katan.yaml` - Sidecar deployment (used by llm-katan overlay) +- `service.yaml` - gRPC, HTTP API, and metrics services - `pvc.yaml` - Persistent volume claim for model storage - `namespace.yaml` - Dedicated namespace for the application -- `config.yaml` - Application configuration +- `config.yaml` - Application configuration (defaults to qwen3 @ 127.0.0.1:8002) - `tools_db.json` - Tools database for semantic routing -- `kustomization.yaml` - Kustomize configuration for easy deployment +- `kustomization.yaml` - Root entry (defaults to core overlay) ### Development Tools -## Optional: run with llm-katan sidecar +## Choose a mode: core or llm-katan -To mimic the docker-compose default setup, you can deploy a variant that runs an `llm-katan` sidecar inside the same Pod. The provided `deployment.with-llm-katan.yaml` exposes llm-katan on `0.0.0.0:8002` and serves the model name `qwen3`. +- Core mode (default root points here): -Notes: + ```bash + kubectl apply -k deploy/kubernetes + # or explicitly + kubectl apply -k deploy/kubernetes/overlays/core + ``` -- Ensure the Qwen model content is available at `/app/models/Qwen/Qwen3-0.6B` in the PVC. You can pre-populate the PV or customize the init container to fetch from an internal source. -- The default Kubernetes `config.yaml` has been aligned to use `qwen3` and endpoint `127.0.0.1:8002`, so it will work out-of-the-box with this sidecar. 
+

-Apply the sidecar variant instead of the default deployment:
+- llm-katan mode:

-```bash
-kubectl apply -n vllm-semantic-router-system -f deploy/kubernetes/deployment.with-llm-katan.yaml
-```
+  ```bash
+  kubectl apply -k deploy/kubernetes/overlays/llm-katan
+  ```

Notes for llm-katan:

- The init container will attempt to download `Qwen/Qwen3-0.6B` into `/app/models/Qwen/Qwen3-0.6B` and the embedding model `sentence-transformers/all-MiniLM-L12-v2` into `/app/models/all-MiniLM-L12-v2`. In restricted networks, these downloads may fail—pre-populate the PV or point the init script to your internal artifact store as needed.
- The default Kubernetes `config.yaml` has been aligned to use `qwen3` and endpoint `127.0.0.1:8002`.

- `tools/kind/kind-config.yaml` - Kind cluster configuration for local development
- `tools/make/kube.mk` - Make targets for Kubernetes operations
diff --git a/deploy/kubernetes/base/kustomization.yaml b/deploy/kubernetes/base/kustomization.yaml
new file mode 100644
index 000000000..90192015c
--- /dev/null
+++ b/deploy/kubernetes/base/kustomization.yaml
@@ -0,0 +1,19 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+  - ../namespace.yaml
+  - ../pvc.yaml
+  - ../service.yaml
+
+configMapGenerator:
+  - name: semantic-router-config
+    files:
+      - ../config.yaml
+      - ../tools_db.json
+
+namespace: vllm-semantic-router-system
+
+images:
+  - name: ghcr.io/vllm-project/semantic-router/extproc
+    newTag: latest
diff --git a/deploy/kubernetes/deployment.with-llm-katan.yaml b/deploy/kubernetes/deployment.katan.yaml
similarity index 88%
rename from deploy/kubernetes/deployment.with-llm-katan.yaml
rename to deploy/kubernetes/deployment.katan.yaml
index acf45e1e4..3aa74d5b5 100644
--- a/deploy/kubernetes/deployment.with-llm-katan.yaml
+++ b/deploy/kubernetes/deployment.katan.yaml
@@ -63,11 +63,23 @@
           echo "PII token classifier model already exists, skipping..."
           fi

+          # Download embedding model all-MiniLM-L12-v2
+          if [ ! -d "all-MiniLM-L12-v2" ]; then
+            echo "Downloading all-MiniLM-L12-v2 embedding model..."
+            huggingface-cli download sentence-transformers/all-MiniLM-L12-v2 --local-dir all-MiniLM-L12-v2
+          else
+            echo "all-MiniLM-L12-v2 already exists, skipping..."
+          fi
+
           # Optional: Prepare Qwen model directory for llm-katan sidecar
           # NOTE: Provide the model content under /app/models/Qwen/Qwen3-0.6B via pre-populated PV
           # or customize the following block to fetch from your internal artifact store.
           if [ ! -d "Qwen/Qwen3-0.6B" ]; then
-            echo "Qwen3-0.6B directory not found. Please pre-populate /app/models/Qwen/Qwen3-0.6B in the PVC or customize init script to download it."
+            echo "Downloading Qwen/Qwen3-0.6B for llm-katan..."
+            mkdir -p Qwen
+            huggingface-cli download Qwen/Qwen3-0.6B --local-dir Qwen/Qwen3-0.6B || echo "Warning: Qwen3-0.6B download failed; ensure offline pre-population if needed."
+          else
+            echo "Qwen/Qwen3-0.6B already exists, skipping..."
           fi

           echo "Model directory listing:" && ls -la /app/models/
diff --git a/deploy/kubernetes/deployment.yaml b/deploy/kubernetes/deployment.yaml
index ab7000f9a..560b9850a 100644
--- a/deploy/kubernetes/deployment.yaml
+++ b/deploy/kubernetes/deployment.yaml
@@ -16,121 +16,130 @@ spec:
         app: semantic-router
     spec:
       initContainers:
-    - name: model-downloader
-      image: python:3.11-slim
-      securityContext:
-        runAsNonRoot: false
-        allowPrivilegeEscalation: false
-      command: ["/bin/bash", "-c"]
-      args:
-        - |
-          set -e
-          echo "Installing Hugging Face CLI..."
- pip install --no-cache-dir huggingface_hub[cli] + - name: model-downloader + image: python:3.11-slim + securityContext: + runAsNonRoot: false + allowPrivilegeEscalation: false + command: ["/bin/bash", "-c"] + args: + - | + set -e + echo "Installing Hugging Face CLI..." + pip install --no-cache-dir huggingface_hub[cli] - echo "Downloading models to persistent volume..." - cd /app/models + echo "Downloading models to persistent volume..." + cd /app/models - # Download category classifier model - if [ ! -d "category_classifier_modernbert-base_model" ]; then - echo "Downloading category classifier model..." - huggingface-cli download LLM-Semantic-Router/category_classifier_modernbert-base_model --local-dir category_classifier_modernbert-base_model - else - echo "Category classifier model already exists, skipping..." - fi + # Download category classifier model + if [ ! -d "category_classifier_modernbert-base_model" ]; then + echo "Downloading category classifier model..." + huggingface-cli download LLM-Semantic-Router/category_classifier_modernbert-base_model --local-dir category_classifier_modernbert-base_model + else + echo "Category classifier model already exists, skipping..." + fi - # Download PII classifier model - if [ ! -d "pii_classifier_modernbert-base_model" ]; then - echo "Downloading PII classifier model..." - huggingface-cli download LLM-Semantic-Router/pii_classifier_modernbert-base_model --local-dir pii_classifier_modernbert-base_model - else - echo "PII classifier model already exists, skipping..." - fi + # Download PII classifier model + if [ ! -d "pii_classifier_modernbert-base_model" ]; then + echo "Downloading PII classifier model..." + huggingface-cli download LLM-Semantic-Router/pii_classifier_modernbert-base_model --local-dir pii_classifier_modernbert-base_model + else + echo "PII classifier model already exists, skipping..." + fi - # Download jailbreak classifier model - if [ ! -d "jailbreak_classifier_modernbert-base_model" ]; then - echo "Downloading jailbreak classifier model..." - huggingface-cli download LLM-Semantic-Router/jailbreak_classifier_modernbert-base_model --local-dir jailbreak_classifier_modernbert-base_model - else - echo "Jailbreak classifier model already exists, skipping..." - fi + # Download jailbreak classifier model + if [ ! -d "jailbreak_classifier_modernbert-base_model" ]; then + echo "Downloading jailbreak classifier model..." + huggingface-cli download LLM-Semantic-Router/jailbreak_classifier_modernbert-base_model --local-dir jailbreak_classifier_modernbert-base_model + else + echo "Jailbreak classifier model already exists, skipping..." + fi - # Download PII token classifier model - if [ ! -d "pii_classifier_modernbert-base_presidio_token_model" ]; then - echo "Downloading PII token classifier model..." - huggingface-cli download LLM-Semantic-Router/pii_classifier_modernbert-base_presidio_token_model --local-dir pii_classifier_modernbert-base_presidio_token_model - else - echo "PII token classifier model already exists, skipping..." - fi + # Download PII token classifier model + if [ ! -d "pii_classifier_modernbert-base_presidio_token_model" ]; then + echo "Downloading PII token classifier model..." + huggingface-cli download LLM-Semantic-Router/pii_classifier_modernbert-base_presidio_token_model --local-dir pii_classifier_modernbert-base_presidio_token_model + else + echo "PII token classifier model already exists, skipping..." + fi - echo "All models downloaded successfully!" 
- ls -la /app/models/ - env: - - name: HF_HUB_CACHE - value: /tmp/hf_cache - # Reduced resource requirements for init container - resources: - requests: - memory: "512Mi" - cpu: "250m" - limits: - memory: "1Gi" - cpu: "500m" - volumeMounts: - - name: models-volume - mountPath: /app/models + # Download embedding model all-MiniLM-L12-v2 + if [ ! -d "all-MiniLM-L12-v2" ]; then + echo "Downloading all-MiniLM-L12-v2 embedding model..." + huggingface-cli download sentence-transformers/all-MiniLM-L12-v2 --local-dir all-MiniLM-L12-v2 + else + echo "all-MiniLM-L12-v2 already exists, skipping..." + fi + + + echo "Model setup complete." + ls -la /app/models/ + env: + - name: HF_HUB_CACHE + value: /tmp/hf_cache + # Reduced resource requirements for init container + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "1Gi" + cpu: "500m" + volumeMounts: + - name: models-volume + mountPath: /app/models containers: - - name: semantic-router - image: ghcr.io/vllm-project/semantic-router/extproc:latest - args: ["--secure=true"] - securityContext: - runAsNonRoot: false - allowPrivilegeEscalation: false - ports: - - containerPort: 50051 - name: grpc - protocol: TCP - - containerPort: 9190 - name: metrics - protocol: TCP - - containerPort: 8080 - name: classify-api - protocol: TCP - env: - - name: LD_LIBRARY_PATH - value: "/app/lib" - volumeMounts: + - name: semantic-router + image: ghcr.io/vllm-project/semantic-router/extproc:latest + args: ["--secure=true"] + securityContext: + runAsNonRoot: false + allowPrivilegeEscalation: false + ports: + - containerPort: 50051 + name: grpc + protocol: TCP + - containerPort: 9190 + name: metrics + protocol: TCP + - containerPort: 8080 + name: classify-api + protocol: TCP + env: + - name: LD_LIBRARY_PATH + value: "/app/lib" + volumeMounts: + - name: config-volume + mountPath: /app/config + readOnly: true + - name: models-volume + mountPath: /app/models + livenessProbe: + tcpSocket: + port: 50051 + initialDelaySeconds: 60 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 3 + readinessProbe: + tcpSocket: + port: 50051 + initialDelaySeconds: 90 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 3 + # Significantly reduced resource requirements for kind cluster + resources: + requests: + memory: "3Gi" # Reduced from 8Gi + cpu: "1" # Reduced from 2 + limits: + memory: "6Gi" # Reduced from 12Gi + cpu: "2" # Reduced from 4 + volumes: - name: config-volume - mountPath: /app/config - readOnly: true + configMap: + name: semantic-router-config - name: models-volume - mountPath: /app/models - livenessProbe: - tcpSocket: - port: 50051 - initialDelaySeconds: 60 - periodSeconds: 30 - timeoutSeconds: 10 - failureThreshold: 3 - readinessProbe: - tcpSocket: - port: 50051 - initialDelaySeconds: 90 - periodSeconds: 30 - timeoutSeconds: 10 - failureThreshold: 3 - # Significantly reduced resource requirements for kind cluster - resources: - requests: - memory: "3Gi" # Reduced from 8Gi - cpu: "1" # Reduced from 2 - limits: - memory: "6Gi" # Reduced from 12Gi - cpu: "2" # Reduced from 4 - volumes: - - name: config-volume - configMap: - name: semantic-router-config - - name: models-volume - persistentVolumeClaim: - claimName: semantic-router-models + persistentVolumeClaim: + claimName: semantic-router-models diff --git a/deploy/kubernetes/kustomization.yaml b/deploy/kubernetes/kustomization.yaml index f33d8c547..65b2ccae5 100644 --- a/deploy/kubernetes/kustomization.yaml +++ b/deploy/kubernetes/kustomization.yaml @@ -1,25 +1,6 @@ apiVersion: 
kustomize.config.k8s.io/v1beta1 kind: Kustomization -metadata: - name: semantic-router - +# This root points to the 'core' overlay by default for clarity. resources: - - namespace.yaml - - pvc.yaml - - deployment.with-llm-katan.yaml - - service.yaml - -# Generate ConfigMap -configMapGenerator: - - name: semantic-router-config - files: - - config.yaml - - tools_db.json - -# Namespace for all resources -namespace: vllm-semantic-router-system - -images: - - name: ghcr.io/vllm-project/semantic-router/extproc - newTag: latest + - overlays/core diff --git a/deploy/kubernetes/overlays/core/kustomization.yaml b/deploy/kubernetes/overlays/core/kustomization.yaml new file mode 100644 index 000000000..59d6cf23d --- /dev/null +++ b/deploy/kubernetes/overlays/core/kustomization.yaml @@ -0,0 +1,6 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + - ../../deployment.yaml diff --git a/deploy/kubernetes/overlays/llm-katan/kustomization.yaml b/deploy/kubernetes/overlays/llm-katan/kustomization.yaml new file mode 100644 index 000000000..a20ca3707 --- /dev/null +++ b/deploy/kubernetes/overlays/llm-katan/kustomization.yaml @@ -0,0 +1,6 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../base + - ../../deployment.katan.yaml From a7191cf0f7d72a0214371039dc732ba4c485026e Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 17 Oct 2025 22:32:33 +0800 Subject: [PATCH 4/8] update k8s install docs Signed-off-by: JaredforReal --- website/docs/installation/kubernetes.md | 28 +++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/website/docs/installation/kubernetes.md b/website/docs/installation/kubernetes.md index 792ff935b..f12ed6de9 100644 --- a/website/docs/installation/kubernetes.md +++ b/website/docs/installation/kubernetes.md @@ -35,7 +35,12 @@ kubectl wait --for=condition=Ready nodes --all --timeout=300s ## Step 2: Deploy vLLM Semantic Router -Configure the semantic router by editing `deploy/kubernetes/config.yaml`. This file contains the vLLM configuration, including model config, endpoints, and policies. +Configure the semantic router by editing `deploy/kubernetes/config.yaml`. This file contains the vLLM configuration, including model config, endpoints, and policies. The repository provides two Kustomize overlays similar to docker-compose profiles: + +- core (default): only the semantic-router + - Path: `deploy/kubernetes/overlays/core` (root `deploy/kubernetes/` points here by default) +- llm-katan: semantic-router + an llm-katan sidecar listening on 8002 and serving model name `qwen3` + - Path: `deploy/kubernetes/overlays/llm-katan` Important notes before you apply manifests: @@ -43,11 +48,15 @@ Important notes before you apply manifests: - The PVC in `deploy/kubernetes/pvc.yaml` uses `storageClassName: standard`. On some clouds or local clusters, the default StorageClass name may differ (e.g., `standard-rwo`, `gp2`, or a provisioner like local-path). Adjust as needed. - Default PVC size is 30Gi. Size it to at least 2–3x of your total model footprint to leave room for indexes and updates. - The initContainer downloads several models from Hugging Face on first run and writes them into the PVC. Ensure outbound egress to Hugging Face is allowed and there is at least ~6–8 GiB free space for the models specified. +- Per mode, the init container downloads differ: + - core: classifiers + the embedding model `sentence-transformers/all-MiniLM-L12-v2` into `/app/models/all-MiniLM-L12-v2`. 
+ - llm-katan: everything in core, plus `Qwen/Qwen3-0.6B` into `/app/models/Qwen/Qwen3-0.6B`. +- The default `config.yaml` points to `qwen3` at `127.0.0.1:8002`, which matches the llm-katan overlay. If you use core (no sidecar), either change `vllm_endpoints` to your actual backend Service IP:Port, or deploy the llm-katan overlay. -Deploy the semantic router service with all required components: +Deploy the semantic router service with all required components (core mode by default): -```bash -# Deploy semantic router using Kustomize +````bash +# Deploy semantic router (core mode) kubectl apply -k deploy/kubernetes/ # Wait for deployment to be ready (this may take several minutes for model downloads) @@ -55,7 +64,14 @@ kubectl wait --for=condition=Available deployment/semantic-router -n vllm-semant # Verify deployment status kubectl get pods -n vllm-semantic-router-system -``` + +To run with the llm-katan overlay instead: + +```bash +kubectl apply -k deploy/kubernetes/overlays/llm-katan +```` + +```` ## Step 3: Install Envoy Gateway @@ -70,7 +86,7 @@ helm upgrade -i eg oci://docker.io/envoyproxy/gateway-helm \ # Wait for Envoy Gateway to be ready kubectl wait --timeout=300s -n envoy-gateway-system deployment/envoy-gateway --for=condition=Available -``` +```` ## Step 4: Install Envoy AI Gateway From a245d5cde6fe862cf0207d005ab6eebf0c191553 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 17 Oct 2025 23:02:25 +0800 Subject: [PATCH 5/8] try fix CI error Signed-off-by: JaredforReal --- deploy/kubernetes/base/config.yaml | 169 ++++++++++++++++++++++ deploy/kubernetes/base/kustomization.yaml | 10 +- deploy/kubernetes/base/namespace.yaml | 4 + deploy/kubernetes/base/pvc.yaml | 13 ++ deploy/kubernetes/base/service.yaml | 38 +++++ deploy/kubernetes/base/tools_db.json | 142 ++++++++++++++++++ 6 files changed, 371 insertions(+), 5 deletions(-) create mode 100644 deploy/kubernetes/base/config.yaml create mode 100644 deploy/kubernetes/base/namespace.yaml create mode 100644 deploy/kubernetes/base/pvc.yaml create mode 100644 deploy/kubernetes/base/service.yaml create mode 100644 deploy/kubernetes/base/tools_db.json diff --git a/deploy/kubernetes/base/config.yaml b/deploy/kubernetes/base/config.yaml new file mode 100644 index 000000000..5f5159a3e --- /dev/null +++ b/deploy/kubernetes/base/config.yaml @@ -0,0 +1,169 @@ +bert_model: + model_id: models/all-MiniLM-L12-v2 + threshold: 0.6 + use_cpu: true + +semantic_cache: + enabled: true + backend_type: "memory" # Options: "memory" or "milvus" + similarity_threshold: 0.8 + max_entries: 1000 # Only applies to memory backend + ttl_seconds: 3600 + eviction_policy: "fifo" + +tools: + enabled: true + top_k: 3 + similarity_threshold: 0.2 + tools_db_path: "config/tools_db.json" + fallback_to_empty: true + +prompt_guard: + enabled: true + use_modernbert: true + model_id: "models/jailbreak_classifier_modernbert-base_model" + threshold: 0.7 + use_cpu: true + jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" + +# vLLM Endpoints Configuration +# IMPORTANT: 'address' field must be a valid IP address (IPv4 or IPv6) +# Supported formats: 127.0.0.1, 192.168.1.1, ::1, 2001:db8::1 +# NOT supported: domain names (example.com), protocol prefixes (http://), paths (/api), ports in address (use 'port' field) +vllm_endpoints: + - name: "endpoint1" + address: "127.0.0.1" # llm-katan sidecar or local backend + port: 8002 + weight: 1 + +model_config: + "qwen3": + reasoning_family: "qwen3" # Match docker-compose default model 
name + preferred_endpoints: ["endpoint1"] + pii_policy: + allow_by_default: true + +# Classifier configuration +classifier: + category_model: + model_id: "models/category_classifier_modernbert-base_model" + use_modernbert: true + threshold: 0.6 + use_cpu: true + category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json" + pii_model: + model_id: "models/pii_classifier_modernbert-base_presidio_token_model" + use_modernbert: true + threshold: 0.7 + use_cpu: true + pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json" + +# Categories with new use_reasoning field structure +categories: + - name: business + model_scores: + - model: qwen3 + score: 0.7 + use_reasoning: false # Business performs better without reasoning + - name: law + model_scores: + - model: qwen3 + score: 0.4 + use_reasoning: false + - name: psychology + model_scores: + - model: qwen3 + score: 0.6 + use_reasoning: false + - name: biology + model_scores: + - model: qwen3 + score: 0.9 + use_reasoning: false + - name: chemistry + model_scores: + - model: qwen3 + score: 0.6 + use_reasoning: true # Enable reasoning for complex chemistry + - name: history + model_scores: + - model: qwen3 + score: 0.7 + use_reasoning: false + - name: other + model_scores: + - model: qwen3 + score: 0.7 + use_reasoning: false + - name: health + model_scores: + - model: qwen3 + score: 0.5 + use_reasoning: false + - name: economics + model_scores: + - model: qwen3 + score: 1.0 + use_reasoning: false + - name: math + model_scores: + - model: qwen3 + score: 1.0 + use_reasoning: true # Enable reasoning for complex math + - name: physics + model_scores: + - model: qwen3 + score: 0.7 + use_reasoning: true # Enable reasoning for physics + - name: computer science + model_scores: + - model: qwen3 + score: 0.6 + use_reasoning: false + - name: philosophy + model_scores: + - model: qwen3 + score: 0.5 + use_reasoning: false + - name: engineering + model_scores: + - model: qwen3 + score: 0.7 + use_reasoning: false + +default_model: qwen3 + +# Reasoning family configurations +reasoning_families: + deepseek: + type: "chat_template_kwargs" + parameter: "thinking" + + qwen3: + type: "chat_template_kwargs" + parameter: "enable_thinking" + + gpt-oss: + type: "reasoning_effort" + parameter: "reasoning_effort" + gpt: + type: "reasoning_effort" + parameter: "reasoning_effort" + +# Global default reasoning effort level +default_reasoning_effort: high + +# API Configuration +api: + batch_classification: + max_batch_size: 100 + concurrency_threshold: 5 + max_concurrency: 8 + metrics: + enabled: true + detailed_goroutine_tracking: true + high_resolution_timing: false + sample_rate: 1.0 + duration_buckets: + [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30] + size_buckets: [1, 2, 5, 10, 20, 50, 100, 200] diff --git a/deploy/kubernetes/base/kustomization.yaml b/deploy/kubernetes/base/kustomization.yaml index 90192015c..df0b58882 100644 --- a/deploy/kubernetes/base/kustomization.yaml +++ b/deploy/kubernetes/base/kustomization.yaml @@ -2,15 +2,15 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - ../namespace.yaml - - ../pvc.yaml - - ../service.yaml + - ./namespace.yaml + - ./pvc.yaml + - ./service.yaml configMapGenerator: - name: semantic-router-config files: - - ../config.yaml - - ../tools_db.json + - ./config.yaml + - ./tools_db.json namespace: vllm-semantic-router-system diff --git a/deploy/kubernetes/base/namespace.yaml 
b/deploy/kubernetes/base/namespace.yaml new file mode 100644 index 000000000..0bdc316f5 --- /dev/null +++ b/deploy/kubernetes/base/namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: vllm-semantic-router-system diff --git a/deploy/kubernetes/base/pvc.yaml b/deploy/kubernetes/base/pvc.yaml new file mode 100644 index 000000000..43b66eb95 --- /dev/null +++ b/deploy/kubernetes/base/pvc.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: semantic-router-models + labels: + app: semantic-router +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 30Gi + storageClassName: standard diff --git a/deploy/kubernetes/base/service.yaml b/deploy/kubernetes/base/service.yaml new file mode 100644 index 000000000..5d2ed1b61 --- /dev/null +++ b/deploy/kubernetes/base/service.yaml @@ -0,0 +1,38 @@ +apiVersion: v1 +kind: Service +metadata: + name: semantic-router + namespace: vllm-semantic-router-system + labels: + app: semantic-router +spec: + type: ClusterIP + ports: + - port: 50051 + targetPort: grpc + protocol: TCP + name: grpc + - port: 8080 + targetPort: 8080 + protocol: TCP + name: classify-api + selector: + app: semantic-router +--- +apiVersion: v1 +kind: Service +metadata: + name: semantic-router-metrics + namespace: vllm-semantic-router-system + labels: + app: semantic-router + service: metrics +spec: + type: ClusterIP + ports: + - port: 9190 + targetPort: metrics + protocol: TCP + name: metrics + selector: + app: semantic-router diff --git a/deploy/kubernetes/base/tools_db.json b/deploy/kubernetes/base/tools_db.json new file mode 100644 index 000000000..4f62f26e7 --- /dev/null +++ b/deploy/kubernetes/base/tools_db.json @@ -0,0 +1,142 @@ +[ + { + "tool": { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current weather information for a location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA" + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "Temperature unit" + } + }, + "required": ["location"] + } + } + }, + "description": "Get current weather information, temperature, conditions, forecast for any location, city, or place. 
Check weather today, now, current conditions, temperature, rain, sun, cloudy, hot, cold, storm, snow", + "category": "weather", + "tags": ["weather", "temperature", "forecast", "climate"] + }, + { + "tool": { + "type": "function", + "function": { + "name": "search_web", + "description": "Search the web for information", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "The search query" + }, + "num_results": { + "type": "integer", + "description": "Number of results to return", + "default": 5 + } + }, + "required": ["query"] + } + } + }, + "description": "Search the internet, web search, find information online, browse web content, lookup, research, google, find answers, discover, investigate", + "category": "search", + "tags": ["search", "web", "internet", "information", "browse"] + }, + { + "tool": { + "type": "function", + "function": { + "name": "calculate", + "description": "Perform mathematical calculations", + "parameters": { + "type": "object", + "properties": { + "expression": { + "type": "string", + "description": "Mathematical expression to evaluate" + } + }, + "required": ["expression"] + } + } + }, + "description": "Calculate mathematical expressions, solve math problems, arithmetic operations, compute numbers, addition, subtraction, multiplication, division, equations, formula", + "category": "math", + "tags": ["math", "calculation", "arithmetic", "compute", "numbers"] + }, + { + "tool": { + "type": "function", + "function": { + "name": "send_email", + "description": "Send an email message", + "parameters": { + "type": "object", + "properties": { + "to": { + "type": "string", + "description": "Recipient email address" + }, + "subject": { + "type": "string", + "description": "Email subject" + }, + "body": { + "type": "string", + "description": "Email body content" + } + }, + "required": ["to", "subject", "body"] + } + } + }, + "description": "Send email messages, email communication, contact people via email, mail, message, correspondence, notify, inform", + "category": "communication", + "tags": ["email", "send", "communication", "message", "contact"] + }, + { + "tool": { + "type": "function", + "function": { + "name": "create_calendar_event", + "description": "Create a new calendar event or appointment", + "parameters": { + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "Event title" + }, + "date": { + "type": "string", + "description": "Event date in YYYY-MM-DD format" + }, + "time": { + "type": "string", + "description": "Event time in HH:MM format" + }, + "duration": { + "type": "integer", + "description": "Duration in minutes" + } + }, + "required": ["title", "date", "time"] + } + } + }, + "description": "Schedule meetings, create calendar events, set appointments, manage calendar, book time, plan meeting, organize schedule, reminder, agenda", + "category": "productivity", + "tags": ["calendar", "event", "meeting", "appointment", "schedule"] + } +] From f9d1346d982284e468b6bf75a2a54ae5561b0008 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Mon, 20 Oct 2025 14:16:36 +0800 Subject: [PATCH 6/8] add init models fall back Signed-off-by: JaredforReal --- deploy/kubernetes/{ => base}/deployment.yaml | 11 +- deploy/kubernetes/base/kustomization.yaml | 5 +- deploy/kubernetes/base/pv.example.yaml | 16 ++ deploy/kubernetes/deployment.katan.yaml | 181 ------------------ .../overlays/core/kustomization.yaml | 1 - .../overlays/llm-katan/kustomization.yaml | 7 +- 
.../overlays/llm-katan/patch-llm-katan.yaml | 30 +++ website/docs/installation/kubernetes.md | 36 +++- website/docs/troubleshooting/network-tips.md | 87 ++++++++- 9 files changed, 171 insertions(+), 203 deletions(-) rename deploy/kubernetes/{ => base}/deployment.yaml (94%) create mode 100644 deploy/kubernetes/base/pv.example.yaml delete mode 100644 deploy/kubernetes/deployment.katan.yaml create mode 100644 deploy/kubernetes/overlays/llm-katan/patch-llm-katan.yaml diff --git a/deploy/kubernetes/deployment.yaml b/deploy/kubernetes/base/deployment.yaml similarity index 94% rename from deploy/kubernetes/deployment.yaml rename to deploy/kubernetes/base/deployment.yaml index 560b9850a..5baecc953 100644 --- a/deploy/kubernetes/deployment.yaml +++ b/deploy/kubernetes/base/deployment.yaml @@ -77,7 +77,6 @@ spec: env: - name: HF_HUB_CACHE value: /tmp/hf_cache - # Reduced resource requirements for init container resources: requests: memory: "512Mi" @@ -91,6 +90,7 @@ spec: containers: - name: semantic-router image: ghcr.io/vllm-project/semantic-router/extproc:latest + imagePullPolicy: IfNotPresent args: ["--secure=true"] securityContext: runAsNonRoot: false @@ -128,14 +128,13 @@ spec: periodSeconds: 30 timeoutSeconds: 10 failureThreshold: 3 - # Significantly reduced resource requirements for kind cluster resources: requests: - memory: "3Gi" # Reduced from 8Gi - cpu: "1" # Reduced from 2 + memory: "3Gi" + cpu: "1" limits: - memory: "6Gi" # Reduced from 12Gi - cpu: "2" # Reduced from 4 + memory: "6Gi" + cpu: "2" volumes: - name: config-volume configMap: diff --git a/deploy/kubernetes/base/kustomization.yaml b/deploy/kubernetes/base/kustomization.yaml index df0b58882..07cfed2c6 100644 --- a/deploy/kubernetes/base/kustomization.yaml +++ b/deploy/kubernetes/base/kustomization.yaml @@ -5,6 +5,7 @@ resources: - ./namespace.yaml - ./pvc.yaml - ./service.yaml + - ./deployment.yaml configMapGenerator: - name: semantic-router-config @@ -13,7 +14,3 @@ configMapGenerator: - ./tools_db.json namespace: vllm-semantic-router-system - -images: - - name: ghcr.io/vllm-project/semantic-router/extproc - newTag: latest diff --git a/deploy/kubernetes/base/pv.example.yaml b/deploy/kubernetes/base/pv.example.yaml new file mode 100644 index 000000000..7ea00f491 --- /dev/null +++ b/deploy/kubernetes/base/pv.example.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: semantic-router-models-pv + labels: + app: semantic-router +spec: + capacity: + storage: 50Gi + accessModes: + - ReadWriteOnce + storageClassName: standard + persistentVolumeReclaimPolicy: Retain + hostPath: + path: /tmp/hostpath-provisioner/models + type: DirectoryOrCreate diff --git a/deploy/kubernetes/deployment.katan.yaml b/deploy/kubernetes/deployment.katan.yaml deleted file mode 100644 index 3aa74d5b5..000000000 --- a/deploy/kubernetes/deployment.katan.yaml +++ /dev/null @@ -1,181 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: semantic-router - namespace: vllm-semantic-router-system - labels: - app: semantic-router -spec: - replicas: 1 - selector: - matchLabels: - app: semantic-router - template: - metadata: - labels: - app: semantic-router - spec: - initContainers: - - name: model-downloader - image: python:3.11-slim - securityContext: - runAsNonRoot: false - allowPrivilegeEscalation: false - command: ["/bin/bash", "-c"] - args: - - | - set -e - echo "Installing Hugging Face CLI..." - pip install --no-cache-dir huggingface_hub[cli] - - echo "Downloading classifier models to persistent volume..." 
- cd /app/models - - # Download category classifier model - if [ ! -d "category_classifier_modernbert-base_model" ]; then - echo "Downloading category classifier model..." - huggingface-cli download LLM-Semantic-Router/category_classifier_modernbert-base_model --local-dir category_classifier_modernbert-base_model - else - echo "Category classifier model already exists, skipping..." - fi - - # Download PII classifier model - if [ ! -d "pii_classifier_modernbert-base_model" ]; then - echo "Downloading PII classifier model..." - huggingface-cli download LLM-Semantic-Router/pii_classifier_modernbert-base_model --local-dir pii_classifier_modernbert-base_model - else - echo "PII classifier model already exists, skipping..." - fi - - # Download jailbreak classifier model - if [ ! -d "jailbreak_classifier_modernbert-base_model" ]; then - echo "Downloading jailbreak classifier model..." - huggingface-cli download LLM-Semantic-Router/jailbreak_classifier_modernbert-base_model --local-dir jailbreak_classifier_modernbert-base_model - else - echo "Jailbreak classifier model already exists, skipping..." - fi - - # Download PII token classifier model - if [ ! -d "pii_classifier_modernbert-base_presidio_token_model" ]; then - echo "Downloading PII token classifier model..." - huggingface-cli download LLM-Semantic-Router/pii_classifier_modernbert-base_presidio_token_model --local-dir pii_classifier_modernbert-base_presidio_token_model - else - echo "PII token classifier model already exists, skipping..." - fi - - # Download embedding model all-MiniLM-L12-v2 - if [ ! -d "all-MiniLM-L12-v2" ]; then - echo "Downloading all-MiniLM-L12-v2 embedding model..." - huggingface-cli download sentence-transformers/all-MiniLM-L12-v2 --local-dir all-MiniLM-L12-v2 - else - echo "all-MiniLM-L12-v2 already exists, skipping..." - fi - - # Optional: Prepare Qwen model directory for llm-katan sidecar - # NOTE: Provide the model content under /app/models/Qwen/Qwen3-0.6B via pre-populated PV - # or customize the following block to fetch from your internal artifact store. - if [ ! -d "Qwen/Qwen3-0.6B" ]; then - echo "Downloading Qwen/Qwen3-0.6B for llm-katan..." - mkdir -p Qwen - huggingface-cli download Qwen/Qwen3-0.6B --local-dir Qwen/Qwen3-0.6B || echo "Warning: Qwen3-0.6B download failed; ensure offline pre-population if needed." - else - echo "Qwen/Qwen3-0.6B already exists, skipping..." 
- fi - - echo "Model directory listing:" && ls -la /app/models/ - env: - - name: HF_HUB_CACHE - value: /tmp/hf_cache - resources: - requests: - memory: "512Mi" - cpu: "250m" - limits: - memory: "1Gi" - cpu: "500m" - volumeMounts: - - name: models-volume - mountPath: /app/models - containers: - - name: semantic-router - image: ghcr.io/vllm-project/semantic-router/extproc:latest - args: ["--secure=true"] - securityContext: - runAsNonRoot: false - allowPrivilegeEscalation: false - ports: - - containerPort: 50051 - name: grpc - protocol: TCP - - containerPort: 9190 - name: metrics - protocol: TCP - - containerPort: 8080 - name: classify-api - protocol: TCP - env: - - name: LD_LIBRARY_PATH - value: "/app/lib" - volumeMounts: - - name: config-volume - mountPath: /app/config - readOnly: true - - name: models-volume - mountPath: /app/models - livenessProbe: - tcpSocket: - port: 50051 - initialDelaySeconds: 60 - periodSeconds: 30 - timeoutSeconds: 10 - failureThreshold: 3 - readinessProbe: - tcpSocket: - port: 50051 - initialDelaySeconds: 90 - periodSeconds: 30 - timeoutSeconds: 10 - failureThreshold: 3 - resources: - requests: - memory: "3Gi" - cpu: "1" - limits: - memory: "6Gi" - cpu: "2" - - name: llm-katan - image: ghcr.io/vllm-project/semantic-router/llm-katan:latest - imagePullPolicy: IfNotPresent - args: - [ - "llm-katan", - "--model", - "/app/models/Qwen/Qwen3-0.6B", - "--served-model-name", - "qwen3", - "--host", - "0.0.0.0", - "--port", - "8002", - ] - ports: - - containerPort: 8002 - name: katan - protocol: TCP - volumeMounts: - - name: models-volume - mountPath: /app/models - resources: - requests: - memory: "1Gi" - cpu: "500m" - limits: - memory: "2Gi" - cpu: "1" - volumes: - - name: config-volume - configMap: - name: semantic-router-config - - name: models-volume - persistentVolumeClaim: - claimName: semantic-router-models diff --git a/deploy/kubernetes/overlays/core/kustomization.yaml b/deploy/kubernetes/overlays/core/kustomization.yaml index 59d6cf23d..774a422d0 100644 --- a/deploy/kubernetes/overlays/core/kustomization.yaml +++ b/deploy/kubernetes/overlays/core/kustomization.yaml @@ -3,4 +3,3 @@ kind: Kustomization resources: - ../../base - - ../../deployment.yaml diff --git a/deploy/kubernetes/overlays/llm-katan/kustomization.yaml b/deploy/kubernetes/overlays/llm-katan/kustomization.yaml index a20ca3707..dacb15f6b 100644 --- a/deploy/kubernetes/overlays/llm-katan/kustomization.yaml +++ b/deploy/kubernetes/overlays/llm-katan/kustomization.yaml @@ -3,4 +3,9 @@ kind: Kustomization resources: - ../../base - - ../../deployment.katan.yaml + +patches: + - target: + kind: Deployment + name: semantic-router + path: patch-llm-katan.yaml diff --git a/deploy/kubernetes/overlays/llm-katan/patch-llm-katan.yaml b/deploy/kubernetes/overlays/llm-katan/patch-llm-katan.yaml new file mode 100644 index 000000000..6d149109f --- /dev/null +++ b/deploy/kubernetes/overlays/llm-katan/patch-llm-katan.yaml @@ -0,0 +1,30 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: semantic-router +spec: + template: + spec: + containers: + - name: semantic-router + imagePullPolicy: IfNotPresent + - name: llm-katan + image: ghcr.io/vllm-project/semantic-router/llm-katan:latest + imagePullPolicy: IfNotPresent + args: + - llm-katan + - --model + - /app/models/Qwen/Qwen3-0.6B + - --served-model-name + - qwen3 + - --host + - 0.0.0.0 + - --port + - "8002" + ports: + - containerPort: 8002 + name: katan + protocol: TCP + volumeMounts: + - name: models-volume + mountPath: /app/models diff --git 
a/website/docs/installation/kubernetes.md b/website/docs/installation/kubernetes.md index f12ed6de9..c91f739cc 100644 --- a/website/docs/installation/kubernetes.md +++ b/website/docs/installation/kubernetes.md @@ -42,6 +42,38 @@ Configure the semantic router by editing `deploy/kubernetes/config.yaml`. This f - llm-katan: semantic-router + an llm-katan sidecar listening on 8002 and serving model name `qwen3` - Path: `deploy/kubernetes/overlays/llm-katan` +### Repository layout (deploy/kubernetes/) + +``` +deploy/kubernetes/ + base/ + kustomization.yaml # base kustomize: namespace, PVC, service, deployment + namespace.yaml # Namespace for all resources + pvc.yaml # PVC for models (storageClass and size adjustable) + service.yaml # Service exposing gRPC/metrics/HTTP ports + deployment.yaml # Semantic Router Deployment (init downloads by default) + config.yaml # Router config (mounted via ConfigMap) + tools_db.json # Tools DB (mounted via ConfigMap) + pv.example.yaml # OPTIONAL: hostPath PV example for local models + overlays/ + core/ + kustomization.yaml # Uses only base + llm-katan/ + kustomization.yaml # Patches base to add llm-katan sidecar + patch-llm-katan.yaml # Strategic-merge patch injecting sidecar + kustomization.yaml # Root points to overlays/core by default + README.md # Additional notes + namespace.yaml, pvc.yaml, service.yaml (top-level shortcuts kept for backward compat) +``` + +Notes: + +- The base deployment includes an initContainer that downloads required models on first run. +- If your cluster has limited egress, prefer mounting local models via a PV/PVC and skip downloads: + - Copy `base/pv.example.yaml` to `base/pv.yaml`, apply it, and ensure `base/pvc.yaml` is bound to that PV. + - Mount point remains `/app/models` in the pod. + - See “Network Tips” for details on hostPath PV, image mirrors, and preloading images. + Important notes before you apply manifests: - `vllm_endpoints.address` must be an IP address (not hostname) reachable from inside the cluster. If your LLM backends run as K8s Services, use the ClusterIP (for example `10.96.0.10`) and set `port` accordingly. Do not include protocol or path. @@ -69,9 +101,9 @@ To run with the llm-katan overlay instead: ```bash kubectl apply -k deploy/kubernetes/overlays/llm-katan -```` +``` -```` +Note: The llm-katan overlay no longer references parent files directly. It uses a local patch (`deploy/kubernetes/overlays/llm-katan/patch-llm-katan.yaml`) to inject the sidecar, avoiding kustomize parent-directory restrictions. ## Step 3: Install Envoy Gateway diff --git a/website/docs/troubleshooting/network-tips.md b/website/docs/troubleshooting/network-tips.md index c4a29bcef..fafcc7855 100644 --- a/website/docs/troubleshooting/network-tips.md +++ b/website/docs/troubleshooting/network-tips.md @@ -26,12 +26,12 @@ The router will download embedding models on first run unless you provide them l ### Option A — Use local models (no external network) -1) Download the required model(s) with any reachable method (VPN/offline) into the repo’s `./models` folder. Example layout: +1. Download the required model(s) with any reachable method (VPN/offline) into the repo’s `./models` folder. Example layout: - `models/all-MiniLM-L12-v2/` - `models/category_classifier_modernbert-base_model` -2) In `config/config.yaml`, point to the local path. Example: +2. In `config/config.yaml`, point to the local path. 
Example: ```yaml bert_model: @@ -39,7 +39,7 @@ The router will download embedding models on first run unless you provide them l model_id: /app/models/all-MiniLM-L12-v2 ``` -3) No extra env is required. `deploy/docker-compose/docker-compose.yml` already mounts `./models:/app/models:ro`. +3. No extra env is required. `deploy/docker-compose/docker-compose.yml` already mounts `./models:/app/models:ro`. ### Option B — Use HF cache + mirror @@ -53,7 +53,7 @@ services: environment: - HUGGINGFACE_HUB_CACHE=/root/.cache/huggingface - HF_HUB_ENABLE_HF_TRANSFER=1 - - HF_ENDPOINT=https://hf-mirror.com # example mirror endpoint (China) + - HF_ENDPOINT=https://hf-mirror.com # example mirror endpoint (China) ``` Optional: pre-warm cache on the host (only if you have `huggingface_hub` installed): @@ -70,7 +70,7 @@ PY When building `Dockerfile.extproc`, the Go stage may hang on `proxy.golang.org`. Create an override Dockerfile that enables mirrors without touching the original. -1) Create `Dockerfile.extproc.cn` at repo root with this content: +1. Create `Dockerfile.extproc.cn` at repo root with this content: ```Dockerfile # syntax=docker/dockerfile:1 @@ -118,7 +118,7 @@ RUN chmod +x /app/entrypoint.sh ENTRYPOINT ["/app/entrypoint.sh"] ``` -2) Point compose to the override Dockerfile by extending `docker-compose.override.yml`: +2. Point compose to the override Dockerfile by extending `docker-compose.override.yml`: ```yaml services: @@ -131,7 +131,7 @@ services: For the optional testing profile, create an override Dockerfile to configure pip mirrors. -1) Create `tools/mock-vllm/Dockerfile.cn`: +1. Create `tools/mock-vllm/Dockerfile.cn`: ```Dockerfile FROM python:3.11-slim @@ -150,7 +150,7 @@ EXPOSE 8000 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"] ``` -2) Extend `docker-compose.override.yml` to use the override Dockerfile for `mock-vllm`: +2. Extend `docker-compose.override.yml` to use the override Dockerfile for `mock-vllm`: ```yaml services: @@ -203,13 +203,84 @@ Container runtimes on Kubernetes nodes do not automatically reuse the host Docke - Use `kubectl describe pod ` or `kubectl get events` to confirm pull errors disappear. - Check that services such as `semantic-router-metrics` now expose endpoints and respond via port-forward (`kubectl port-forward svc/ :`). +### 5.6 Mount local models via hostPath PV (no external HF) + +When you already have models under `./models` locally, you can mount them into the Pod without any download: + +1. Create a hostPath PV and a matching PVC (example paths assume Kind; for other clusters, pick a node path visible to kubelet): + +```yaml +apiVersion: v1 +kind: PersistentVolume +metadata: + name: semantic-router-models-pv +spec: + capacity: + storage: 50Gi + accessModes: ["ReadWriteOnce"] + storageClassName: standard + persistentVolumeReclaimPolicy: Retain + hostPath: + path: /tmp/hostpath-provisioner/models + type: DirectoryOrCreate +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: semantic-router-models +spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 30Gi + storageClassName: standard + volumeName: semantic-router-models-pv +``` + +2. Copy your local models into the node path (Kind example): + +```bash +docker cp ./models semantic-router-cluster-control-plane:/tmp/hostpath-provisioner/ +``` + +3. 
Ensure the Deployment mounts the PVC at `/app/models` and set `imagePullPolicy: IfNotPresent`:
+
+```yaml
+volumes:
+  - name: models-volume
+    persistentVolumeClaim:
+      claimName: semantic-router-models
+containers:
+  - name: semantic-router
+    imagePullPolicy: IfNotPresent
+    volumeMounts:
+      - name: models-volume
+        mountPath: /app/models
+```
+
+4. If the PV is tied to a specific node path, schedule the Pod onto that node via `nodeSelector` or add tolerations if you untainted the control-plane node:
+
+```yaml
+spec:
+  nodeSelector:
+    kubernetes.io/hostname: semantic-router-cluster-control-plane
+  tolerations:
+    - key: "node-role.kubernetes.io/control-plane"
+      effect: "NoSchedule"
+      operator: "Exists"
+```
+
+This approach completely avoids Hugging Face downloads inside the cluster and is the most reliable in restricted networks.
+
 ## 6. Troubleshooting
 
 - Go modules still time out:
+
   - Verify `GOPROXY` and `GOSUMDB` are present in the go-builder stage logs.
   - Try a clean build: `docker compose build --no-cache`.
 
 - HF models still download slowly:
+
   - Prefer Option A (local models).
   - Ensure the cache volume is mounted and `HF_ENDPOINT`/`HF_HUB_ENABLE_HF_TRANSFER` are set.

From 3d11e5fbcbc2dba82ab2afad38f915b1c401f243 Mon Sep 17 00:00:00 2001
From: JaredforReal
Date: Mon, 20 Oct 2025 14:27:19 +0800
Subject: [PATCH 7/8] get rid of redundant files

Signed-off-by: JaredforReal
---
 deploy/kubernetes/config.yaml    | 169 ------------------------------
 deploy/kubernetes/namespace.yaml |   4 -
 deploy/kubernetes/pvc.yaml       |  13 ---
 deploy/kubernetes/service.yaml   |  38 -------
 deploy/kubernetes/tools_db.json  | 142 --------------------------
 5 files changed, 366 deletions(-)
 delete mode 100644 deploy/kubernetes/config.yaml
 delete mode 100644 deploy/kubernetes/namespace.yaml
 delete mode 100644 deploy/kubernetes/pvc.yaml
 delete mode 100644 deploy/kubernetes/service.yaml
 delete mode 100644 deploy/kubernetes/tools_db.json

diff --git a/deploy/kubernetes/config.yaml b/deploy/kubernetes/config.yaml
deleted file mode 100644
index 777600130..000000000
--- a/deploy/kubernetes/config.yaml
+++ /dev/null
@@ -1,169 +0,0 @@
-bert_model:
-  model_id: models/all-MiniLM-L12-v2
-  threshold: 0.6
-  use_cpu: true
-
-semantic_cache:
-  enabled: true
-  backend_type: "memory" # Options: "memory" or "milvus"
-  similarity_threshold: 0.8
-  max_entries: 1000 # Only applies to memory backend
-  ttl_seconds: 3600
-  eviction_policy: "fifo"
-
-tools:
-  enabled: true
-  top_k: 3
-  similarity_threshold: 0.2
-  tools_db_path: "config/tools_db.json"
-  fallback_to_empty: true
-
-prompt_guard:
-  enabled: true
-  use_modernbert: true
-  model_id: "models/jailbreak_classifier_modernbert-base_model"
-  threshold: 0.7
-  use_cpu: true
-  jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
-
-# vLLM Endpoints Configuration
-# IMPORTANT: 'address' field must be a valid IP address (IPv4 or IPv6)
-# Supported formats: 127.0.0.1, 192.168.1.1, ::1, 2001:db8::1
-# NOT supported: domain names (example.com), protocol prefixes (http://), paths (/api), ports in address (use 'port' field)
-vllm_endpoints:
-  - name: "endpoint1"
-    address: "127.0.0.1" # llm-katan sidecar or local backend
-    port: 8002
-    weight: 1
-
-model_config:
-  "qwen3":
-    reasoning_family: "qwen3" # Match docker-compose default model name
-    preferred_endpoints: ["endpoint1"]
-    pii_policy:
-      allow_by_default: true
-
-# Classifier configuration
-classifier:
-  category_model:
-    model_id: "models/category_classifier_modernbert-base_model"
-    use_modernbert: true
- threshold: 0.6 - use_cpu: true - category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json" - pii_model: - model_id: "models/pii_classifier_modernbert-base_presidio_token_model" - use_modernbert: true - threshold: 0.7 - use_cpu: true - pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json" - -# Categories with new use_reasoning field structure -categories: - - name: business - model_scores: - - model: qwen3 - score: 0.7 - use_reasoning: false # Business performs better without reasoning - - name: law - model_scores: - - model: qwen3 - score: 0.4 - use_reasoning: false - - name: psychology - model_scores: - - model: qwen3 - score: 0.6 - use_reasoning: false - - name: biology - model_scores: - - model: qwen3 - score: 0.9 - use_reasoning: false - - name: chemistry - model_scores: - - model: qwen3 - score: 0.6 - use_reasoning: true # Enable reasoning for complex chemistry - - name: history - model_scores: - - model: qwen3 - score: 0.7 - use_reasoning: false - - name: other - model_scores: - - model: qwen3 - score: 0.7 - use_reasoning: false - - name: health - model_scores: - - model: qwen3 - score: 0.5 - use_reasoning: false - - name: economics - model_scores: - - model: qwen3 - score: 1.0 - use_reasoning: false - - name: math - model_scores: - - model: qwen3 - score: 1.0 - use_reasoning: true # Enable reasoning for complex math - - name: physics - model_scores: - - model: qwen3 - score: 0.7 - use_reasoning: true # Enable reasoning for physics - - name: computer science - model_scores: - - model: qwen3 - score: 0.6 - use_reasoning: false - - name: philosophy - model_scores: - - model: qwen3 - score: 0.5 - use_reasoning: false - - name: engineering - model_scores: - - model: qwen3 - score: 0.7 - use_reasoning: false - -default_model: qwen3 - -# Reasoning family configurations -reasoning_families: - deepseek: - type: "chat_template_kwargs" - parameter: "thinking" - - qwen3: - type: "chat_template_kwargs" - parameter: "enable_thinking" - - gpt-oss: - type: "reasoning_effort" - parameter: "reasoning_effort" - gpt: - type: "reasoning_effort" - parameter: "reasoning_effort" - -# Global default reasoning effort level -default_reasoning_effort: high - -# API Configuration -api: - batch_classification: - max_batch_size: 100 - concurrency_threshold: 5 - max_concurrency: 8 - metrics: - enabled: true - detailed_goroutine_tracking: true - high_resolution_timing: false - sample_rate: 1.0 - duration_buckets: - [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30] - size_buckets: [1, 2, 5, 10, 20, 50, 100, 200] diff --git a/deploy/kubernetes/namespace.yaml b/deploy/kubernetes/namespace.yaml deleted file mode 100644 index 0bdc316f5..000000000 --- a/deploy/kubernetes/namespace.yaml +++ /dev/null @@ -1,4 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: vllm-semantic-router-system diff --git a/deploy/kubernetes/pvc.yaml b/deploy/kubernetes/pvc.yaml deleted file mode 100644 index 43b66eb95..000000000 --- a/deploy/kubernetes/pvc.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: semantic-router-models - labels: - app: semantic-router -spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 30Gi - storageClassName: standard diff --git a/deploy/kubernetes/service.yaml b/deploy/kubernetes/service.yaml deleted file mode 100644 index 5d674a6fd..000000000 --- a/deploy/kubernetes/service.yaml +++ /dev/null @@ -1,38 +0,0 @@ -apiVersion: v1 -kind: 
Service -metadata: - name: semantic-router - namespace: vllm-semantic-router-system - labels: - app: semantic-router -spec: - type: ClusterIP - ports: - - port: 50051 - targetPort: grpc - protocol: TCP - name: grpc - - port: 8080 - targetPort: 8080 - protocol: TCP - name: classify-api - selector: - app: semantic-router ---- -apiVersion: v1 -kind: Service -metadata: - name: semantic-router-metrics - namespace: vllm-semantic-router-system - labels: - app: semantic-router - service: metrics -spec: - type: ClusterIP - ports: - - port: 9190 - targetPort: metrics - protocol: TCP - name: metrics - selector: - app: semantic-router diff --git a/deploy/kubernetes/tools_db.json b/deploy/kubernetes/tools_db.json deleted file mode 100644 index dccbf48aa..000000000 --- a/deploy/kubernetes/tools_db.json +++ /dev/null @@ -1,142 +0,0 @@ -[ - { - "tool": { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get current weather information for a location", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The city and state, e.g. San Francisco, CA" - }, - "unit": { - "type": "string", - "enum": ["celsius", "fahrenheit"], - "description": "Temperature unit" - } - }, - "required": ["location"] - } - } - }, - "description": "Get current weather information, temperature, conditions, forecast for any location, city, or place. Check weather today, now, current conditions, temperature, rain, sun, cloudy, hot, cold, storm, snow", - "category": "weather", - "tags": ["weather", "temperature", "forecast", "climate"] - }, - { - "tool": { - "type": "function", - "function": { - "name": "search_web", - "description": "Search the web for information", - "parameters": { - "type": "object", - "properties": { - "query": { - "type": "string", - "description": "The search query" - }, - "num_results": { - "type": "integer", - "description": "Number of results to return", - "default": 5 - } - }, - "required": ["query"] - } - } - }, - "description": "Search the internet, web search, find information online, browse web content, lookup, research, google, find answers, discover, investigate", - "category": "search", - "tags": ["search", "web", "internet", "information", "browse"] - }, - { - "tool": { - "type": "function", - "function": { - "name": "calculate", - "description": "Perform mathematical calculations", - "parameters": { - "type": "object", - "properties": { - "expression": { - "type": "string", - "description": "Mathematical expression to evaluate" - } - }, - "required": ["expression"] - } - } - }, - "description": "Calculate mathematical expressions, solve math problems, arithmetic operations, compute numbers, addition, subtraction, multiplication, division, equations, formula", - "category": "math", - "tags": ["math", "calculation", "arithmetic", "compute", "numbers"] - }, - { - "tool": { - "type": "function", - "function": { - "name": "send_email", - "description": "Send an email message", - "parameters": { - "type": "object", - "properties": { - "to": { - "type": "string", - "description": "Recipient email address" - }, - "subject": { - "type": "string", - "description": "Email subject" - }, - "body": { - "type": "string", - "description": "Email body content" - } - }, - "required": ["to", "subject", "body"] - } - } - }, - "description": "Send email messages, email communication, contact people via email, mail, message, correspondence, notify, inform", - "category": "communication", - "tags": ["email", "send", "communication", 
"message", "contact"] - }, - { - "tool": { - "type": "function", - "function": { - "name": "create_calendar_event", - "description": "Create a new calendar event or appointment", - "parameters": { - "type": "object", - "properties": { - "title": { - "type": "string", - "description": "Event title" - }, - "date": { - "type": "string", - "description": "Event date in YYYY-MM-DD format" - }, - "time": { - "type": "string", - "description": "Event time in HH:MM format" - }, - "duration": { - "type": "integer", - "description": "Duration in minutes" - } - }, - "required": ["title", "date", "time"] - } - } - }, - "description": "Schedule meetings, create calendar events, set appointments, manage calendar, book time, plan meeting, organize schedule, reminder, agenda", - "category": "productivity", - "tags": ["calendar", "event", "meeting", "appointment", "schedule"] - } -] \ No newline at end of file From ee082b4ca24c39b4c7b595de44b6cd43255d03fa Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Mon, 20 Oct 2025 15:03:48 +0800 Subject: [PATCH 8/8] add pvc to k8s & update docs Signed-off-by: JaredforReal --- deploy/kubernetes/README.md | 41 ++++--- deploy/kubernetes/base/kustomization.yaml | 1 - .../base/{pv.example.yaml => pv.yaml} | 0 .../overlays/storage/kustomization.yaml | 6 ++ .../overlays/storage/namespace.yaml | 4 + .../{base => overlays/storage}/pvc.yaml | 0 website/docs/installation/kubernetes.md | 49 +++++---- website/docs/troubleshooting/network-tips.md | 102 ++++++++---------- 8 files changed, 111 insertions(+), 92 deletions(-) rename deploy/kubernetes/base/{pv.example.yaml => pv.yaml} (100%) create mode 100644 deploy/kubernetes/overlays/storage/kustomization.yaml create mode 100644 deploy/kubernetes/overlays/storage/namespace.yaml rename deploy/kubernetes/{base => overlays/storage}/pvc.yaml (100%) diff --git a/deploy/kubernetes/README.md b/deploy/kubernetes/README.md index ab225a43b..e51d6aa45 100644 --- a/deploy/kubernetes/README.md +++ b/deploy/kubernetes/README.md @@ -28,8 +28,11 @@ The deployment consists of: ### Standard Kubernetes Deployment +First-time apply (creates PVC via storage overlay): + ```bash -kubectl apply -k deploy/kubernetes/ +kubectl apply -k deploy/kubernetes/overlays/storage +kubectl apply -k deploy/kubernetes/overlays/core # or overlays/llm-katan # Check deployment status kubectl get pods -l app=semantic-router -n vllm-semantic-router-system @@ -39,6 +42,12 @@ kubectl get services -l app=semantic-router -n vllm-semantic-router-system kubectl logs -l app=semantic-router -n vllm-semantic-router-system -f ``` +Day-2 updates (do not touch PVC): + +```bash +kubectl apply -k deploy/kubernetes/overlays/core # or overlays/llm-katan +``` + ### Kind (Kubernetes in Docker) Deployment For local development and testing, you can deploy to a kind cluster with optimized resource settings. 
@@ -86,6 +95,10 @@ kubectl wait --for=condition=Ready nodes --all --timeout=300s **Step 2: Deploy the application** ```bash +# First-time storage (PVC) +kubectl apply -k deploy/kubernetes/overlays/storage + +# Deploy app kubectl apply -k deploy/kubernetes/ # Wait for deployment to be ready @@ -298,7 +311,7 @@ kubectl logs -n semantic-router -l app=semantic-router -c model-downloader # Check resource usage kubectl top pods -n semantic-router -# Adjust resource limits in deployment.yaml if needed +# Adjust resource limits in base/deployment.yaml if needed ``` ### Storage sizing @@ -314,23 +327,23 @@ For different environments, you can adjust resource requirements: - **Testing**: 4Gi memory, 1 CPU - **Production**: 8Gi+ memory, 2+ CPU -Edit the `resources` section in `deployment.yaml` accordingly. +Edit the `resources` section in `base/deployment.yaml` accordingly. ## Files Overview ### Kubernetes Manifests (`deploy/kubernetes/`) -- `base/` - Shared resources (Namespace, PVC, Service, ConfigMap) -- `overlays/core/` - Core deployment (no llm-katan) -- `overlays/llm-katan/` - Deployment with llm-katan sidecar -- `deployment.yaml` - Plain deployment (used by core overlay) -- `deployment.katan.yaml` - Sidecar deployment (used by llm-katan overlay) -- `service.yaml` - gRPC, HTTP API, and metrics services -- `pvc.yaml` - Persistent volume claim for model storage -- `namespace.yaml` - Dedicated namespace for the application -- `config.yaml` - Application configuration (defaults to qwen3 @ 127.0.0.1:8002) -- `tools_db.json` - Tools database for semantic routing -- `kustomization.yaml` - Root entry (defaults to core overlay) +- `base/` - Shared resources (Namespace, Service, ConfigMap, Deployment) + - `namespace.yaml` - Dedicated namespace for the application + - `service.yaml` - gRPC, HTTP API, and metrics services + - `deployment.yaml` - App deployment (init downloads by default; imagePullPolicy IfNotPresent) + - `config.yaml` - Application configuration (defaults to qwen3 @ 127.0.0.1:8002) + - `tools_db.json` - Tools database for semantic routing + - `pv.yaml` - OPTIONAL hostPath PV for local models (edit path as needed) +- `overlays/core/` - Core deployment (no llm-katan), references `base/` +- `overlays/llm-katan/` - Adds llm-katan sidecar via local patch (no parent file references) +- `overlays/storage/` - PVC only (self-contained `namespace.yaml` + `pvc.yaml`), run once to create storage +- `kustomization.yaml` - Root entry (defaults to `overlays/core`) ### Development Tools diff --git a/deploy/kubernetes/base/kustomization.yaml b/deploy/kubernetes/base/kustomization.yaml index 07cfed2c6..eeb933939 100644 --- a/deploy/kubernetes/base/kustomization.yaml +++ b/deploy/kubernetes/base/kustomization.yaml @@ -3,7 +3,6 @@ kind: Kustomization resources: - ./namespace.yaml - - ./pvc.yaml - ./service.yaml - ./deployment.yaml diff --git a/deploy/kubernetes/base/pv.example.yaml b/deploy/kubernetes/base/pv.yaml similarity index 100% rename from deploy/kubernetes/base/pv.example.yaml rename to deploy/kubernetes/base/pv.yaml diff --git a/deploy/kubernetes/overlays/storage/kustomization.yaml b/deploy/kubernetes/overlays/storage/kustomization.yaml new file mode 100644 index 000000000..349f724d9 --- /dev/null +++ b/deploy/kubernetes/overlays/storage/kustomization.yaml @@ -0,0 +1,6 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ./namespace.yaml + - ./pvc.yaml diff --git a/deploy/kubernetes/overlays/storage/namespace.yaml b/deploy/kubernetes/overlays/storage/namespace.yaml 
new file mode 100644 index 000000000..0bdc316f5 --- /dev/null +++ b/deploy/kubernetes/overlays/storage/namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: vllm-semantic-router-system diff --git a/deploy/kubernetes/base/pvc.yaml b/deploy/kubernetes/overlays/storage/pvc.yaml similarity index 100% rename from deploy/kubernetes/base/pvc.yaml rename to deploy/kubernetes/overlays/storage/pvc.yaml diff --git a/website/docs/installation/kubernetes.md b/website/docs/installation/kubernetes.md index c91f739cc..a679bd5d7 100644 --- a/website/docs/installation/kubernetes.md +++ b/website/docs/installation/kubernetes.md @@ -31,11 +31,11 @@ kind create cluster --name semantic-router-cluster --config tools/kind/kind-conf kubectl wait --for=condition=Ready nodes --all --timeout=300s ``` -**Note**: The kind configuration provides sufficient resources (8GB+ RAM, 4+ CPU cores) for running the semantic router and AI gateway components. +Note: The kind configuration provides sufficient resources (8GB+ RAM, 4+ CPU cores). ## Step 2: Deploy vLLM Semantic Router -Configure the semantic router by editing `deploy/kubernetes/config.yaml`. This file contains the vLLM configuration, including model config, endpoints, and policies. The repository provides two Kustomize overlays similar to docker-compose profiles: +Edit `deploy/kubernetes/config.yaml` (models, endpoints, policies). Two overlays are provided: - core (default): only the semantic-router - Path: `deploy/kubernetes/overlays/core` (root `deploy/kubernetes/` points here by default) @@ -49,18 +49,21 @@ deploy/kubernetes/ base/ kustomization.yaml # base kustomize: namespace, PVC, service, deployment namespace.yaml # Namespace for all resources - pvc.yaml # PVC for models (storageClass and size adjustable) service.yaml # Service exposing gRPC/metrics/HTTP ports deployment.yaml # Semantic Router Deployment (init downloads by default) config.yaml # Router config (mounted via ConfigMap) tools_db.json # Tools DB (mounted via ConfigMap) - pv.example.yaml # OPTIONAL: hostPath PV example for local models + pv.yaml # OPTIONAL: hostPath PV for local models (edit path as needed) overlays/ core/ kustomization.yaml # Uses only base llm-katan/ kustomization.yaml # Patches base to add llm-katan sidecar patch-llm-katan.yaml # Strategic-merge patch injecting sidecar + storage/ + kustomization.yaml # PVC only; run once to create storage, not for day-2 updates + namespace.yaml # Local copy for self-contained apply + pvc.yaml # PVC definition kustomization.yaml # Root points to overlays/core by default README.md # Additional notes namespace.yaml, pvc.yaml, service.yaml (top-level shortcuts kept for backward compat) @@ -68,22 +71,28 @@ deploy/kubernetes/ Notes: -- The base deployment includes an initContainer that downloads required models on first run. -- If your cluster has limited egress, prefer mounting local models via a PV/PVC and skip downloads: - - Copy `base/pv.example.yaml` to `base/pv.yaml`, apply it, and ensure `base/pvc.yaml` is bound to that PV. - - Mount point remains `/app/models` in the pod. - - See “Network Tips” for details on hostPath PV, image mirrors, and preloading images. - -Important notes before you apply manifests: - -- `vllm_endpoints.address` must be an IP address (not hostname) reachable from inside the cluster. If your LLM backends run as K8s Services, use the ClusterIP (for example `10.96.0.10`) and set `port` accordingly. Do not include protocol or path. 
-- The PVC in `deploy/kubernetes/pvc.yaml` uses `storageClassName: standard`. On some clouds or local clusters, the default StorageClass name may differ (e.g., `standard-rwo`, `gp2`, or a provisioner like local-path). Adjust as needed. -- Default PVC size is 30Gi. Size it to at least 2–3x of your total model footprint to leave room for indexes and updates. -- The initContainer downloads several models from Hugging Face on first run and writes them into the PVC. Ensure outbound egress to Hugging Face is allowed and there is at least ~6–8 GiB free space for the models specified. -- Per mode, the init container downloads differ: - - core: classifiers + the embedding model `sentence-transformers/all-MiniLM-L12-v2` into `/app/models/all-MiniLM-L12-v2`. - - llm-katan: everything in core, plus `Qwen/Qwen3-0.6B` into `/app/models/Qwen/Qwen3-0.6B`. -- The default `config.yaml` points to `qwen3` at `127.0.0.1:8002`, which matches the llm-katan overlay. If you use core (no sidecar), either change `vllm_endpoints` to your actual backend Service IP:Port, or deploy the llm-katan overlay. +- Base downloads models on first run (initContainer). +- In restricted networks, prefer local models via PV/PVC; see Network Tips for hostPath PV, mirrors, and image preload. Mount point is `/app/models`. + +First-time apply (creates PVC): + +```bash +kubectl apply -k deploy/kubernetes/overlays/storage +kubectl apply -k deploy/kubernetes/overlays/core # or overlays/llm-katan +``` + +Day-2 updates (do not touch PVC): + +```bash +kubectl apply -k deploy/kubernetes/overlays/core # or overlays/llm-katan +``` + +Important: + +- `vllm_endpoints.address` must be an IP reachable inside the cluster (no scheme/path). +- PVC default size is 30Gi; adjust to model footprint. StorageClass name may differ by cluster. +- core downloads classifiers + `all-MiniLM-L12-v2`; llm-katan also prepares `Qwen/Qwen3-0.6B`. +- Default config uses `qwen3@127.0.0.1:8002` (matches llm-katan); if using core, update endpoints accordingly. Deploy the semantic router service with all required components (core mode by default): diff --git a/website/docs/troubleshooting/network-tips.md b/website/docs/troubleshooting/network-tips.md index fafcc7855..2e27f3e92 100644 --- a/website/docs/troubleshooting/network-tips.md +++ b/website/docs/troubleshooting/network-tips.md @@ -180,10 +180,43 @@ Container runtimes on Kubernetes nodes do not automatically reuse the host Docke ### 5.1 Configure containerd or CRI mirrors -- For clusters backed by containerd (Kind, k3s, kubeadm), edit `/etc/containerd/config.toml` or use Kind’s `containerdConfigPatches` to add regional mirror endpoints for registries such as `docker.io`, `ghcr.io`, or `quay.io`. +- For clusters backed by containerd (Kind, k3s, kubeadm), edit `/etc/containerd/config.toml` or use Kind’s `containerdConfigPatches` to add regional mirror endpoints for registries such as `docker.io`, `ghcr.io`, or `registry.k8s.io`. - Restart containerd and kubelet after changes so the new mirrors take effect. - Avoid pointing mirrors to loopback proxies unless every node can reach that proxy address. 
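+
+For Kind clusters, the same mirrors can be set at cluster-creation time instead of editing node files. A sketch using kind's `containerdConfigPatches` (merge into your existing kind config; the mirror host is an example):
+
+```yaml
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+containerdConfigPatches:
+  - |-
+    [plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]
+      endpoint = ["https://docker.m.daocloud.io"]
+```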
+Example `/etc/containerd/config.toml` mirrors (China): + +```toml +[plugins."io.containerd.grpc.v1.cri".registry.mirrors] + [plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"] + endpoint = [ + "https://docker.m.daocloud.io", + "https://mirror.ccs.tencentyun.com", + "https://mirror.baidubce.com", + "https://docker.mirrors.ustc.edu.cn", + "https://hub-mirror.c.163.com" + ] + [plugins."io.containerd.grpc.v1.cri".registry.mirrors."ghcr.io"] + endpoint = [ + "https://ghcr.nju.edu.cn", + "https://ghcr.dockerproxy.com", + "https://ghcr.bj.bcebos.com" + ] + [plugins."io.containerd.grpc.v1.cri".registry.mirrors."registry.k8s.io"] + endpoint = [ + "https://k8s.m.daocloud.io", + "https://mirror.ccs.tencentyun.com", + "https://registry.aliyuncs.com" + ] +``` + +Apply and restart: + +```bash +sudo systemctl restart containerd +sudo systemctl restart kubelet +``` + ### 5.2 Preload or sideload images - Build required images locally, then push them into the cluster runtime. For Kind, run `kind load docker-image --name `; for other clusters, use `crictl pull` or `ctr -n k8s.io images import` on each node. @@ -203,74 +236,29 @@ Container runtimes on Kubernetes nodes do not automatically reuse the host Docke - Use `kubectl describe pod ` or `kubectl get events` to confirm pull errors disappear. - Check that services such as `semantic-router-metrics` now expose endpoints and respond via port-forward (`kubectl port-forward svc/ :`). -### 5.6 Mount local models via hostPath PV (no external HF) +### 5.6 Mount local models via PV/PVC (no external HF) -When you already have models under `./models` locally, you can mount them into the Pod without any download: +When you already have models under `./models` locally, mount them into the Pod and skip downloads: -1. Create a hostPath PV and a matching PVC (example paths assume Kind; for other clusters, pick a node path visible to kubelet): - -```yaml -apiVersion: v1 -kind: PersistentVolume -metadata: - name: semantic-router-models-pv -spec: - capacity: - storage: 50Gi - accessModes: ["ReadWriteOnce"] - storageClassName: standard - persistentVolumeReclaimPolicy: Retain - hostPath: - path: /tmp/hostpath-provisioner/models - type: DirectoryOrCreate ---- -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: semantic-router-models -spec: - accessModes: ["ReadWriteOnce"] - resources: - requests: - storage: 30Gi - storageClassName: standard - volumeName: semantic-router-models-pv -``` +1. Create a PV (optional; edit `deploy/kubernetes/base/pv.yaml` hostPath to your node path and apply it). If you use a dynamic StorageClass, you can skip the PV. -2. Copy your local models into the node path (Kind example): +2. Create the PVC once via the storage overlay: ```bash -docker cp ./models semantic-router-cluster-control-plane:/tmp/hostpath-provisioner/ +kubectl apply -k deploy/kubernetes/overlays/storage ``` -3. Ensure the Deployment mounts the PVC at `/app/models` and set `imagePullPolicy: IfNotPresent`: +3. Copy your local models to the node path (hostPath example for kind): -```yaml -volumes: - - name: models-volume - persistentVolumeClaim: - claimName: semantic-router-models -containers: - - name: semantic-router - imagePullPolicy: IfNotPresent - volumeMounts: - - name: models-volume - mountPath: /app/models +```bash +docker cp ./models semantic-router-cluster-control-plane:/tmp/hostpath-provisioner/ ``` -4. 
If the PV is tied to a specific node path, schedule the Pod onto that node via `nodeSelector` or add tolerations if you untainted the control-plane node: +4. Ensure the Deployment mounts the PVC at `/app/models` and set `imagePullPolicy: IfNotPresent` (already configured in `base/deployment.yaml`). -```yaml -spec: - nodeSelector: - kubernetes.io/hostname: semantic-router-cluster-control-plane - tolerations: - - key: "node-role.kubernetes.io/control-plane" - effect: "NoSchedule" - operator: "Exists" -``` +5. If the PV is tied to a specific node path, pin the Pod to that node using `nodeSelector` or add tolerations if you untainted the control-plane node. -This approach completely avoids Hugging Face downloads inside the cluster and is the most reliable in restricted networks. +This path avoids Hugging Face downloads and is the most reliable in restricted networks. ## 6. Troubleshooting
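+
+- PVC stuck in `Pending`:
+
+  - Verify the StorageClass exists and, if you applied `base/pv.yaml`, that its `storageClassName` and capacity match the PVC. A quick check using the default names from this repo:
+
+```bash
+kubectl get pvc semantic-router-models -n vllm-semantic-router-system
+kubectl describe pvc semantic-router-models -n vllm-semantic-router-system
+kubectl get storageclass
+```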