From 1bfee8322c3b2f845b9cad7d489f45fe67c404b0 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Fri, 31 Oct 2025 08:51:46 -0500 Subject: [PATCH 01/16] Add full path to server uvicorn.run() call --- nemoguardrails/benchmark/mock_llm_server/run_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemoguardrails/benchmark/mock_llm_server/run_server.py b/nemoguardrails/benchmark/mock_llm_server/run_server.py index 14e0be02f..3c59d9bbd 100644 --- a/nemoguardrails/benchmark/mock_llm_server/run_server.py +++ b/nemoguardrails/benchmark/mock_llm_server/run_server.py @@ -104,7 +104,7 @@ def main(): # pragma: no cover try: uvicorn.run( - "api:app", + "nemoguardrails.benchmark.mock_llm_server.api:app", host=args.host, port=args.port, reload=args.reload, From b44dc5aaea856462e0a8e1796197504735622b05 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 3 Nov 2025 10:18:03 -0600 Subject: [PATCH 02/16] Remove unused files --- .../content_safety_colang1/config.yml | 21 -- .../content_safety_colang1/prompts.yml | 257 ------------------ .../configs/meta-llama-3.3-70b-instruct.env | 8 - ...-llama-3.1-nemoguard-8b-content-safety.env | 8 - 4 files changed, 294 deletions(-) delete mode 100644 nemoguardrails/benchmark/mock_llm_server/configs/guardrail_configs/content_safety_colang1/config.yml delete mode 100644 nemoguardrails/benchmark/mock_llm_server/configs/guardrail_configs/content_safety_colang1/prompts.yml delete mode 100644 nemoguardrails/benchmark/mock_llm_server/configs/meta-llama-3.3-70b-instruct.env delete mode 100644 nemoguardrails/benchmark/mock_llm_server/configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env diff --git a/nemoguardrails/benchmark/mock_llm_server/configs/guardrail_configs/content_safety_colang1/config.yml b/nemoguardrails/benchmark/mock_llm_server/configs/guardrail_configs/content_safety_colang1/config.yml deleted file mode 100644 index 67df489d7..000000000 --- a/nemoguardrails/benchmark/mock_llm_server/configs/guardrail_configs/content_safety_colang1/config.yml +++ /dev/null @@ -1,21 +0,0 @@ -models: - - type: main - engine: nim - model: meta/llama-3.3-70b-instruct - parameters: - base_url: http://localhost:8000 - - - type: content_safety - engine: nim - model: nvidia/llama-3.1-nemoguard-8b-content-safety - parameters: - base_url: http://localhost:8001 - - -rails: - input: - flows: - - content safety check input $model=content_safety - output: - flows: - - content safety check output $model=content_safety diff --git a/nemoguardrails/benchmark/mock_llm_server/configs/guardrail_configs/content_safety_colang1/prompts.yml b/nemoguardrails/benchmark/mock_llm_server/configs/guardrail_configs/content_safety_colang1/prompts.yml deleted file mode 100644 index 1321a6461..000000000 --- a/nemoguardrails/benchmark/mock_llm_server/configs/guardrail_configs/content_safety_colang1/prompts.yml +++ /dev/null @@ -1,257 +0,0 @@ -# These are the default prompts released by Meta, except for policy O7, which was added to address direct insults. -prompts: - - - task: content_safety_check_input $model=content_safety - content: | - Task: Check if there is unsafe content in the user or agent messages in the below conversation in accordance with our safety policy with the below categories. Provide a 'safe' or 'unsafe' rating for both the user message and the agent response. If there is unsafe content, please also provide a list of violated categories according to our safety policy below. 
- - - S1: Violence. - S2: Sexual. - S3: Criminal Planning/Confessions. - S4: Guns and Illegal Weapons. - S5: Controlled/Regulated Substances. - S6: Suicide and Self Harm. - S7: Sexual (minor). - S8: Hate/Identity Hate. - S9: PII/Privacy. - S10: Harassment. - S11: Threat. - S12: Profanity. - S13: Needs Caution. - S14: Other. - S15: Manipulation. - S16: Fraud/Deception. - S17: Malware. - S18: High Risk Gov Decision Making. - S19: Political/Misinformation/Conspiracy. - S20: Copyright/Trademark/Plagiarism. - S21: Unauthorized Advice. - S22: Illegal Activity. - S23: Immoral/Unethical. - - - - - user: {{ user_input }} - - - - Provide your safety assessment for the above conversation in the below output JSON format: - {"User Safety": safe or unsafe rating for the user message, "Response Safety": safe or unsafe rating for the agent response. Omit if no agent response present. "Safety Categories": a comma-separated list of applicable safety categories from the provided taxonomy. Omit if all safe.} - - Do not include anything other than the output JSON in your response. - Output JSON: - output_parser: nemoguard_parse_prompt_safety - max_tokens: 50 - - - task: content_safety_check_output $model=content_safety - content: | - Task: Check if there is unsafe content in the user or agent messages in the below conversation in accordance with our safety policy with the below categories. Provide a 'safe' or 'unsafe' rating for both the user message and the agent response. If there is unsafe content, please also provide a list of violated categories according to our safety policy below. - - - S1: Violence. - S2: Sexual. - S3: Criminal Planning/Confessions. - S4: Guns and Illegal Weapons. - S5: Controlled/Regulated Substances. - S6: Suicide and Self Harm. - S7: Sexual (minor). - S8: Hate/Identity Hate. - S9: PII/Privacy. - S10: Harassment. - S11: Threat. - S12: Profanity. - S13: Needs Caution. - S14: Other. - S15: Manipulation. - S16: Fraud/Deception. - S17: Malware. - S18: High Risk Gov Decision Making. - S19: Political/Misinformation/Conspiracy. - S20: Copyright/Trademark/Plagiarism. - S21: Unauthorized Advice. - S22: Illegal Activity. - S23: Immoral/Unethical. - - - - - user: {{ user_input }} - - response: agent: {{ bot_response }} - - - - Provide your safety assessment for the above conversation in the below output JSON format: - {"User Safety": safe or unsafe rating for the user message, "Response Safety": safe or unsafe rating for the agent response. Omit if no agent response present. "Safety Categories": a comma-separated list of applicable safety categories from the provided taxonomy. Omit if all safe.} - - Do not include anything other than the output JSON in your response. - Output JSON: - output_parser: nemoguard_parse_response_safety - max_tokens: 50 - - - task: content_safety_check_input $model=llama_guard - content: | - [INST] Task: Check if there is unsafe content in 'User' messages in conversations according our safety policy with the below categories. - - O1: Violence and Hate. - Should not - - Help people plan or engage in violence. - - Encourage people to engage in violence. - - Express hateful, derogatory, or demeaning sentiments against people based on sensitive personal characteristics like their race, color, religion, national origin, sexual orientation, gender, gender identity, or disability. - - Encourage discrimination against people based on those characteristics and should not use slurs or create insults based on those characteristics. 
- Can - - Provide information on violence and discrimination. - - Discuss the topics of hate and violence, and can discuss historical events involving violence. - O2: Sexual Content. - Should not - - Engage in sexually explicit (i.e., erotic) conversations and should not encourage people to engage in sexual activities. - Can - - Discuss topics related to sexuality and sexual education. - - Provide general information regarding sexual activities. - - Provide guidance on topics related to sexual health. - O3: Criminal Planning. - Should not - - Help people plan or engage in criminal activities like theft, kidnapping, or financial crimes, especially when the person indicates that they may act on the information provided to them. - - Encourage people to engage in criminal activities or condone criminal behavior. - Can - - Help people understand how crimes are committed, provided that the response is not so detailed that it could be used to provide a roadmap for future crimes. - - Help people understand how systems work that are used to prevent crimes, provided those descriptions do not provide detailed information on how to defeat those systems. - O4: Guns and Illegal Weapons. - Should not - - Assist in the planning or execution of firearm or illegal weapons crimes, especially when the person indicates that they may act on the information provided to them. - - Encourage people to engage in firearm or illegal weapons crimes or condone such behavior. - Can - - Discuss firearms and the arguments for and against firearm ownership. - O5: Regulated or Controlled Substances. - Should not - - Encourage or assist people in the trafficking, consumption, or use of regulated or controlled substances, including but not limited to illegal drugs, tobacco, alcohol, and cannabis. - - Assist or encourage people to create such substances when it is illegal to do so. - Can - - Discuss the use of regulated or controlled substances, and can provide general information on those substances (ex: what their chemical composition is and what their effects are). - - Provide information on how those substances are manufactured, provided that the information is not so detailed as to provide step-by-step instructions for their creation. - O6: Self-Harm. - Should not - - Encourage people to harm themselves. - - Romanticize or condone self-harm or suicide. - - Provide information regarding the methods of suicide or self-harm. - - Provide any information or advice, other than information regarding appropriate health resources, to people who express or imply that they wish to harm themselves. - Can - - Provide general information regarding self-harm and suicide (ex: statistics, the mental health challenges faced by those who take these actions). - Should - - Provide information regarding appropriate health resources to people who imply or express the desire to harm themselves. - O7: Offensive Language and Insults. - Should not - - Address direct insults - - Use or encourage the use of swear words, bad language, or vulgar expressions. - - Insult, offend, or demean individuals or groups based on any personal characteristics or choices. - - Encourage or support the use of offensive or derogatory language. - Can - - Discuss the impact and societal perceptions of offensive language and insults. - - Provide information on communication strategies that avoid the use of offensive language and promote respectful dialogue. - - Offer guidance on how to address or respond to offensive language or insults in a constructive manner. 
- - - - User: {{ user_input }} - - - Provide your safety assessment for User in the above conversation: - - First line must read 'safe' or 'unsafe'. - - If unsafe, a second line must include a comma-separated list of violated categories. [/INST] - output_parser: is_content_safe - - - task: content_safety_check_output $model=llama_guard_2 - content: | - <|begin_of_text|>[INST] Task: Check if there is unsafe content in Agent messages in conversations according our safety policy with the below categories. - - - S1: Violent Crimes. - S2: Non-Violent Crimes. - S3: Sex Crimes. - S4: Child Exploitation. - S5: Specialized Advice. - S6: Privacy. - S7: Intellectual Property. - S8: Indiscriminate Weapons. - S9: Hate. - S10: Self-Harm. - S11: Sexual Content. - - - - - User: {{ user_input }} - - Agent: {{ bot_response }} - - - - Provide your safety assessment for ONLY THE LAST Agent in the above conversation: - - First line must read 'safe' or 'unsafe'. - - If unsafe, a second line must include a comma-separated list of violated categories. [/INST] - output_parser: is_content_safe - max_tokens: 3 - - - task: content_safety_check_input $model=shieldgemma - content: | - You are a policy expert trying to help determine whether a user prompt is in - violation of the defined safety policies. - - - Human Question: {{ user_input }} - - - Our safety principle is defined in the below: - - * "No Hate Speech": The prompt shall not contain or seek generation of content - that expresses, incites, or promotes hate based on race, gender, ethnicity, - religion, nationality, sexual orientation, disability status, or caste. - * "No Dangerous Content": The prompt shall not contain or seek generation of content that harming oneself - and/or others (e.g., accessing or building firearms and explosive devices, promotion of terrorism, instructions for suicide). - * "No Sexually Explicit Information": The prompt shall not contain or seek generation of content - that contains references to sexual acts or other lewd content (e.g., sexually graphic descriptions, - content aimed at causing arousal). Medical or scientific terms related to human anatomy or sex education are permitted. - * "No Harassment": The prompt shall not contain or seek generation of content that is malicious, - intimidating, bullying, or abusive content targeting another individual (e.g., physical threats, - denial of tragic events, disparaging victims of violence). - === - - Does the human prompt/question violate the above principle? Your answer must start with - 'Yes' or 'No'. And then walk through step by step to be sure we answer - correctly. - output_parser: is_content_safe - max_tokens: 3 - - - task: content_safety_check_output $model=shieldgemma - content: | - You are a policy expert trying to help determine whether a chatbot response to the user is in - violation of the defined safety policies: - - - Human Question: {{ user_input }} - Chatbot Response: {{ bot_response }} - - - Our safety principle is defined in the below: - - * "No Dangerous Content": The chatbot shall not generate content that harming oneself and/or others - (e.g., accessing or building firearms and explosive devices, promotion of terrorism, instructions for suicide). - * "No Harassment": The chatbot shall not generate content that is malicious, intimidating, bullying, or - abusive content targeting another individual (e.g., physical threats, denial of tragic events, disparaging victims of violence). 
- * "No Hate Speech": The chatbot shall not generate content that targeting identity and/or protected - attributes (e.g., racial slurs, promotion of discrimination, calls to violence against protected groups) or dehumanizing/belittling/vilifying on the - bases of race, ethnicity, religion, disability, age, nationality, veteran status, sexual orientation, gender, gender identity, caste, or any other protected status. - We do allow content that includes a reference to a protected individual or group in a positive manner. - * "No Sexually Explicit Information": The chatbot shall not generate content that contains references to sexual acts or - other lewd content (e.g., sexually graphic descriptions, content aimed at causing arousal). - Medical or scientific terms related to human anatomy or sex education are permitted. - - === - - Does the chatbot response violate the above principle? Your answer must start with - 'Yes' or 'No'. And then walk through step by step to be sure we answer - correctly. - output_parser: is_content_safe - max_tokens: 3 diff --git a/nemoguardrails/benchmark/mock_llm_server/configs/meta-llama-3.3-70b-instruct.env b/nemoguardrails/benchmark/mock_llm_server/configs/meta-llama-3.3-70b-instruct.env deleted file mode 100644 index d1cc35477..000000000 --- a/nemoguardrails/benchmark/mock_llm_server/configs/meta-llama-3.3-70b-instruct.env +++ /dev/null @@ -1,8 +0,0 @@ -MODEL="meta/llama-3.3-70b-instruct" -UNSAFE_PROBABILITY=0.0 -UNSAFE_TEXT="I can't help with that. Is there anything else I can assist you with?" -SAFE_TEXT="I can provide information and help with a wide range of topics, from science and history to entertainment and culture. I can also help with language-related tasks, such as translation and text summarization. However, I can't assist with requests that involve harm or illegal activities." 
-LATENCY_MIN_SECONDS=4.0 -LATENCY_MAX_SECONDS=4.0 -LATENCY_MEAN_SECONDS=4.0 -LATENCY_STD_SECONDS=0.0 diff --git a/nemoguardrails/benchmark/mock_llm_server/configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env b/nemoguardrails/benchmark/mock_llm_server/configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env deleted file mode 100644 index 2ae46d578..000000000 --- a/nemoguardrails/benchmark/mock_llm_server/configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env +++ /dev/null @@ -1,8 +0,0 @@ -MODEL="nvidia/llama-3.1-nemoguard-8b-content-safety" -UNSAFE_PROBABILITY=0.03 -UNSAFE_TEXT="{\"User Safety\": \"unsafe\", \"Response Safety\": \"unsafe\", \"Safety Categories\": \"Violence, Criminal Planning/Confessions\"}" -SAFE_TEXT="{\"User Safety\": \"safe\", \"Response Safety\": \"safe\"}" -LATENCY_MIN_SECONDS=0.5 -LATENCY_MAX_SECONDS=0.5 -LATENCY_MEAN_SECONDS=0.5 -LATENCY_STD_SECONDS=0.0 From 8edbd4bda1e5080155e7cf9cd72a2779b29cd516 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 3 Nov 2025 10:52:48 -0600 Subject: [PATCH 03/16] Add Procfile with the commands to run Guardrails, content safety mock, and APP LLM mock --- nemoguardrails/benchmark/Procfile | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 nemoguardrails/benchmark/Procfile diff --git a/nemoguardrails/benchmark/Procfile b/nemoguardrails/benchmark/Procfile new file mode 100644 index 000000000..4e065666e --- /dev/null +++ b/nemoguardrails/benchmark/Procfile @@ -0,0 +1,9 @@ +# Procfile + +# NeMo Guardrails server +# main: uvicorn nemoguardrails.server.api:app --workers 1 --port 9000 +gr: poetry run nemoguardrails server --config configs/guardrail_configs --default-config-id content_safety_colang1 --port 9000 + +# Guardrails NIMs for inference +app_llm: poetry run python mock_llm_server/run_server.py --port 8000 --config-file configs/mock_configs/meta-llama-3.3-70b-instruct.env +cs_llm: poetry run python mock_llm_server/run_server.py --port 8001 --config-file configs/mock_configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env From 14081d16f573e2fdc3d7dcd3b1c24198c6718751 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 3 Nov 2025 10:54:06 -0600 Subject: [PATCH 04/16] Add simple script to validate Guardrails and mocks are running correctly --- nemoguardrails/benchmark/validate_mocks.py | 221 +++++++++++++++++++++ 1 file changed, 221 insertions(+) create mode 100644 nemoguardrails/benchmark/validate_mocks.py diff --git a/nemoguardrails/benchmark/validate_mocks.py b/nemoguardrails/benchmark/validate_mocks.py new file mode 100644 index 000000000..4bb1d3c99 --- /dev/null +++ b/nemoguardrails/benchmark/validate_mocks.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python3 + +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +A script to check the health and model IDs of local OpenAI-compatible endpoints. +Requires the 'requests' library: pip install requests +""" + +import logging +import sys + +import requests + +# --- Logging Setup --- +# Configure basic logging to print info-level messages +logging.basicConfig(level=logging.INFO, format="%(message)s") + + +def check_endpoint(port: int, expected_model: str): + """ + Checks the /health and /v1/models endpoints for a standard + OpenAI-compatible server. + Returns a tuple: (bool success, str summary) + """ + base_url = f"http://localhost:{port}" + all_ok = True + + logging.info(f"\n--- Checking Port: {port} ---") + + # --- 1. Health Check --- + health_url = f"{base_url}/health" + logging.info(f"Checking {health_url} ...") + try: + response = requests.get(health_url, timeout=3) + + if response.status_code != 200: + logging.error(f"Health Check FAILED: Status code {response.status_code}") + all_ok = False + else: + try: + data = response.json() + status = data.get("status") + if status == "healthy": + logging.info("Health Check PASSED: Status is 'healthy'.") + else: + logging.warning( + f"Health Check FAILED: Expected 'healthy', got '{status}'." + ) + all_ok = False + except requests.exceptions.JSONDecodeError: + logging.error("Health Check FAILED: Could not decode JSON response.") + all_ok = False + + except requests.exceptions.ConnectionError: + logging.error(f"Health Check FAILED: No response from server on port {port}.") + logging.error(f"--- Port {port}: CHECKS FAILED ---") + return False, f"Port {port} ({expected_model}): FAILED (Connection Error)" + except requests.exceptions.Timeout: + logging.error(f"Health Check FAILED: Connection timed out for port {port}.") + logging.error(f"--- Port {port}: CHECKS FAILED ---") + return False, f"Port {port} ({expected_model}): FAILED (Connection Timeout)" + + # --- 2. Model Check --- + models_url = f"{base_url}/v1/models" + logging.info(f"Checking {models_url} for '{expected_model}'...") + try: + response = requests.get(models_url, timeout=3) + + if response.status_code != 200: + logging.error(f"Model Check FAILED: Status code {response.status_code}") + all_ok = False + else: + try: + data = response.json() + models = data.get("data", []) + model_ids = [model.get("id") for model in models] + + if expected_model in model_ids: + logging.info( + f"Model Check PASSED: Found '{expected_model}' in model list." + ) + else: + logging.warning( + f"Model Check FAILED: Expected '{expected_model}', but it was NOT found." + ) + logging.warning("Available models:") + for model_id in model_ids: + logging.warning(f" - {model_id}") + all_ok = False + except requests.exceptions.JSONDecodeError: + logging.error("Model Check FAILED: Could not decode JSON response.") + all_ok = False + except AttributeError: + logging.error( + f"Model Check FAILED: Unexpected JSON structure in response from {models_url}." 
+ ) + all_ok = False + + except requests.exceptions.ConnectionError: + logging.error(f"Model Check FAILED: No response from server on port {port}.") + all_ok = False + except requests.exceptions.Timeout: + logging.error(f"Model Check FAILED: Connection timed out for port {port}.") + all_ok = False + + # --- Final Status --- + if all_ok: + logging.info(f"--- Port {port}: ALL CHECKS PASSED ---") + return True, f"Port {port} ({expected_model}): PASSED" + else: + logging.error(f"--- Port {port}: CHECKS FAILED ---") + return False, f"Port {port} ({expected_model}): FAILED" + + +def check_rails_endpoint(port: int): + """ + Checks the /v1/rails/configs endpoint for a specific 200 status + and a non-empty list response. + Returns a tuple: (bool success, str summary) + """ + base_url = f"http://localhost:{port}" + endpoint = f"{base_url}/v1/rails/configs" + all_ok = True + + logging.info(f"\n--- Checking Port: {port} (Rails Config) ---") + logging.info(f"Checking {endpoint} ...") + + try: + response = requests.get(endpoint, timeout=3) + + # --- 1. HTTP Status Check --- + if response.status_code == 200: + logging.info(f"HTTP Status PASSED: Got {response.status_code}.") + else: + logging.warning( + f"HTTP Status FAILED: Expected 200, got '{response.status_code}'." + ) + all_ok = False + + # --- 2. Body Content Check --- + try: + data = response.json() + if isinstance(data, list) and len(data) > 0: + logging.info( + "Body Check PASSED: Response is an array with at least one entry." + ) + else: + logging.warning( + "Body Check FAILED: Response is not an array or is empty." + ) + logging.debug( + f"Response body (first 200 chars): {str(response.text)[:200]}" + ) + all_ok = False + except requests.exceptions.JSONDecodeError: + logging.error("Body Check FAILED: Could not decode JSON response.") + logging.debug( + f"Response body (first 200 chars): {str(response.text)[:200]}" + ) + all_ok = False + + except requests.exceptions.ConnectionError: + logging.error(f"Rails Check FAILED: No response from server on port {port}.") + all_ok = False + except requests.exceptions.Timeout: + logging.error(f"Rails Check FAILED: Connection timed out for port {port}.") + all_ok = False + + # --- Final Status --- + if all_ok: + logging.info(f"--- Port {port}: ALL CHECKS PASSED ---") + return True, f"Port {port} (Rails Config): PASSED" + else: + logging.error(f"--- Port {port}: CHECKS FAILED ---") + return False, f"Port {port} (Rails Config): FAILED" + + +def main(): + """Run all health checks.""" + logging.info("Starting LLM endpoint health check...") + + check_results = [ + check_endpoint(8000, "meta/llama-3.3-70b-instruct"), + check_endpoint(8001, "nvidia/llama-3.1-nemoguard-8b-content-safety"), + check_rails_endpoint(9000), + ] + + logging.info("\n--- Final Summary ---") + + all_passed = True + for success, summary in check_results: + logging.info(summary) + if not success: + all_passed = False + + logging.info("---------------------") + + if all_passed: + logging.info("Overall Status: All endpoints are healthy!") + sys.exit(0) + else: + logging.error("Overall Status: One or more checks FAILED.") + sys.exit(1) + + +if __name__ == "__main__": + main() From aead73071d688ce0ba5bdd17b3187c4c287d70e9 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 3 Nov 2025 10:54:43 -0600 Subject: [PATCH 05/16] Restructure configs into mocks and Guardrails configs --- .../content_safety_colang1/config.yml | 21 ++ .../content_safety_colang1/prompts.yml | 257 ++++++++++++++++++ 
.../meta-llama-3.3-70b-instruct.env | 8 + ...-llama-3.1-nemoguard-8b-content-safety.env | 8 + 4 files changed, 294 insertions(+) create mode 100644 nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang1/config.yml create mode 100644 nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang1/prompts.yml create mode 100644 nemoguardrails/benchmark/configs/mock_configs/meta-llama-3.3-70b-instruct.env create mode 100644 nemoguardrails/benchmark/configs/mock_configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env diff --git a/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang1/config.yml b/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang1/config.yml new file mode 100644 index 000000000..67df489d7 --- /dev/null +++ b/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang1/config.yml @@ -0,0 +1,21 @@ +models: + - type: main + engine: nim + model: meta/llama-3.3-70b-instruct + parameters: + base_url: http://localhost:8000 + + - type: content_safety + engine: nim + model: nvidia/llama-3.1-nemoguard-8b-content-safety + parameters: + base_url: http://localhost:8001 + + +rails: + input: + flows: + - content safety check input $model=content_safety + output: + flows: + - content safety check output $model=content_safety diff --git a/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang1/prompts.yml b/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang1/prompts.yml new file mode 100644 index 000000000..1321a6461 --- /dev/null +++ b/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang1/prompts.yml @@ -0,0 +1,257 @@ +# These are the default prompts released by Meta, except for policy O7, which was added to address direct insults. +prompts: + + - task: content_safety_check_input $model=content_safety + content: | + Task: Check if there is unsafe content in the user or agent messages in the below conversation in accordance with our safety policy with the below categories. Provide a 'safe' or 'unsafe' rating for both the user message and the agent response. If there is unsafe content, please also provide a list of violated categories according to our safety policy below. + + + S1: Violence. + S2: Sexual. + S3: Criminal Planning/Confessions. + S4: Guns and Illegal Weapons. + S5: Controlled/Regulated Substances. + S6: Suicide and Self Harm. + S7: Sexual (minor). + S8: Hate/Identity Hate. + S9: PII/Privacy. + S10: Harassment. + S11: Threat. + S12: Profanity. + S13: Needs Caution. + S14: Other. + S15: Manipulation. + S16: Fraud/Deception. + S17: Malware. + S18: High Risk Gov Decision Making. + S19: Political/Misinformation/Conspiracy. + S20: Copyright/Trademark/Plagiarism. + S21: Unauthorized Advice. + S22: Illegal Activity. + S23: Immoral/Unethical. + + + + + user: {{ user_input }} + + + + Provide your safety assessment for the above conversation in the below output JSON format: + {"User Safety": safe or unsafe rating for the user message, "Response Safety": safe or unsafe rating for the agent response. Omit if no agent response present. "Safety Categories": a comma-separated list of applicable safety categories from the provided taxonomy. Omit if all safe.} + + Do not include anything other than the output JSON in your response. 
+ Output JSON: + output_parser: nemoguard_parse_prompt_safety + max_tokens: 50 + + - task: content_safety_check_output $model=content_safety + content: | + Task: Check if there is unsafe content in the user or agent messages in the below conversation in accordance with our safety policy with the below categories. Provide a 'safe' or 'unsafe' rating for both the user message and the agent response. If there is unsafe content, please also provide a list of violated categories according to our safety policy below. + + + S1: Violence. + S2: Sexual. + S3: Criminal Planning/Confessions. + S4: Guns and Illegal Weapons. + S5: Controlled/Regulated Substances. + S6: Suicide and Self Harm. + S7: Sexual (minor). + S8: Hate/Identity Hate. + S9: PII/Privacy. + S10: Harassment. + S11: Threat. + S12: Profanity. + S13: Needs Caution. + S14: Other. + S15: Manipulation. + S16: Fraud/Deception. + S17: Malware. + S18: High Risk Gov Decision Making. + S19: Political/Misinformation/Conspiracy. + S20: Copyright/Trademark/Plagiarism. + S21: Unauthorized Advice. + S22: Illegal Activity. + S23: Immoral/Unethical. + + + + + user: {{ user_input }} + + response: agent: {{ bot_response }} + + + + Provide your safety assessment for the above conversation in the below output JSON format: + {"User Safety": safe or unsafe rating for the user message, "Response Safety": safe or unsafe rating for the agent response. Omit if no agent response present. "Safety Categories": a comma-separated list of applicable safety categories from the provided taxonomy. Omit if all safe.} + + Do not include anything other than the output JSON in your response. + Output JSON: + output_parser: nemoguard_parse_response_safety + max_tokens: 50 + + - task: content_safety_check_input $model=llama_guard + content: | + [INST] Task: Check if there is unsafe content in 'User' messages in conversations according our safety policy with the below categories. + + O1: Violence and Hate. + Should not + - Help people plan or engage in violence. + - Encourage people to engage in violence. + - Express hateful, derogatory, or demeaning sentiments against people based on sensitive personal characteristics like their race, color, religion, national origin, sexual orientation, gender, gender identity, or disability. + - Encourage discrimination against people based on those characteristics and should not use slurs or create insults based on those characteristics. + Can + - Provide information on violence and discrimination. + - Discuss the topics of hate and violence, and can discuss historical events involving violence. + O2: Sexual Content. + Should not + - Engage in sexually explicit (i.e., erotic) conversations and should not encourage people to engage in sexual activities. + Can + - Discuss topics related to sexuality and sexual education. + - Provide general information regarding sexual activities. + - Provide guidance on topics related to sexual health. + O3: Criminal Planning. + Should not + - Help people plan or engage in criminal activities like theft, kidnapping, or financial crimes, especially when the person indicates that they may act on the information provided to them. + - Encourage people to engage in criminal activities or condone criminal behavior. + Can + - Help people understand how crimes are committed, provided that the response is not so detailed that it could be used to provide a roadmap for future crimes. 
+ - Help people understand how systems work that are used to prevent crimes, provided those descriptions do not provide detailed information on how to defeat those systems. + O4: Guns and Illegal Weapons. + Should not + - Assist in the planning or execution of firearm or illegal weapons crimes, especially when the person indicates that they may act on the information provided to them. + - Encourage people to engage in firearm or illegal weapons crimes or condone such behavior. + Can + - Discuss firearms and the arguments for and against firearm ownership. + O5: Regulated or Controlled Substances. + Should not + - Encourage or assist people in the trafficking, consumption, or use of regulated or controlled substances, including but not limited to illegal drugs, tobacco, alcohol, and cannabis. + - Assist or encourage people to create such substances when it is illegal to do so. + Can + - Discuss the use of regulated or controlled substances, and can provide general information on those substances (ex: what their chemical composition is and what their effects are). + - Provide information on how those substances are manufactured, provided that the information is not so detailed as to provide step-by-step instructions for their creation. + O6: Self-Harm. + Should not + - Encourage people to harm themselves. + - Romanticize or condone self-harm or suicide. + - Provide information regarding the methods of suicide or self-harm. + - Provide any information or advice, other than information regarding appropriate health resources, to people who express or imply that they wish to harm themselves. + Can + - Provide general information regarding self-harm and suicide (ex: statistics, the mental health challenges faced by those who take these actions). + Should + - Provide information regarding appropriate health resources to people who imply or express the desire to harm themselves. + O7: Offensive Language and Insults. + Should not + - Address direct insults + - Use or encourage the use of swear words, bad language, or vulgar expressions. + - Insult, offend, or demean individuals or groups based on any personal characteristics or choices. + - Encourage or support the use of offensive or derogatory language. + Can + - Discuss the impact and societal perceptions of offensive language and insults. + - Provide information on communication strategies that avoid the use of offensive language and promote respectful dialogue. + - Offer guidance on how to address or respond to offensive language or insults in a constructive manner. + + + + User: {{ user_input }} + + + Provide your safety assessment for User in the above conversation: + - First line must read 'safe' or 'unsafe'. + - If unsafe, a second line must include a comma-separated list of violated categories. [/INST] + output_parser: is_content_safe + + - task: content_safety_check_output $model=llama_guard_2 + content: | + <|begin_of_text|>[INST] Task: Check if there is unsafe content in Agent messages in conversations according our safety policy with the below categories. + + + S1: Violent Crimes. + S2: Non-Violent Crimes. + S3: Sex Crimes. + S4: Child Exploitation. + S5: Specialized Advice. + S6: Privacy. + S7: Intellectual Property. + S8: Indiscriminate Weapons. + S9: Hate. + S10: Self-Harm. + S11: Sexual Content. + + + + + User: {{ user_input }} + + Agent: {{ bot_response }} + + + + Provide your safety assessment for ONLY THE LAST Agent in the above conversation: + - First line must read 'safe' or 'unsafe'. 
+ - If unsafe, a second line must include a comma-separated list of violated categories. [/INST] + output_parser: is_content_safe + max_tokens: 3 + + - task: content_safety_check_input $model=shieldgemma + content: | + You are a policy expert trying to help determine whether a user prompt is in + violation of the defined safety policies. + + + Human Question: {{ user_input }} + + + Our safety principle is defined in the below: + + * "No Hate Speech": The prompt shall not contain or seek generation of content + that expresses, incites, or promotes hate based on race, gender, ethnicity, + religion, nationality, sexual orientation, disability status, or caste. + * "No Dangerous Content": The prompt shall not contain or seek generation of content that harming oneself + and/or others (e.g., accessing or building firearms and explosive devices, promotion of terrorism, instructions for suicide). + * "No Sexually Explicit Information": The prompt shall not contain or seek generation of content + that contains references to sexual acts or other lewd content (e.g., sexually graphic descriptions, + content aimed at causing arousal). Medical or scientific terms related to human anatomy or sex education are permitted. + * "No Harassment": The prompt shall not contain or seek generation of content that is malicious, + intimidating, bullying, or abusive content targeting another individual (e.g., physical threats, + denial of tragic events, disparaging victims of violence). + === + + Does the human prompt/question violate the above principle? Your answer must start with + 'Yes' or 'No'. And then walk through step by step to be sure we answer + correctly. + output_parser: is_content_safe + max_tokens: 3 + + - task: content_safety_check_output $model=shieldgemma + content: | + You are a policy expert trying to help determine whether a chatbot response to the user is in + violation of the defined safety policies: + + + Human Question: {{ user_input }} + Chatbot Response: {{ bot_response }} + + + Our safety principle is defined in the below: + + * "No Dangerous Content": The chatbot shall not generate content that harming oneself and/or others + (e.g., accessing or building firearms and explosive devices, promotion of terrorism, instructions for suicide). + * "No Harassment": The chatbot shall not generate content that is malicious, intimidating, bullying, or + abusive content targeting another individual (e.g., physical threats, denial of tragic events, disparaging victims of violence). + * "No Hate Speech": The chatbot shall not generate content that targeting identity and/or protected + attributes (e.g., racial slurs, promotion of discrimination, calls to violence against protected groups) or dehumanizing/belittling/vilifying on the + bases of race, ethnicity, religion, disability, age, nationality, veteran status, sexual orientation, gender, gender identity, caste, or any other protected status. + We do allow content that includes a reference to a protected individual or group in a positive manner. + * "No Sexually Explicit Information": The chatbot shall not generate content that contains references to sexual acts or + other lewd content (e.g., sexually graphic descriptions, content aimed at causing arousal). + Medical or scientific terms related to human anatomy or sex education are permitted. + + === + + Does the chatbot response violate the above principle? Your answer must start with + 'Yes' or 'No'. And then walk through step by step to be sure we answer + correctly. 
+ output_parser: is_content_safe + max_tokens: 3 diff --git a/nemoguardrails/benchmark/configs/mock_configs/meta-llama-3.3-70b-instruct.env b/nemoguardrails/benchmark/configs/mock_configs/meta-llama-3.3-70b-instruct.env new file mode 100644 index 000000000..d1cc35477 --- /dev/null +++ b/nemoguardrails/benchmark/configs/mock_configs/meta-llama-3.3-70b-instruct.env @@ -0,0 +1,8 @@ +MODEL="meta/llama-3.3-70b-instruct" +UNSAFE_PROBABILITY=0.0 +UNSAFE_TEXT="I can't help with that. Is there anything else I can assist you with?" +SAFE_TEXT="I can provide information and help with a wide range of topics, from science and history to entertainment and culture. I can also help with language-related tasks, such as translation and text summarization. However, I can't assist with requests that involve harm or illegal activities." +LATENCY_MIN_SECONDS=4.0 +LATENCY_MAX_SECONDS=4.0 +LATENCY_MEAN_SECONDS=4.0 +LATENCY_STD_SECONDS=0.0 diff --git a/nemoguardrails/benchmark/configs/mock_configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env b/nemoguardrails/benchmark/configs/mock_configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env new file mode 100644 index 000000000..2ae46d578 --- /dev/null +++ b/nemoguardrails/benchmark/configs/mock_configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env @@ -0,0 +1,8 @@ +MODEL="nvidia/llama-3.1-nemoguard-8b-content-safety" +UNSAFE_PROBABILITY=0.03 +UNSAFE_TEXT="{\"User Safety\": \"unsafe\", \"Response Safety\": \"unsafe\", \"Safety Categories\": \"Violence, Criminal Planning/Confessions\"}" +SAFE_TEXT="{\"User Safety\": \"safe\", \"Response Safety\": \"safe\"}" +LATENCY_MIN_SECONDS=0.5 +LATENCY_MAX_SECONDS=0.5 +LATENCY_MEAN_SECONDS=0.5 +LATENCY_STD_SECONDS=0.0 From 810b9b931d58a7a98ac9bb878389c658a230a5c1 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 3 Nov 2025 11:28:02 -0600 Subject: [PATCH 06/16] Add tests against validate_mocks.py --- nemoguardrails/benchmark/validate_mocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemoguardrails/benchmark/validate_mocks.py b/nemoguardrails/benchmark/validate_mocks.py index 4bb1d3c99..47a4bd273 100644 --- a/nemoguardrails/benchmark/validate_mocks.py +++ b/nemoguardrails/benchmark/validate_mocks.py @@ -218,4 +218,4 @@ def main(): if __name__ == "__main__": - main() + main() # pragma: no cover From a229b88743b21de07ca024b9c2f4b4004430eaaf Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 3 Nov 2025 11:46:30 -0600 Subject: [PATCH 07/16] Convert f-string logging statements to %s format --- nemoguardrails/benchmark/validate_mocks.py | 73 ++++++++++++---------- 1 file changed, 39 insertions(+), 34 deletions(-) diff --git a/nemoguardrails/benchmark/validate_mocks.py b/nemoguardrails/benchmark/validate_mocks.py index 47a4bd273..cb24d38b9 100644 --- a/nemoguardrails/benchmark/validate_mocks.py +++ b/nemoguardrails/benchmark/validate_mocks.py @@ -39,16 +39,16 @@ def check_endpoint(port: int, expected_model: str): base_url = f"http://localhost:{port}" all_ok = True - logging.info(f"\n--- Checking Port: {port} ---") + logging.info("\n--- Checking Port: %s ---", port) # --- 1. 
Health Check --- health_url = f"{base_url}/health" - logging.info(f"Checking {health_url} ...") + logging.info("Checking %s ...", health_url) try: response = requests.get(health_url, timeout=3) if response.status_code != 200: - logging.error(f"Health Check FAILED: Status code {response.status_code}") + logging.error("Health Check FAILED: Status code %s", response.status_code) all_ok = False else: try: @@ -58,7 +58,7 @@ def check_endpoint(port: int, expected_model: str): logging.info("Health Check PASSED: Status is 'healthy'.") else: logging.warning( - f"Health Check FAILED: Expected 'healthy', got '{status}'." + "Health Check FAILED: Expected 'healthy', got '%s'.", status ) all_ok = False except requests.exceptions.JSONDecodeError: @@ -66,22 +66,25 @@ def check_endpoint(port: int, expected_model: str): all_ok = False except requests.exceptions.ConnectionError: - logging.error(f"Health Check FAILED: No response from server on port {port}.") - logging.error(f"--- Port {port}: CHECKS FAILED ---") - return False, f"Port {port} ({expected_model}): FAILED (Connection Error)" + logging.error("Health Check FAILED: No response from server on port %s.", port) + logging.error("--- Port %s: CHECKS FAILED ---", port) + return False, "Port %s (%s): FAILED (Connection Error)" % (port, expected_model) except requests.exceptions.Timeout: - logging.error(f"Health Check FAILED: Connection timed out for port {port}.") - logging.error(f"--- Port {port}: CHECKS FAILED ---") - return False, f"Port {port} ({expected_model}): FAILED (Connection Timeout)" + logging.error("Health Check FAILED: Connection timed out for port %s.", port) + logging.error("--- Port %s: CHECKS FAILED ---", port) + return False, "Port %s (%s): FAILED (Connection Timeout)" % ( + port, + expected_model, + ) # --- 2. Model Check --- models_url = f"{base_url}/v1/models" - logging.info(f"Checking {models_url} for '{expected_model}'...") + logging.info("Checking %s for '%s'...", models_url, expected_model) try: response = requests.get(models_url, timeout=3) if response.status_code != 200: - logging.error(f"Model Check FAILED: Status code {response.status_code}") + logging.error("Model Check FAILED: Status code %s", response.status_code) all_ok = False else: try: @@ -91,39 +94,41 @@ def check_endpoint(port: int, expected_model: str): if expected_model in model_ids: logging.info( - f"Model Check PASSED: Found '{expected_model}' in model list." + "Model Check PASSED: Found '%s' in model list.", expected_model ) else: logging.warning( - f"Model Check FAILED: Expected '{expected_model}', but it was NOT found." + "Model Check FAILED: Expected '%s', but it was NOT found.", + expected_model, ) logging.warning("Available models:") for model_id in model_ids: - logging.warning(f" - {model_id}") + logging.warning(" - %s", model_id) all_ok = False except requests.exceptions.JSONDecodeError: logging.error("Model Check FAILED: Could not decode JSON response.") all_ok = False except AttributeError: logging.error( - f"Model Check FAILED: Unexpected JSON structure in response from {models_url}." 
+ "Model Check FAILED: Unexpected JSON structure in response from %s.", + models_url, ) all_ok = False except requests.exceptions.ConnectionError: - logging.error(f"Model Check FAILED: No response from server on port {port}.") + logging.error("Model Check FAILED: No response from server on port %s.", port) all_ok = False except requests.exceptions.Timeout: - logging.error(f"Model Check FAILED: Connection timed out for port {port}.") + logging.error("Model Check FAILED: Connection timed out for port %s.", port) all_ok = False # --- Final Status --- if all_ok: - logging.info(f"--- Port {port}: ALL CHECKS PASSED ---") - return True, f"Port {port} ({expected_model}): PASSED" + logging.info("--- Port %s: ALL CHECKS PASSED ---", port) + return True, "Port %s (%s): PASSED" % (port, expected_model) else: - logging.error(f"--- Port {port}: CHECKS FAILED ---") - return False, f"Port {port} ({expected_model}): FAILED" + logging.error("--- Port %s: CHECKS FAILED ---", port) + return False, "Port %s (%s): FAILED" % (port, expected_model) def check_rails_endpoint(port: int): @@ -136,18 +141,18 @@ def check_rails_endpoint(port: int): endpoint = f"{base_url}/v1/rails/configs" all_ok = True - logging.info(f"\n--- Checking Port: {port} (Rails Config) ---") - logging.info(f"Checking {endpoint} ...") + logging.info("\n--- Checking Port: %s (Rails Config) ---", port) + logging.info("Checking %s ...", endpoint) try: response = requests.get(endpoint, timeout=3) # --- 1. HTTP Status Check --- if response.status_code == 200: - logging.info(f"HTTP Status PASSED: Got {response.status_code}.") + logging.info("HTTP Status PASSED: Got %s.", response.status_code) else: logging.warning( - f"HTTP Status FAILED: Expected 200, got '{response.status_code}'." + "HTTP Status FAILED: Expected 200, got '%s'.", response.status_code ) all_ok = False @@ -163,30 +168,30 @@ def check_rails_endpoint(port: int): "Body Check FAILED: Response is not an array or is empty." 
) logging.debug( - f"Response body (first 200 chars): {str(response.text)[:200]}" + "Response body (first 200 chars): %s", str(response.text)[:200] ) all_ok = False except requests.exceptions.JSONDecodeError: logging.error("Body Check FAILED: Could not decode JSON response.") logging.debug( - f"Response body (first 200 chars): {str(response.text)[:200]}" + "Response body (first 200 chars): %s", str(response.text)[:200] ) all_ok = False except requests.exceptions.ConnectionError: - logging.error(f"Rails Check FAILED: No response from server on port {port}.") + logging.error("Rails Check FAILED: No response from server on port %s.", port) all_ok = False except requests.exceptions.Timeout: - logging.error(f"Rails Check FAILED: Connection timed out for port {port}.") + logging.error("Rails Check FAILED: Connection timed out for port %s.", port) all_ok = False # --- Final Status --- if all_ok: - logging.info(f"--- Port {port}: ALL CHECKS PASSED ---") - return True, f"Port {port} (Rails Config): PASSED" + logging.info("--- Port %s: ALL CHECKS PASSED ---", port) + return True, "Port %s (Rails Config): PASSED" % port else: - logging.error(f"--- Port {port}: CHECKS FAILED ---") - return False, f"Port {port} (Rails Config): FAILED" + logging.error("--- Port %s: CHECKS FAILED ---", port) + return False, "Port %s (Rails Config): FAILED" % port def main(): From c1c6f7d2ad2c82108a8340c098d418f55447c403 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 3 Nov 2025 12:41:51 -0600 Subject: [PATCH 08/16] Add tests for validate_mocks.py --- tests/benchmark/test_validate_mocks.py | 473 +++++++++++++++++++++++++ 1 file changed, 473 insertions(+) create mode 100644 tests/benchmark/test_validate_mocks.py diff --git a/tests/benchmark/test_validate_mocks.py b/tests/benchmark/test_validate_mocks.py new file mode 100644 index 000000000..bc8a8a751 --- /dev/null +++ b/tests/benchmark/test_validate_mocks.py @@ -0,0 +1,473 @@ +#!/usr/bin/env python3 + +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Tests for validate_mocks.py script. 
+""" + +from unittest.mock import MagicMock, patch + +import pytest +import requests + +from nemoguardrails.benchmark.validate_mocks import ( + check_endpoint, + check_rails_endpoint, + main, +) + + +class TestCheckEndpoint: + """Tests for check_endpoint function.""" + + @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + def test_check_endpoint_success(self, mock_get): + """Test successful health and model checks.""" + # Mock health check response + health_response = MagicMock() + health_response.status_code = 200 + health_response.json.return_value = {"status": "healthy"} + + # Mock models check response + models_response = MagicMock() + models_response.status_code = 200 + models_response.json.return_value = { + "data": [ + {"id": "meta/llama-3.3-70b-instruct"}, + {"id": "other-model"}, + ] + } + + mock_get.side_effect = [health_response, models_response] + + success, summary = check_endpoint(8000, "meta/llama-3.3-70b-instruct") + + assert success + assert "PASSED" in summary + assert "8000" in summary + assert mock_get.call_count == 2 + + @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + def test_check_endpoint_health_check_failed_status(self, mock_get): + """Test health check with non-200 status code.""" + health_response = MagicMock() + health_response.status_code = 404 + + mock_get.return_value = health_response + + success, summary = check_endpoint(8000, "test-model") + + assert not success + assert "FAILED" in summary + + @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + def test_check_endpoint_health_check_unhealthy_status(self, mock_get): + """Test health check with unhealthy status.""" + health_response = MagicMock() + health_response.status_code = 200 + health_response.json.return_value = {"status": "unhealthy"} + + models_response = MagicMock() + models_response.status_code = 200 + models_response.json.return_value = {"data": [{"id": "test-model"}]} + + mock_get.side_effect = [health_response, models_response] + + success, summary = check_endpoint(8000, "test-model") + + assert not success + assert "FAILED" in summary + + @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + def test_check_endpoint_health_check_json_decode_error(self, mock_get): + """Test health check with invalid JSON.""" + health_response = MagicMock() + health_response.status_code = 200 + health_response.json.side_effect = requests.exceptions.JSONDecodeError( + "Expecting value", "", 0 + ) + + mock_get.return_value = health_response + + success, summary = check_endpoint(8000, "test-model") + + assert not success + assert "FAILED" in summary + + @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + def test_check_endpoint_health_connection_error(self, mock_get): + """Test health check with connection error.""" + mock_get.side_effect = requests.exceptions.ConnectionError() + + success, summary = check_endpoint(8000, "test-model") + + assert not success + assert "FAILED" in summary + assert "Connection Error" in summary + + @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + def test_check_endpoint_health_timeout(self, mock_get): + """Test health check with timeout.""" + mock_get.side_effect = requests.exceptions.Timeout() + + success, summary = check_endpoint(8000, "test-model") + + assert not success + assert "FAILED" in summary + assert "Connection Timeout" in summary + + @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + def test_check_endpoint_model_check_failed_status(self, mock_get): + """Test model check with 
non-200 status code.""" + health_response = MagicMock() + health_response.status_code = 200 + health_response.json.return_value = {"status": "healthy"} + + models_response = MagicMock() + models_response.status_code = 404 + + mock_get.side_effect = [health_response, models_response] + + success, summary = check_endpoint(8000, "test-model") + + assert not success + assert "FAILED" in summary + + @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + def test_check_endpoint_model_not_found(self, mock_get): + """Test model check when expected model is not in the list.""" + health_response = MagicMock() + health_response.status_code = 200 + health_response.json.return_value = {"status": "healthy"} + + models_response = MagicMock() + models_response.status_code = 200 + models_response.json.return_value = { + "data": [ + {"id": "other-model-1"}, + {"id": "other-model-2"}, + ] + } + + mock_get.side_effect = [health_response, models_response] + + success, summary = check_endpoint(8000, "test-model") + + assert not success + assert "FAILED" in summary + + @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + def test_check_endpoint_model_check_json_decode_error(self, mock_get): + """Test model check with invalid JSON.""" + health_response = MagicMock() + health_response.status_code = 200 + health_response.json.return_value = {"status": "healthy"} + + models_response = MagicMock() + models_response.status_code = 200 + models_response.json.side_effect = requests.exceptions.JSONDecodeError( + "Expecting value", "", 0 + ) + + mock_get.side_effect = [health_response, models_response] + + success, summary = check_endpoint(8000, "test-model") + + assert not success + assert "FAILED" in summary + + @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + def test_check_endpoint_model_check_unexpected_json_structure(self, mock_get): + """Test model check with unexpected JSON structure.""" + health_response = MagicMock() + health_response.status_code = 200 + health_response.json.return_value = {"status": "healthy"} + + models_response = MagicMock() + models_response.status_code = 200 + # Return invalid structure that will cause AttributeError + models_response.json.return_value = "invalid" + + mock_get.side_effect = [health_response, models_response] + + success, summary = check_endpoint(8000, "test-model") + + assert not success + assert "FAILED" in summary + + @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + def test_check_endpoint_model_check_connection_error(self, mock_get): + """Test model check with connection error.""" + health_response = MagicMock() + health_response.status_code = 200 + health_response.json.return_value = {"status": "healthy"} + + mock_get.side_effect = [health_response, requests.exceptions.ConnectionError()] + + success, summary = check_endpoint(8000, "test-model") + + assert not success + assert "FAILED" in summary + + @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + def test_check_endpoint_model_check_timeout(self, mock_get): + """Test model check with timeout.""" + health_response = MagicMock() + health_response.status_code = 200 + health_response.json.return_value = {"status": "healthy"} + + mock_get.side_effect = [health_response, requests.exceptions.Timeout()] + + success, summary = check_endpoint(8000, "test-model") + + assert not success + assert "FAILED" in summary + + +class TestCheckRailsEndpoint: + """Tests for check_rails_endpoint function.""" + + @patch("nemoguardrails.benchmark.validate_mocks.requests.get") 
+ def test_check_rails_endpoint_success(self, mock_get): + """Test successful rails config check.""" + response = MagicMock() + response.status_code = 200 + response.json.return_value = [ + {"id": "config1", "name": "Config 1"}, + {"id": "config2", "name": "Config 2"}, + ] + + mock_get.return_value = response + + success, summary = check_rails_endpoint(9000) + + assert success + assert "PASSED" in summary + assert "9000" in summary + + @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + def test_check_rails_endpoint_non_200_status(self, mock_get): + """Test rails config check with non-200 status.""" + response = MagicMock() + response.status_code = 404 + response.json.return_value = [] + + mock_get.return_value = response + + success, summary = check_rails_endpoint(9000) + + assert not success + assert "FAILED" in summary + + @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + def test_check_rails_endpoint_empty_list(self, mock_get): + """Test rails config check with empty list response.""" + response = MagicMock() + response.status_code = 200 + response.json.return_value = [] + + mock_get.return_value = response + + success, summary = check_rails_endpoint(9000) + + assert not success + assert "FAILED" in summary + + @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + def test_check_rails_endpoint_not_a_list(self, mock_get): + """Test rails config check with non-list response.""" + response = MagicMock() + response.status_code = 200 + response.json.return_value = {"error": "invalid"} + + mock_get.return_value = response + + success, summary = check_rails_endpoint(9000) + + assert not success + assert "FAILED" in summary + + @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + def test_check_rails_endpoint_json_decode_error(self, mock_get): + """Test rails config check with invalid JSON.""" + response = MagicMock() + response.status_code = 200 + response.text = "invalid json" + response.json.side_effect = requests.exceptions.JSONDecodeError( + "Expecting value", "", 0 + ) + + mock_get.return_value = response + + success, summary = check_rails_endpoint(9000) + + assert not success + assert "FAILED" in summary + + @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + def test_check_rails_endpoint_connection_error(self, mock_get): + """Test rails config check with connection error.""" + mock_get.side_effect = requests.exceptions.ConnectionError() + + success, summary = check_rails_endpoint(9000) + + assert not success + assert "FAILED" in summary + + @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + def test_check_rails_endpoint_timeout(self, mock_get): + """Test rails config check with timeout.""" + mock_get.side_effect = requests.exceptions.Timeout() + + success, summary = check_rails_endpoint(9000) + + assert not success + assert "FAILED" in summary + + +class TestMain: + """Tests for main function.""" + + @patch("nemoguardrails.benchmark.validate_mocks.check_rails_endpoint") + @patch("nemoguardrails.benchmark.validate_mocks.check_endpoint") + def test_main_all_passed(self, mock_check_endpoint, mock_check_rails_endpoint): + """Test main function when all checks pass.""" + mock_check_endpoint.side_effect = [ + (True, "Port 8000 (meta/llama-3.3-70b-instruct): PASSED"), + ( + True, + "Port 8001 (nvidia/llama-3.1-nemoguard-8b-content-safety): PASSED", + ), + ] + mock_check_rails_endpoint.return_value = ( + True, + "Port 9000 (Rails Config): PASSED", + ) + + with pytest.raises(SystemExit) as exc_info: + main() + + 
assert exc_info.value.code == 0 + assert mock_check_endpoint.call_count == 2 + assert mock_check_rails_endpoint.call_count == 1 + + @patch("nemoguardrails.benchmark.validate_mocks.check_rails_endpoint") + @patch("nemoguardrails.benchmark.validate_mocks.check_endpoint") + def test_main_one_failed(self, mock_check_endpoint, mock_check_rails_endpoint): + """Test main function when one check fails.""" + mock_check_endpoint.side_effect = [ + (False, "Port 8000 (meta/llama-3.3-70b-instruct): FAILED"), + ( + True, + "Port 8001 (nvidia/llama-3.1-nemoguard-8b-content-safety): PASSED", + ), + ] + mock_check_rails_endpoint.return_value = ( + True, + "Port 9000 (Rails Config): PASSED", + ) + + with pytest.raises(SystemExit) as exc_info: + main() + + assert exc_info.value.code == 1 + + @patch("nemoguardrails.benchmark.validate_mocks.check_rails_endpoint") + @patch("nemoguardrails.benchmark.validate_mocks.check_endpoint") + def test_main_all_failed(self, mock_check_endpoint, mock_check_rails_endpoint): + """Test main function when all checks fail.""" + mock_check_endpoint.side_effect = [ + (False, "Port 8000 (meta/llama-3.3-70b-instruct): FAILED"), + ( + False, + "Port 8001 (nvidia/llama-3.1-nemoguard-8b-content-safety): FAILED", + ), + ] + mock_check_rails_endpoint.return_value = ( + False, + "Port 9000 (Rails Config): FAILED", + ) + + with pytest.raises(SystemExit) as exc_info: + main() + + assert exc_info.value.code == 1 + + @patch("nemoguardrails.benchmark.validate_mocks.check_rails_endpoint") + @patch("nemoguardrails.benchmark.validate_mocks.check_endpoint") + def test_main_rails_failed(self, mock_check_endpoint, mock_check_rails_endpoint): + """Test main function when only rails check fails.""" + mock_check_endpoint.side_effect = [ + (True, "Port 8000 (meta/llama-3.3-70b-instruct): PASSED"), + ( + True, + "Port 8001 (nvidia/llama-3.1-nemoguard-8b-content-safety): PASSED", + ), + ] + mock_check_rails_endpoint.return_value = ( + False, + "Port 9000 (Rails Config): FAILED", + ) + + with pytest.raises(SystemExit) as exc_info: + main() + + assert exc_info.value.code == 1 + + +class TestScriptExecution: + """Test the script can be executed directly.""" + + def test_script_execution(self): + """Test that the script can be executed via __main__.""" + # Import the module to trigger __main__ block if run directly + import importlib + + import nemoguardrails.benchmark.validate_mocks as vm_module + + # Reload to ensure __main__ block is checked + importlib.reload(vm_module) + + # If we're not running as __main__, the main function shouldn't be called + # This test verifies the module can be imported without issues + assert callable(vm_module.check_endpoint) + assert callable(vm_module.check_rails_endpoint) + assert callable(vm_module.main) + + @patch("nemoguardrails.benchmark.validate_mocks.check_rails_endpoint") + @patch("nemoguardrails.benchmark.validate_mocks.check_endpoint") + def test_script_can_run_as_main( + self, mock_check_endpoint, mock_check_rails_endpoint + ): + """Test that the script can run when invoked as main.""" + mock_check_endpoint.side_effect = [ + (True, "Port 8000: PASSED"), + (True, "Port 8001: PASSED"), + ] + mock_check_rails_endpoint.return_value = (True, "Port 9000: PASSED") + + # This simulates running the script directly + script_path = "/Users/tgasser/projects/nemo_guardrails/nemoguardrails/benchmark/validate_mocks.py" + with pytest.raises(SystemExit) as exc_info: + with open(script_path, encoding="utf-8") as f: + # Using exec to simulate script execution is necessary for testing 
__main__ block + exec( + compile(f.read(), "validate_mocks.py", "exec"), + {"__name__": "__main__"}, + ) # noqa: S102 + + assert exc_info.value.code == 0 From 4a0d0d98643c49d27aff86cec1aa657a68607aee Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 3 Nov 2025 12:57:58 -0600 Subject: [PATCH 09/16] Remove script-testing unit-tests --- tests/benchmark/test_validate_mocks.py | 44 -------------------------- 1 file changed, 44 deletions(-) diff --git a/tests/benchmark/test_validate_mocks.py b/tests/benchmark/test_validate_mocks.py index bc8a8a751..87fe64344 100644 --- a/tests/benchmark/test_validate_mocks.py +++ b/tests/benchmark/test_validate_mocks.py @@ -427,47 +427,3 @@ def test_main_rails_failed(self, mock_check_endpoint, mock_check_rails_endpoint) main() assert exc_info.value.code == 1 - - -class TestScriptExecution: - """Test the script can be executed directly.""" - - def test_script_execution(self): - """Test that the script can be executed via __main__.""" - # Import the module to trigger __main__ block if run directly - import importlib - - import nemoguardrails.benchmark.validate_mocks as vm_module - - # Reload to ensure __main__ block is checked - importlib.reload(vm_module) - - # If we're not running as __main__, the main function shouldn't be called - # This test verifies the module can be imported without issues - assert callable(vm_module.check_endpoint) - assert callable(vm_module.check_rails_endpoint) - assert callable(vm_module.main) - - @patch("nemoguardrails.benchmark.validate_mocks.check_rails_endpoint") - @patch("nemoguardrails.benchmark.validate_mocks.check_endpoint") - def test_script_can_run_as_main( - self, mock_check_endpoint, mock_check_rails_endpoint - ): - """Test that the script can run when invoked as main.""" - mock_check_endpoint.side_effect = [ - (True, "Port 8000: PASSED"), - (True, "Port 8001: PASSED"), - ] - mock_check_rails_endpoint.return_value = (True, "Port 9000: PASSED") - - # This simulates running the script directly - script_path = "/Users/tgasser/projects/nemo_guardrails/nemoguardrails/benchmark/validate_mocks.py" - with pytest.raises(SystemExit) as exc_info: - with open(script_path, encoding="utf-8") as f: - # Using exec to simulate script execution is necessary for testing __main__ block - exec( - compile(f.read(), "validate_mocks.py", "exec"), - {"__name__": "__main__"}, - ) # noqa: S102 - - assert exc_info.value.code == 0 From 20f3726d4a97af90e04caa6c08b13fb75b022d25 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 3 Nov 2025 13:11:35 -0600 Subject: [PATCH 10/16] Add pyproject.toml 'benchmark' extra to install honcho and requests --- pyproject.toml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 1d252a8bf..4d69f1e5c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -105,6 +105,10 @@ google-cloud-language = { version = ">=2.14.0", optional = true } # jailbreak injection yara-python = { version = "^4.5.1", optional = true } +# benchmark +requests = { version = ">=2.31.0", optional = true } +honcho = { version = ">=1.1.0", optional = true } + [tool.poetry.extras] sdd = ["presidio-analyzer", "presidio-anonymizer"] eval = ["tqdm", "numpy", "streamlit", "tornado"] @@ -113,6 +117,7 @@ gcp = ["google-cloud-language"] tracing = ["opentelemetry-api", "aiofiles"] nvidia = ["langchain-nvidia-ai-endpoints"] jailbreak = ["yara-python"] +benchmark = ["requests", "honcho"] # Poetry does not support recursive 
dependencies, so we need to add all the dependencies here. # I also support their decision. There is no PEP for recursive dependencies, but it has been supported in pip since version 21.2. # It is here for backward compatibility. @@ -128,6 +133,8 @@ all = [ "aiofiles", "langchain-nvidia-ai-endpoints", "yara-python", + "requests", + "honcho", ] [tool.poetry.group.dev] From d379bc50bcf47e91e02c2990ab69ecf0269cf552 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Thu, 6 Nov 2025 16:44:35 -0600 Subject: [PATCH 11/16] Add workers CLI argument to pass to uvicorn app invocation --- nemoguardrails/benchmark/Procfile | 6 +++--- nemoguardrails/benchmark/mock_llm_server/run_server.py | 8 +++++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/nemoguardrails/benchmark/Procfile b/nemoguardrails/benchmark/Procfile index 4e065666e..c698d4463 100644 --- a/nemoguardrails/benchmark/Procfile +++ b/nemoguardrails/benchmark/Procfile @@ -1,9 +1,9 @@ # Procfile # NeMo Guardrails server -# main: uvicorn nemoguardrails.server.api:app --workers 1 --port 9000 +# main: uvicorn nemoguardrails.server.api:app --workers 4 --port 9000 gr: poetry run nemoguardrails server --config configs/guardrail_configs --default-config-id content_safety_colang1 --port 9000 # Guardrails NIMs for inference -app_llm: poetry run python mock_llm_server/run_server.py --port 8000 --config-file configs/mock_configs/meta-llama-3.3-70b-instruct.env -cs_llm: poetry run python mock_llm_server/run_server.py --port 8001 --config-file configs/mock_configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env +app_llm: poetry run python mock_llm_server/run_server.py --workers 4 --port 8000 --config-file configs/mock_configs/meta-llama-3.3-70b-instruct.env +cs_llm: poetry run python mock_llm_server/run_server.py --workers 4 --port 8001 --config-file configs/mock_configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env diff --git a/nemoguardrails/benchmark/mock_llm_server/run_server.py b/nemoguardrails/benchmark/mock_llm_server/run_server.py index 3c59d9bbd..eae8bc032 100644 --- a/nemoguardrails/benchmark/mock_llm_server/run_server.py +++ b/nemoguardrails/benchmark/mock_llm_server/run_server.py @@ -71,7 +71,12 @@ def parse_arguments(): parser.add_argument( "--config-file", help=".env file to configure model", required=True ) - + parser.add_argument( + "--workers", + type=int, + default=1, + help="Number of uvicorn worker processes (default: 1)", + ) return parser.parse_args() @@ -110,6 +115,7 @@ def main(): # pragma: no cover reload=args.reload, log_level=args.log_level, env_file=config_file, + workers=args.workers, ) except KeyboardInterrupt: log.info("\nServer stopped by user") From 581900516412850c15fc18b25bc13f93ffae5673 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Thu, 6 Nov 2025 17:10:40 -0600 Subject: [PATCH 12/16] Use httpx rather than requests to call and validate mocks --- nemoguardrails/benchmark/validate_mocks.py | 29 +++++----- tests/benchmark/test_validate_mocks.py | 67 ++++++++++++---------- 2 files changed, 51 insertions(+), 45 deletions(-) diff --git a/nemoguardrails/benchmark/validate_mocks.py b/nemoguardrails/benchmark/validate_mocks.py index cb24d38b9..795f1c671 100644 --- a/nemoguardrails/benchmark/validate_mocks.py +++ b/nemoguardrails/benchmark/validate_mocks.py @@ -17,13 +17,14 @@ """ A script to check the health and model IDs of local OpenAI-compatible endpoints. 
-Requires the 'requests' library: pip install requests +Requires the 'httpx' library: pip install httpx """ +import json import logging import sys -import requests +import httpx # --- Logging Setup --- # Configure basic logging to print info-level messages @@ -45,7 +46,7 @@ def check_endpoint(port: int, expected_model: str): health_url = f"{base_url}/health" logging.info("Checking %s ...", health_url) try: - response = requests.get(health_url, timeout=3) + response = httpx.get(health_url, timeout=3) if response.status_code != 200: logging.error("Health Check FAILED: Status code %s", response.status_code) @@ -61,15 +62,15 @@ def check_endpoint(port: int, expected_model: str): "Health Check FAILED: Expected 'healthy', got '%s'.", status ) all_ok = False - except requests.exceptions.JSONDecodeError: + except json.JSONDecodeError: logging.error("Health Check FAILED: Could not decode JSON response.") all_ok = False - except requests.exceptions.ConnectionError: + except httpx.ConnectError: logging.error("Health Check FAILED: No response from server on port %s.", port) logging.error("--- Port %s: CHECKS FAILED ---", port) return False, "Port %s (%s): FAILED (Connection Error)" % (port, expected_model) - except requests.exceptions.Timeout: + except httpx.TimeoutException: logging.error("Health Check FAILED: Connection timed out for port %s.", port) logging.error("--- Port %s: CHECKS FAILED ---", port) return False, "Port %s (%s): FAILED (Connection Timeout)" % ( @@ -81,7 +82,7 @@ def check_endpoint(port: int, expected_model: str): models_url = f"{base_url}/v1/models" logging.info("Checking %s for '%s'...", models_url, expected_model) try: - response = requests.get(models_url, timeout=3) + response = httpx.get(models_url, timeout=3) if response.status_code != 200: logging.error("Model Check FAILED: Status code %s", response.status_code) @@ -105,7 +106,7 @@ def check_endpoint(port: int, expected_model: str): for model_id in model_ids: logging.warning(" - %s", model_id) all_ok = False - except requests.exceptions.JSONDecodeError: + except json.JSONDecodeError: logging.error("Model Check FAILED: Could not decode JSON response.") all_ok = False except AttributeError: @@ -115,10 +116,10 @@ def check_endpoint(port: int, expected_model: str): ) all_ok = False - except requests.exceptions.ConnectionError: + except httpx.ConnectError: logging.error("Model Check FAILED: No response from server on port %s.", port) all_ok = False - except requests.exceptions.Timeout: + except httpx.TimeoutException: logging.error("Model Check FAILED: Connection timed out for port %s.", port) all_ok = False @@ -145,7 +146,7 @@ def check_rails_endpoint(port: int): logging.info("Checking %s ...", endpoint) try: - response = requests.get(endpoint, timeout=3) + response = httpx.get(endpoint, timeout=3) # --- 1. 
HTTP Status Check --- if response.status_code == 200: @@ -171,17 +172,17 @@ def check_rails_endpoint(port: int): "Response body (first 200 chars): %s", str(response.text)[:200] ) all_ok = False - except requests.exceptions.JSONDecodeError: + except json.JSONDecodeError: logging.error("Body Check FAILED: Could not decode JSON response.") logging.debug( "Response body (first 200 chars): %s", str(response.text)[:200] ) all_ok = False - except requests.exceptions.ConnectionError: + except httpx.ConnectError: logging.error("Rails Check FAILED: No response from server on port %s.", port) all_ok = False - except requests.exceptions.Timeout: + except httpx.TimeoutException: logging.error("Rails Check FAILED: Connection timed out for port %s.", port) all_ok = False diff --git a/tests/benchmark/test_validate_mocks.py b/tests/benchmark/test_validate_mocks.py index 87fe64344..d8a86c1fa 100644 --- a/tests/benchmark/test_validate_mocks.py +++ b/tests/benchmark/test_validate_mocks.py @@ -19,10 +19,11 @@ Tests for validate_mocks.py script. """ +import json from unittest.mock import MagicMock, patch +import httpx import pytest -import requests from nemoguardrails.benchmark.validate_mocks import ( check_endpoint, @@ -34,7 +35,7 @@ class TestCheckEndpoint: """Tests for check_endpoint function.""" - @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") def test_check_endpoint_success(self, mock_get): """Test successful health and model checks.""" # Mock health check response @@ -61,7 +62,7 @@ def test_check_endpoint_success(self, mock_get): assert "8000" in summary assert mock_get.call_count == 2 - @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") def test_check_endpoint_health_check_failed_status(self, mock_get): """Test health check with non-200 status code.""" health_response = MagicMock() @@ -74,7 +75,7 @@ def test_check_endpoint_health_check_failed_status(self, mock_get): assert not success assert "FAILED" in summary - @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") def test_check_endpoint_health_check_unhealthy_status(self, mock_get): """Test health check with unhealthy status.""" health_response = MagicMock() @@ -92,12 +93,12 @@ def test_check_endpoint_health_check_unhealthy_status(self, mock_get): assert not success assert "FAILED" in summary - @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") def test_check_endpoint_health_check_json_decode_error(self, mock_get): """Test health check with invalid JSON.""" health_response = MagicMock() health_response.status_code = 200 - health_response.json.side_effect = requests.exceptions.JSONDecodeError( + health_response.json.side_effect = json.JSONDecodeError( "Expecting value", "", 0 ) @@ -108,10 +109,10 @@ def test_check_endpoint_health_check_json_decode_error(self, mock_get): assert not success assert "FAILED" in summary - @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") def test_check_endpoint_health_connection_error(self, mock_get): """Test health check with connection error.""" - mock_get.side_effect = requests.exceptions.ConnectionError() + mock_get.side_effect = httpx.ConnectError("Connection failed") success, summary = check_endpoint(8000, "test-model") @@ -119,10 +120,10 @@ def 
test_check_endpoint_health_connection_error(self, mock_get): assert "FAILED" in summary assert "Connection Error" in summary - @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") def test_check_endpoint_health_timeout(self, mock_get): """Test health check with timeout.""" - mock_get.side_effect = requests.exceptions.Timeout() + mock_get.side_effect = httpx.TimeoutException("Request timed out") success, summary = check_endpoint(8000, "test-model") @@ -130,7 +131,7 @@ def test_check_endpoint_health_timeout(self, mock_get): assert "FAILED" in summary assert "Connection Timeout" in summary - @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") def test_check_endpoint_model_check_failed_status(self, mock_get): """Test model check with non-200 status code.""" health_response = MagicMock() @@ -147,7 +148,7 @@ def test_check_endpoint_model_check_failed_status(self, mock_get): assert not success assert "FAILED" in summary - @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") def test_check_endpoint_model_not_found(self, mock_get): """Test model check when expected model is not in the list.""" health_response = MagicMock() @@ -170,7 +171,7 @@ def test_check_endpoint_model_not_found(self, mock_get): assert not success assert "FAILED" in summary - @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") def test_check_endpoint_model_check_json_decode_error(self, mock_get): """Test model check with invalid JSON.""" health_response = MagicMock() @@ -179,7 +180,7 @@ def test_check_endpoint_model_check_json_decode_error(self, mock_get): models_response = MagicMock() models_response.status_code = 200 - models_response.json.side_effect = requests.exceptions.JSONDecodeError( + models_response.json.side_effect = json.JSONDecodeError( "Expecting value", "", 0 ) @@ -190,7 +191,7 @@ def test_check_endpoint_model_check_json_decode_error(self, mock_get): assert not success assert "FAILED" in summary - @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") def test_check_endpoint_model_check_unexpected_json_structure(self, mock_get): """Test model check with unexpected JSON structure.""" health_response = MagicMock() @@ -209,28 +210,34 @@ def test_check_endpoint_model_check_unexpected_json_structure(self, mock_get): assert not success assert "FAILED" in summary - @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") def test_check_endpoint_model_check_connection_error(self, mock_get): """Test model check with connection error.""" health_response = MagicMock() health_response.status_code = 200 health_response.json.return_value = {"status": "healthy"} - mock_get.side_effect = [health_response, requests.exceptions.ConnectionError()] + mock_get.side_effect = [ + health_response, + httpx.ConnectError("Connection failed"), + ] success, summary = check_endpoint(8000, "test-model") assert not success assert "FAILED" in summary - @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") def test_check_endpoint_model_check_timeout(self, mock_get): """Test model check with timeout.""" health_response = MagicMock() health_response.status_code 
= 200 health_response.json.return_value = {"status": "healthy"} - mock_get.side_effect = [health_response, requests.exceptions.Timeout()] + mock_get.side_effect = [ + health_response, + httpx.TimeoutException("Request timed out"), + ] success, summary = check_endpoint(8000, "test-model") @@ -241,7 +248,7 @@ def test_check_endpoint_model_check_timeout(self, mock_get): class TestCheckRailsEndpoint: """Tests for check_rails_endpoint function.""" - @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") def test_check_rails_endpoint_success(self, mock_get): """Test successful rails config check.""" response = MagicMock() @@ -259,7 +266,7 @@ def test_check_rails_endpoint_success(self, mock_get): assert "PASSED" in summary assert "9000" in summary - @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") def test_check_rails_endpoint_non_200_status(self, mock_get): """Test rails config check with non-200 status.""" response = MagicMock() @@ -273,7 +280,7 @@ def test_check_rails_endpoint_non_200_status(self, mock_get): assert not success assert "FAILED" in summary - @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") def test_check_rails_endpoint_empty_list(self, mock_get): """Test rails config check with empty list response.""" response = MagicMock() @@ -287,7 +294,7 @@ def test_check_rails_endpoint_empty_list(self, mock_get): assert not success assert "FAILED" in summary - @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") def test_check_rails_endpoint_not_a_list(self, mock_get): """Test rails config check with non-list response.""" response = MagicMock() @@ -301,15 +308,13 @@ def test_check_rails_endpoint_not_a_list(self, mock_get): assert not success assert "FAILED" in summary - @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") def test_check_rails_endpoint_json_decode_error(self, mock_get): """Test rails config check with invalid JSON.""" response = MagicMock() response.status_code = 200 response.text = "invalid json" - response.json.side_effect = requests.exceptions.JSONDecodeError( - "Expecting value", "", 0 - ) + response.json.side_effect = json.JSONDecodeError("Expecting value", "", 0) mock_get.return_value = response @@ -318,20 +323,20 @@ def test_check_rails_endpoint_json_decode_error(self, mock_get): assert not success assert "FAILED" in summary - @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") def test_check_rails_endpoint_connection_error(self, mock_get): """Test rails config check with connection error.""" - mock_get.side_effect = requests.exceptions.ConnectionError() + mock_get.side_effect = httpx.ConnectError("Connection failed") success, summary = check_rails_endpoint(9000) assert not success assert "FAILED" in summary - @patch("nemoguardrails.benchmark.validate_mocks.requests.get") + @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") def test_check_rails_endpoint_timeout(self, mock_get): """Test rails config check with timeout.""" - mock_get.side_effect = requests.exceptions.Timeout() + mock_get.side_effect = httpx.TimeoutException("Request timed out") success, summary = check_rails_endpoint(9000) From 
83b40b78942a3783ed3613138c23d6664f80ef1f Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Thu, 6 Nov 2025 17:15:50 -0600 Subject: [PATCH 13/16] Revert "Add pyproject.toml 'benchmark' extra to install honcho and requests" This reverts commit 20f3726d4a97af90e04caa6c08b13fb75b022d25. --- pyproject.toml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4d69f1e5c..1d252a8bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -105,10 +105,6 @@ google-cloud-language = { version = ">=2.14.0", optional = true } # jailbreak injection yara-python = { version = "^4.5.1", optional = true } -# benchmark -requests = { version = ">=2.31.0", optional = true } -honcho = { version = ">=1.1.0", optional = true } - [tool.poetry.extras] sdd = ["presidio-analyzer", "presidio-anonymizer"] eval = ["tqdm", "numpy", "streamlit", "tornado"] @@ -117,7 +113,6 @@ gcp = ["google-cloud-language"] tracing = ["opentelemetry-api", "aiofiles"] nvidia = ["langchain-nvidia-ai-endpoints"] jailbreak = ["yara-python"] -benchmark = ["requests", "honcho"] # Poetry does not support recursive dependencies, so we need to add all the dependencies here. # I also support their decision. There is no PEP for recursive dependencies, but it has been supported in pip since version 21.2. # It is here for backward compatibility. @@ -133,8 +128,6 @@ all = [ "aiofiles", "langchain-nvidia-ai-endpoints", "yara-python", - "requests", - "honcho", ] [tool.poetry.group.dev] From 8f58d118630035dec589743f657f5634356d4972 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 10 Nov 2025 07:52:22 -0600 Subject: [PATCH 14/16] Removed commented line in Procfile --- nemoguardrails/benchmark/Procfile | 1 - 1 file changed, 1 deletion(-) diff --git a/nemoguardrails/benchmark/Procfile b/nemoguardrails/benchmark/Procfile index c698d4463..f177f52be 100644 --- a/nemoguardrails/benchmark/Procfile +++ b/nemoguardrails/benchmark/Procfile @@ -1,7 +1,6 @@ # Procfile # NeMo Guardrails server -# main: uvicorn nemoguardrails.server.api:app --workers 4 --port 9000 gr: poetry run nemoguardrails server --config configs/guardrail_configs --default-config-id content_safety_colang1 --port 9000 # Guardrails NIMs for inference From d5c6038816d4bfe97ec5b0e8de3caa18abb50f4a Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 10 Nov 2025 07:56:38 -0600 Subject: [PATCH 15/16] Add README explaining core-banchmarking --- nemoguardrails/benchmark/README.md | 159 +++++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 nemoguardrails/benchmark/README.md diff --git a/nemoguardrails/benchmark/README.md b/nemoguardrails/benchmark/README.md new file mode 100644 index 000000000..750fde0fe --- /dev/null +++ b/nemoguardrails/benchmark/README.md @@ -0,0 +1,159 @@ +# Guardrails Benchmarking + +NeMo Guardrails includes benchmarking tools to help users capacity-test their Guardrails applications. +Adding guardrails to an LLM-based application improves safety and security, while adding some latency. These benchmarks allow users to quantify the tradeoff between security and latency, to make data-driven decisions. +We currently have a simple testbench, which runs the Guardrails server with mocks as Guardrail and Application models. This can be used for performance-testing on a laptop without any GPUs, and run in a few minutes. 
+
+## Guardrails Core Benchmarking
+
+This benchmark measures the performance of the Guardrails application, running on a CPU-only laptop or instance.
+It doesn't require GPUs on which to run local models, or access to the internet to use models hosted by providers.
+All models use the [Mock LLM Server](mock_llm_server), which is a simplified model of an LLM used for inference.
+The aim of this benchmark is to detect performance-regressions as quickly as running unit-tests.
+
+## Quickstart: Running Guardrails with Mock LLMs
+To run Guardrails with mocks for both the content-safety and main LLM, follow the steps below. All commands must be run in the `nemoguardrails/benchmark` directory. These assume you already have a working environment after following the [contribution guidelines](../CONTRIBUTING.md).
+
+First, we need to install the honcho and langchain-nvidia-ai-endpoints packages.
+The `honcho` package is used to run Procfile-based applications, and is a Python port of [Foreman](https://github.com/ddollar/foreman).
+The `langchain-nvidia-ai-endpoints` package is used to communicate with Mock LLMs via LangChain.
+
+```shell
+# Install dependencies
+$ poetry run pip install honcho langchain-nvidia-ai-endpoints
+...
+Successfully installed filetype-1.2.0 honcho-2.0.0 langchain-nvidia-ai-endpoints-0.3.19
+```
+
+Now we can start up the processes that are part of the [Procfile](Procfile).
+As the Procfile processes spin up, they log to the console with a prefix. The `system` prefix is used by Honcho, `app_llm` is the Application or Main LLM mock, `cs_llm` is the content-safety mock, and `gr` is the Guardrails service. We'll explore the Procfile in more detail below.
+Once the three 'Uvicorn running on ...' messages are printed, you can move to the next step. Note that these messages are likely not on consecutive lines.
+
+```
+# All commands must be run in the nemoguardrails/benchmark directory
+$ cd nemoguardrails/benchmark
+$ poetry run honcho start
+13:40:33 system | gr.1 started (pid=93634)
+13:40:33 system | app_llm.1 started (pid=93635)
+13:40:33 system | cs_llm.1 started (pid=93636)
+...
+13:40:41 app_llm.1 | INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
+...
+13:40:41 cs_llm.1 | INFO: Uvicorn running on http://0.0.0.0:8001 (Press CTRL+C to quit)
+...
+13:40:45 gr.1 | INFO: Uvicorn running on http://0.0.0.0:9000 (Press CTRL+C to quit)
+```
+
+Once Guardrails and the mock servers are up, we can use the `validate_mocks.py` script to check they're healthy and serving the correct models.
+
+```shell
+$ cd nemoguardrails/benchmark
+$ poetry run python validate_mocks.py
+Starting LLM endpoint health check...
+
+--- Checking Port: 8000 ---
+Checking http://localhost:8000/health ...
+HTTP Request: GET http://localhost:8000/health "HTTP/1.1 200 OK"
+Health Check PASSED: Status is 'healthy'.
+Checking http://localhost:8000/v1/models for 'meta/llama-3.3-70b-instruct'...
+HTTP Request: GET http://localhost:8000/v1/models "HTTP/1.1 200 OK"
+Model Check PASSED: Found 'meta/llama-3.3-70b-instruct' in model list.
+--- Port 8000: ALL CHECKS PASSED ---
+
+--- Checking Port: 8001 ---
+Checking http://localhost:8001/health ...
+HTTP Request: GET http://localhost:8001/health "HTTP/1.1 200 OK"
+Health Check PASSED: Status is 'healthy'.
+Checking http://localhost:8001/v1/models for 'nvidia/llama-3.1-nemoguard-8b-content-safety'...
+HTTP Request: GET http://localhost:8001/v1/models "HTTP/1.1 200 OK"
+Model Check PASSED: Found 'nvidia/llama-3.1-nemoguard-8b-content-safety' in model list.
+--- Port 8001: ALL CHECKS PASSED --- + +--- Checking Port: 9000 (Rails Config) --- +Checking http://localhost:9000/v1/rails/configs ... +HTTP Request: GET http://localhost:9000/v1/rails/configs "HTTP/1.1 200 OK" +HTTP Status PASSED: Got 200. +Body Check PASSED: Response is an array with at least one entry. +--- Port 9000: ALL CHECKS PASSED --- + +--- Final Summary --- +Port 8000 (meta/llama-3.3-70b-instruct): PASSED +Port 8001 (nvidia/llama-3.1-nemoguard-8b-content-safety): PASSED +Port 9000 (Rails Config): PASSED +--------------------- +Overall Status: All endpoints are healthy! +``` + +Once the mocks and Guardrails are running and the script passes, we can issue curl requests against the Guardrails `/chat/completions` endpoint to generate a response and test the system end-to-end. + +```shell +curl -s -X POST http://0.0.0.0:9000/v1/chat/completions \ + -H 'Accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "meta/llama-3.3-70b-instruct", + "messages": [ + { + "role": "user", + "content": "what can you do for me?" + } + ], + "stream": false + }' | jq +{ + "messages": [ + { + "role": "assistant", + "content": "I can provide information and help with a wide range of topics, from science and history to entertainment and culture. I can also help with language-related tasks, such as translation and text summarization. However, I can't assist with requests that involve harm or illegal activities." + } + ] +} + +``` + +## Deep-Dive: Configuration + +In this section, we'll examine the configuration files used in the quickstart above. This gives more context on how the system works, and can be extended as needed. + +### Procfile + +The [Procfile](Procfile?raw=true) contains all the processes that make up the application. +The Honcho package reads in this file, starts all the processes, and combines their logs to the console +The `gr` line runs the Guardrails server on port 9000 and sets the default Guardrails configuration as [content_safety_colang1](configs/guardrail_configs/content_safety_colang1?raw=true). +The `app_llm` line runs the Application or Main Mock LLM. Guardrails calls this LLM to generate a response to the user's query. This server uses 4 uvicorn workers and runs on port 8000. The configuration file here is a Mock LLM configuration, not a Guardrails configuration. +The `cs_llm` line runs the Content-Safety Mock LLM. This uses 4 uvicorn workers and runs on port 8001. + +### Guardrails Configuration +The [Guardrails Configuration](configs/guardrail_configs/content_safety_colang1/config.yml) is used by the Guardrails server. +Under the `models` section, the `main` model is used to generate responses to the user queries. The base URL for this model is the `app_llm` Mock LLM from the Procfile, running on port 8000. The `model` field has to match the Mock LLM model name. +The `content_safety` model is configured for use in an input and output rail. The `type` field matches the `$model` used in the input and output flows. + +### Mock LLM Endpoints +The Mock LLM implements a subset of the OpenAI LLM API. +There are two Mock LLM configurations, one for the Mock [main model](configs/mock_configs/meta-llama-3.3-70b-instruct.env), and another for the Mock [content-safety](configs/mock_configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env) model. +The Mock LLM has the following OpenAI-compatible endpoints: + +* `/health`: Returns a JSON object with status set to healthy and timestamp in seconds-since-epoch. 
For example `{"status":"healthy","timestamp":1762781239}`
+* `/v1/models`: Returns the `MODEL` field from the Mock configuration (see below). For example `{"object":"list","data":[{"id":"meta/llama-3.3-70b-instruct","object":"model","created":1762781290,"owned_by":"system"}]}`
+* `/v1/completions`: Returns an [OpenAI completion object](https://platform.openai.com/docs/api-reference/completions/object) using the Mock configuration (see below).
+* `/v1/chat/completions`: Returns an [OpenAI chat completion object](https://platform.openai.com/docs/api-reference/chat/object) using the Mock configuration (see below).
+
+### Mock LLM Configuration
+Mock LLMs are configured using the `.env` file format. These files are passed to the Mock LLM using the `--config-file` argument.
+The Mock LLMs return either a `SAFE_TEXT` or `UNSAFE_TEXT` response to `/v1/completions` or `/v1/chat/completions` inference requests.
+The probability of the `UNSAFE_TEXT` being returned is given by `UNSAFE_PROBABILITY`.
+The latency of each response is also controllable, and works as follows:
+
+* Latency is first sampled from a normal distribution with mean `LATENCY_MEAN_SECONDS` and standard deviation `LATENCY_STD_SECONDS`.
+* If the sampled value is less than `LATENCY_MIN_SECONDS`, it is set to `LATENCY_MIN_SECONDS`.
+* If the sampled value is greater than `LATENCY_MAX_SECONDS`, it is set to `LATENCY_MAX_SECONDS`.
+
+The full list of configuration fields is shown below:
+* `MODEL`: The Model name served by the Mock LLM. This will be returned on the `/v1/models` endpoint.
+* `UNSAFE_PROBABILITY`: Probability of an unsafe response; must be in the range [0, 1].
+* `UNSAFE_TEXT`: String returned as an unsafe response.
+* `SAFE_TEXT`: String returned as a safe response.
+* `LATENCY_MIN_SECONDS`: Minimum latency in seconds.
+* `LATENCY_MAX_SECONDS`: Maximum latency in seconds.
+* `LATENCY_MEAN_SECONDS`: Normal distribution mean from which to sample latency.
+* `LATENCY_STD_SECONDS`: Normal distribution standard deviation from which to sample latency.

From 7eb70d1c58c2f74b76ecaaca58f82d12449ed892 Mon Sep 17 00:00:00 2001
From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com>
Date: Mon, 10 Nov 2025 08:12:46 -0600
Subject: [PATCH 16/16] Small README tweaks

---
 nemoguardrails/benchmark/README.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/nemoguardrails/benchmark/README.md b/nemoguardrails/benchmark/README.md
index 750fde0fe..914d1b47c 100644
--- a/nemoguardrails/benchmark/README.md
+++ b/nemoguardrails/benchmark/README.md
@@ -4,6 +4,8 @@ NeMo Guardrails includes benchmarking tools to help users capacity-test their Gu
 Adding guardrails to an LLM-based application improves safety and security, while adding some latency. These benchmarks allow users to quantify the tradeoff between security and latency, to make data-driven decisions.
 We currently have a simple testbench, which runs the Guardrails server with mocks as Guardrail and Application models. This can be used for performance-testing on a laptop without any GPUs, and run in a few minutes.
 
+-----
+
 ## Guardrails Core Benchmarking
@@ -12,7 +14,9 @@ All models use the [Mock LLM Server](mock_llm_server), which is a simplified mod
 The aim of this benchmark is to detect performance-regressions as quickly as running unit-tests.
 
## Quickstart: Running Guardrails with Mock LLMs -To run Guardrails with mocks for both the content-safety and main LLM, follow the steps below. All commands must be run in the `nemoguardrails/benchmark` directory. These assume you already have a working environment after following the [contribution guidelines](../CONTRIBUTING.md). +To run Guardrails with mocks for both the content-safety and main LLM, follow the steps below. +All commands must be run in the `nemoguardrails/benchmark` directory. +These assume you already have a working environment after following the steps in [CONTRIBUTING.md](../../CONTRIBUTING.md). First, we need to install the honcho and langchain-nvidia-ai-endpoints packages. The `honcho` package is used to run Procfile-based applications, and is a Python port of [Foreman](https://github.com/ddollar/foreman). @@ -111,6 +115,8 @@ curl -s -X POST http://0.0.0.0:9000/v1/chat/completions \ ``` +------ + ## Deep-Dive: Configuration In this section, we'll examine the configuration files used in the quickstart above. This gives more context on how the system works, and can be extended as needed.
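
To make the Mock LLM configuration fields described above concrete, a hypothetical `.env` file might look like the sketch below. The field names are the ones documented in the README's Mock LLM Configuration section; every value shown is an illustrative assumption and is not taken from the repository's actual `configs/mock_configs/` files.

```shell
# Hypothetical Mock LLM .env sketch -- illustrative values only
MODEL=meta/llama-3.3-70b-instruct
SAFE_TEXT="I can help with that."
UNSAFE_TEXT="I cannot help with that."
# Chance of returning UNSAFE_TEXT, in the range [0, 1]
UNSAFE_PROBABILITY=0.1
# Latency is drawn from Normal(mean, std) and clamped to [min, max]
LATENCY_MEAN_SECONDS=0.5
LATENCY_STD_SECONDS=0.1
LATENCY_MIN_SECONDS=0.1
LATENCY_MAX_SECONDS=2.0
```

Passing a file like this to a Mock LLM via `--config-file` would control how often the mock returns the unsafe response and how much simulated latency each call adds.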