Commit ab3f458

Updated prompt injection check (#27)
* Updated prompt injection check
* Formatting changes
* Removed legacy code
* Update results doc
* Update dataset details
1 parent: c1d868b

12 files changed: +510 −180 lines

docs/ref/checks/prompt_injection_detection.md

Lines changed: 8 additions & 10 deletions
@@ -92,10 +92,8 @@ Returns a `GuardrailResult` with the following `info` dictionary:
This benchmark evaluates model performance on agent conversation traces:

- - **Synthetic dataset**: 1,000 samples with 500 positive cases (50% prevalence) simulating realistic agent traces
- - **AgentDojo dataset**: 1,046 samples from AgentDojo's workspace, travel, banking, and Slack suite combined with the "important_instructions" attack (949 positive cases, 97 negative samples)
- - **Test scenarios**: Multi-turn conversations with function calls and tool outputs across realistic workplace domains
- - **Misalignment examples**: Unrelated function calls, harmful operations, and data leakage
+ - **[AgentDojo dataset](https://github.com/ethz-spylab/agentdojo)**: 1,046 samples generated from running AgentDojo's benchmark script on workspace, travel, banking, and Slack suite combined with the "important_instructions" attack (949 positive cases, 97 negative samples)
+ - **Internal synthetic dataset**: 537 positive cases simulating realistic, multi-turn agent conversation traces

**Example of misaligned conversation:**

@@ -113,12 +111,12 @@ This benchmark evaluates model performance on agent conversation traces:
| Model | ROC AUC | Prec@R=0.80 | Prec@R=0.90 | Prec@R=0.95 | Recall@FPR=0.01 |
|---------------|---------|-------------|-------------|-------------|-----------------|
- | gpt-5 | 0.9604 | 0.998 | 0.995 | 0.963 | 0.431 |
- | gpt-5-mini | 0.9796 | 0.999 | 0.999 | 0.966 | 0.000 |
- | gpt-5-nano | 0.8651 | 0.963 | 0.963 | 0.951 | 0.056 |
- | gpt-4.1 | 0.9846 | 0.998 | 0.998 | 0.998 | 0.000 |
- | gpt-4.1-mini (default) | 0.9728 | 0.995 | 0.995 | 0.995 | 0.000 |
- | gpt-4.1-nano | 0.8677 | 0.974 | 0.974 | 0.974 | 0.000 |
+ | gpt-5 | 0.9931 | 0.9992 | 0.9992 | 0.9992 | 0.5845 |
+ | gpt-5-mini | 0.9536 | 0.9951 | 0.9951 | 0.9951 | 0.0000 |
+ | gpt-5-nano | 0.9283 | 0.9913 | 0.9913 | 0.9717 | 0.0350 |
+ | gpt-4.1 | 0.9794 | 0.9973 | 0.9973 | 0.9973 | 0.0000 |
+ | gpt-4.1-mini (default) | 0.9865 | 0.9986 | 0.9986 | 0.9986 | 0.0000 |
+ | gpt-4.1-nano | 0.9142 | 0.9948 | 0.9948 | 0.9387 | 0.0000 |

**Notes:**

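As an editorial aside: under the standard definitions of these metrics, the table's numbers can be recomputed from per-sample benchmark outputs. A minimal sketch using scikit-learn — `y_true` and `y_score` are illustrative names for the binary labels and guardrail confidence scores, and this helper is not part of the repo:

```python
import numpy as np
from sklearn.metrics import precision_recall_curve, roc_auc_score, roc_curve

def summarize(y_true: np.ndarray, y_score: np.ndarray) -> dict[str, float]:
    """Sketch of the table's metrics: ROC AUC, Prec@R, and Recall@FPR."""
    # Area under the ROC curve.
    auc = float(roc_auc_score(y_true, y_score))

    # Best precision achievable at a minimum recall target, e.g. Prec@R=0.90.
    precision, recall, _ = precision_recall_curve(y_true, y_score)
    prec_at_r90 = float(precision[recall >= 0.90].max())

    # Best recall (TPR) under a false-positive-rate budget, e.g. Recall@FPR=0.01.
    fpr, tpr, _ = roc_curve(y_true, y_score)
    recall_at_fpr01 = float(tpr[fpr <= 0.01].max())

    return {"roc_auc": auc, "prec@r=0.90": prec_at_r90, "recall@fpr=0.01": recall_at_fpr01}
```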
src/guardrails/checks/text/hallucination_detection.py

Lines changed: 31 additions & 21 deletions
@@ -52,7 +52,13 @@
5252
from guardrails.spec import GuardrailSpecMetadata
5353
from guardrails.types import GuardrailLLMContextProto, GuardrailResult
5454

55-
from .llm_base import LLMConfig, LLMOutput, _invoke_openai_callable
55+
from .llm_base import (
56+
LLMConfig,
57+
LLMErrorOutput,
58+
LLMOutput,
59+
_invoke_openai_callable,
60+
create_error_result,
61+
)
5662

5763
logger = logging.getLogger(__name__)
5864

@@ -232,39 +238,43 @@ async def hallucination_detection(
        )

    except ValueError as e:
-         # Log validation errors but return safe default
+         # Log validation errors and use shared error helper
        logger.warning(f"Validation error in hallucination_detection: {e}")
-         return GuardrailResult(
-             tripwire_triggered=False,
-             info={
-                 "guardrail_name": "Hallucination Detection",
-                 "flagged": False,
-                 "confidence": 0.0,
+         error_output = LLMErrorOutput(
+             flagged=False,
+             confidence=0.0,
+             info={"error_message": f"Validation failed: {str(e)}"},
+         )
+         return create_error_result(
+             guardrail_name="Hallucination Detection",
+             analysis=error_output,
+             checked_text=candidate,
+             additional_info={
+                 "threshold": config.confidence_threshold,
                "reasoning": f"Validation failed: {str(e)}",
                "hallucination_type": None,
                "hallucinated_statements": None,
                "verified_statements": None,
-                 "threshold": config.confidence_threshold,
-                 "error": str(e),
-                 "checked_text": candidate,  # Hallucination Detection doesn't modify text, pass through unchanged
            },
        )
    except Exception as e:
-         # Log unexpected errors and return safe default
+         # Log unexpected errors and use shared error helper
        logger.exception("Unexpected error in hallucination_detection")
-         return GuardrailResult(
-             tripwire_triggered=False,
-             info={
-                 "guardrail_name": "Hallucination Detection",
-                 "flagged": False,
-                 "confidence": 0.0,
+         error_output = LLMErrorOutput(
+             flagged=False,
+             confidence=0.0,
+             info={"error_message": str(e)},
+         )
+         return create_error_result(
+             guardrail_name="Hallucination Detection",
+             analysis=error_output,
+             checked_text=candidate,
+             additional_info={
+                 "threshold": config.confidence_threshold,
                "reasoning": f"Analysis failed: {str(e)}",
                "hallucination_type": None,
                "hallucinated_statements": None,
                "verified_statements": None,
-                 "threshold": config.confidence_threshold,
-                 "error": str(e),
-                 "checked_text": candidate,  # Hallucination Detection doesn't modify text, pass through unchanged
            },
        )

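Taken together, the `ValueError` branch now reads roughly as follows, reconstructed from the hunk above (the surrounding `try`/`except` context and exact indentation are assumed):

```python
    except ValueError as e:
        # Log validation errors and use shared error helper
        logger.warning(f"Validation error in hallucination_detection: {e}")
        error_output = LLMErrorOutput(
            flagged=False,
            confidence=0.0,
            info={"error_message": f"Validation failed: {str(e)}"},
        )
        return create_error_result(
            guardrail_name="Hallucination Detection",
            analysis=error_output,
            checked_text=candidate,
            additional_info={
                "threshold": config.confidence_threshold,
                "reasoning": f"Validation failed: {str(e)}",
                "hallucination_type": None,
                "hallucinated_statements": None,
                "verified_statements": None,
            },
        )
```

The generic `Exception` branch follows the same pattern, with `reasoning` set to `f"Analysis failed: {str(e)}"`. In both cases the tripwire stays untriggered and the result is marked `execution_failed=True`, so callers can tell an execution error apart from a clean pass.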
src/guardrails/checks/text/llm_base.py

Lines changed: 49 additions & 15 deletions
@@ -60,7 +60,13 @@ class MyLLMOutput(LLMOutput):
6060
logger = logging.getLogger(__name__)
6161

6262

63-
__all__ = ["LLMConfig", "LLMOutput", "LLMErrorOutput", "create_llm_check_fn"]
63+
__all__ = [
64+
"LLMConfig",
65+
"LLMOutput",
66+
"LLMErrorOutput",
67+
"create_llm_check_fn",
68+
"create_error_result",
69+
]
6470

6571

6672
class LLMConfig(BaseModel):
@@ -115,6 +121,44 @@ class LLMErrorOutput(LLMOutput):
    info: dict


+ def create_error_result(
+     guardrail_name: str,
+     analysis: LLMErrorOutput,
+     checked_text: str,
+     additional_info: dict[str, Any] | None = None,
+ ) -> GuardrailResult:
+     """Create a standardized GuardrailResult from an LLM error output.
+
+     Args:
+         guardrail_name: Name of the guardrail that failed.
+         analysis: The LLM error output.
+         checked_text: The text that was being checked.
+         additional_info: Optional additional fields to include in info dict.
+
+     Returns:
+         GuardrailResult with execution_failed=True.
+     """
+     error_info = getattr(analysis, "info", {})
+     error_message = error_info.get("error_message", "LLM execution failed")
+
+     result_info: dict[str, Any] = {
+         "guardrail_name": guardrail_name,
+         "checked_text": checked_text,
+         "error": error_message,
+         **analysis.model_dump(),
+     }
+
+     if additional_info:
+         result_info.update(additional_info)
+
+     return GuardrailResult(
+         tripwire_triggered=False,
+         execution_failed=True,
+         original_exception=Exception(error_message),
+         info=result_info,
+     )


def _build_full_prompt(system_prompt: str) -> str:
    """Assemble a complete LLM prompt with instructions and response schema.
@@ -334,20 +378,10 @@ async def guardrail_func(
        # Check if this is an error result
        if isinstance(analysis, LLMErrorOutput):
-             # Extract error information from the LLMErrorOutput
-             error_info = analysis.info if hasattr(analysis, "info") else {}
-             error_message = error_info.get("error_message", "LLM execution failed")
-
-             return GuardrailResult(
-                 tripwire_triggered=False,  # Don't trigger tripwire on execution errors
-                 execution_failed=True,
-                 original_exception=Exception(error_message),  # Create exception from error message
-                 info={
-                     "guardrail_name": name,
-                     "checked_text": data,
-                     "error": error_message,
-                     **analysis.model_dump(),
-                 },
+             return create_error_result(
+                 guardrail_name=name,
+                 analysis=analysis,
+                 checked_text=data,
            )

        # Compare severity levels

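With the helper exported from `llm_base`, any LLM-backed check can report execution failures the same way. A minimal usage sketch — the check name, `text`, and `threshold` arguments are illustrative rather than part of this commit, and the import path assumes the package layout under `src/`:

```python
from guardrails.checks.text.llm_base import LLMErrorOutput, create_error_result

def report_failure(text: str, threshold: float, exc: Exception):
    """Hypothetical helper showing the create_error_result calling pattern."""
    # Wrap the failure details in an LLMErrorOutput with a safe default verdict...
    error_output = LLMErrorOutput(
        flagged=False,
        confidence=0.0,
        info={"error_message": str(exc)},
    )
    # ...then convert it to a GuardrailResult marked execution_failed=True.
    # The tripwire stays untriggered, so errors are never treated as detections.
    return create_error_result(
        guardrail_name="My Custom Check",
        analysis=error_output,
        checked_text=text,
        additional_info={"threshold": threshold},
    )
```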