Skip to content

Commit 05ea2a6

Browse files
olesho and claude committed
Refactor config.yml to use example files for different providers
Created separate example config files for different model providers:
- config.example.openai.yml: OpenAI models (now default)
- config.example.openrouter-gemini.yml: OpenRouter with Gemini
- config.example.openrouter-gpt.yml: OpenRouter with GPT

Main config.yml now defaults to OpenAI provider for reliability.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 1fb0bc1 commit 05ea2a6

File tree

4 files changed

+190
-41
lines changed

4 files changed

+190
-41
lines changed

evals/config.example.openai.yml

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Evaluation Framework Configuration
2+
# This configuration is shared across all evaluation runner scripts
3+
# Example configuration for OpenAI models
4+
5+
# API endpoint for the evaluation server
6+
api_endpoint: "http://localhost:8080"
7+
8+
# Model configurations for running evaluations
9+
# These models are sent to the agent for processing requests
10+
11+
main_model:
12+
provider: "openai"
13+
model_name: "gpt-5-mini"
14+
api_key: "${OPENAI_API_KEY}"
15+
16+
mini_model:
17+
provider: "openai"
18+
model_name: "gpt-5-nano"
19+
api_key: "${OPENAI_API_KEY}"
20+
21+
nano_model:
22+
provider: "openai"
23+
model_name: "gpt-5-nano"
24+
api_key: "${OPENAI_API_KEY}"
25+
26+
# Model configuration for judging evaluation responses
27+
# This model is used locally to assess the quality of agent responses
28+
29+
judge_model:
30+
provider: "openai"
31+
model_name: "gpt-5"
32+
api_key: "${OPENAI_API_KEY}"
33+
# temperature: 0.1 # GPT-5 doesn't support custom temperature
34+
35+
# Execution settings
36+
37+
execution:
38+
# Default number of evaluations to run per script execution
39+
default_limit: 20
40+
41+
# Timeout for API requests (seconds) - set to max for slow custom API
42+
timeout: 3600
43+
44+
# Number of concurrent evaluation requests
45+
concurrent_requests: 1
46+
47+
# Delay between requests (seconds)
48+
request_delay: 1
49+
50+
# Reporting settings
51+
52+
reporting:
53+
# Directory for storing evaluation reports
54+
reports_dir: "reports"
55+
56+
# Report format
57+
format: "csv"
58+
59+
# Include detailed judge reasoning in reports
60+
include_reasoning: true
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Evaluation Framework Configuration
2+
# This configuration is shared across all evaluation runner scripts
3+
# Example configuration for OpenRouter with Google Gemini models
4+
5+
# API endpoint for the evaluation server
6+
api_endpoint: "http://localhost:8080"
7+
8+
# Model configurations for running evaluations
9+
# These models are sent to the agent for processing requests
10+
11+
main_model:
12+
provider: "openrouter"
13+
model_name: "google/gemini-2.0-flash-exp:free"
14+
api_key: "${OPENROUTER_API_KEY}"
15+
16+
mini_model:
17+
provider: "openrouter"
18+
model_name: "google/gemini-2.0-flash-exp:free"
19+
api_key: "${OPENROUTER_API_KEY}"
20+
21+
nano_model:
22+
provider: "openrouter"
23+
model_name: "google/gemini-2.0-flash-exp:free"
24+
api_key: "${OPENROUTER_API_KEY}"
25+
26+
# Model configuration for judging evaluation responses
27+
# This model is used locally to assess the quality of agent responses
28+
29+
judge_model:
30+
provider: "openai"
31+
model_name: "gpt-5"
32+
api_key: "${OPENAI_API_KEY}"
33+
# temperature: 0.1 # GPT-5 doesn't support custom temperature
34+
35+
# Execution settings
36+
37+
execution:
38+
# Default number of evaluations to run per script execution
39+
default_limit: 20
40+
41+
# Timeout for API requests (seconds) - set to max for slow custom API
42+
timeout: 3600
43+
44+
# Number of concurrent evaluation requests
45+
concurrent_requests: 1
46+
47+
# Delay between requests (seconds)
48+
request_delay: 1
49+
50+
# Reporting settings
51+
52+
reporting:
53+
# Directory for storing evaluation reports
54+
reports_dir: "reports"
55+
56+
# Report format
57+
format: "csv"
58+
59+
# Include detailed judge reasoning in reports
60+
include_reasoning: true
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Evaluation Framework Configuration
2+
# This configuration is shared across all evaluation runner scripts
3+
# Example configuration for OpenRouter with GPT models
4+
5+
# API endpoint for the evaluation server
6+
api_endpoint: "http://localhost:8080"
7+
8+
# Model configurations for running evaluations
9+
# These models are sent to the agent for processing requests
10+
11+
main_model:
12+
provider: "openrouter"
13+
model_name: "openai/gpt-oss-20b:free"
14+
api_key: "${OPENROUTER_API_KEY}"
15+
16+
mini_model:
17+
provider: "openrouter"
18+
model_name: "openai/gpt-oss-20b:free"
19+
api_key: "${OPENROUTER_API_KEY}"
20+
21+
nano_model:
22+
provider: "openrouter"
23+
model_name: "openai/gpt-oss-20b:free"
24+
api_key: "${OPENROUTER_API_KEY}"
25+
26+
# Model configuration for judging evaluation responses
27+
# This model is used locally to assess the quality of agent responses
28+
29+
judge_model:
30+
provider: "openai"
31+
model_name: "gpt-5"
32+
api_key: "${OPENAI_API_KEY}"
33+
# temperature: 0.1 # GPT-5 doesn't support custom temperature
34+
35+
# Execution settings
36+
37+
execution:
38+
# Default number of evaluations to run per script execution
39+
default_limit: 20
40+
41+
# Timeout for API requests (seconds) - set to max for slow custom API
42+
timeout: 3600
43+
44+
# Number of concurrent evaluation requests
45+
concurrent_requests: 1
46+
47+
# Delay between requests (seconds)
48+
request_delay: 1
49+
50+
# Reporting settings
51+
52+
reporting:
53+
# Directory for storing evaluation reports
54+
reports_dir: "reports"
55+
56+
# Report format
57+
format: "csv"
58+
59+
# Include detailed judge reasoning in reports
60+
include_reasoning: true

evals/config.yml

Lines changed: 10 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -6,53 +6,22 @@ api_endpoint: "http://localhost:8080"
66

77
# Model configurations for running evaluations
88
# These models are sent to the agent for processing requests
9-
10-
# main_model:
11-
# provider: "openai"
12-
# model_name: "gpt-5-mini"
13-
# api_key: "${OPENAI_API_KEY}"
14-
15-
# mini_model:
16-
# provider: "openai"
17-
# model_name: "gpt-5-nano"
18-
# api_key: "${OPENAI_API_KEY}"
19-
20-
# nano_model:
21-
# provider: "openai"
22-
# model_name: "gpt-5-nano"
23-
# api_key: "${OPENAI_API_KEY}"
24-
25-
# main_model:
26-
# provider: "openrouter"
27-
# model_name: "openai/gpt-oss-20b:free"
28-
# # model_name: "tngtech/deepseek-r1t2-chimera:free"
29-
# api_key: "${OPENROUTER_API_KEY}"
30-
31-
# mini_model:
32-
# provider: "openrouter"
33-
# model_name: "openai/gpt-oss-20b:free"
34-
# api_key: "${OPENROUTER_API_KEY}"
35-
36-
# nano_model:
37-
# provider: "openrouter"
38-
# model_name: "openai/gpt-oss-20b:free"
39-
# api_key: "${OPENROUTER_API_KEY}"
9+
# See config.example.*.yml files for other provider/model configurations
4010

4111
main_model:
42-
provider: "openrouter"
43-
model_name: "google/gemini-2.0-flash-exp:free"
44-
# model_name: "tngtech/deepseek-r1t2-chimera:free"
45-
api_key: "${OPENROUTER_API_KEY}"
12+
provider: "openai"
13+
model_name: "gpt-5-mini"
14+
api_key: "${OPENAI_API_KEY}"
4615

4716
mini_model:
48-
provider: "openrouter"
49-
model_name: "google/gemini-2.0-flash-exp:free"
50-
api_key: "${OPENROUTER_API_KEY}"
17+
provider: "openai"
18+
model_name: "gpt-5-nano"
19+
api_key: "${OPENAI_API_KEY}"
5120

5221
nano_model:
53-
provider: "openrouter"
54-
model_name: "google/gemini-2.0-flash-exp:free"
55-
api_key: "${OPENROUTER_API_KEY}"
22+
provider: "openai"
23+
model_name: "gpt-5-nano"
24+
api_key: "${OPENAI_API_KEY}"
5625

5726
# Model configuration for judging evaluation responses
5827
# This model is used locally to assess the quality of agent responses

0 commit comments

Comments (0)