diff --git a/tools/make/linter.mk b/tools/make/linter.mk
index a252d4ac4..23548b361 100644
--- a/tools/make/linter.mk
+++ b/tools/make/linter.mk
@@ -4,14 +4,6 @@

 ##@ Linter

-docs-lint: docs-install ## Lint documentation in website/
-	@$(LOG_TARGET)
-	cd website && npm run lint
-
-docs-lint-fix: docs-install ## Auto-fix documentation lint issues in website/
-	@$(LOG_TARGET)
-	cd website && npm run lint:fix
-
 markdown-lint: ## Lint all markdown files in the project
 	@$(LOG_TARGET)
 	markdownlint -c tools/linter/markdown/markdownlint.yaml "**/*.md" \
diff --git a/website/docs/installation/configuration.md b/website/docs/installation/configuration.md
index 18742ee35..03b5b5ccc 100644
--- a/website/docs/installation/configuration.md
+++ b/website/docs/installation/configuration.md
@@ -15,14 +15,14 @@ The configuration file is located at `config/config.yaml`. Here's the structure

 # BERT model for semantic similarity
 bert_model:
-  model_id: sentence-transformers/all-MiniLM-L12-v2
+  model_id: models/all-MiniLM-L12-v2
   threshold: 0.6
   use_cpu: true

 # Semantic caching
 semantic_cache:
   backend_type: "memory"  # Options: "memory" or "milvus"
-  enabled: false
+  enabled: true
   similarity_threshold: 0.8  # Global default threshold
   max_entries: 1000
   ttl_seconds: 3600
@@ -30,7 +30,7 @@ semantic_cache:

 # Tool auto-selection
 tools:
-  enabled: false
+  enabled: true
   top_k: 3
   similarity_threshold: 0.2
   tools_db_path: "config/tools_db.json"
@@ -38,7 +38,7 @@ tools:

 # Jailbreak protection
 prompt_guard:
-  enabled: false  # Global default - can be overridden per category
+  enabled: true  # Global default - can be overridden per category
   use_modernbert: true
   model_id: "models/jailbreak_classifier_modernbert-base_model"
   threshold: 0.7
@@ -330,31 +330,23 @@ categories:
 Configure how different models handle reasoning mode syntax. This allows you to add new models without code changes:

 ```yaml
-# Model reasoning configurations - define how different models handle reasoning syntax
-model_reasoning_configs:
-  - name: "deepseek"
-    patterns: ["deepseek", "ds-", "ds_", "ds:", "ds "]
-    reasoning_syntax:
-      type: "chat_template_kwargs"
-      parameter: "thinking"
-
-  - name: "qwen3"
-    patterns: ["qwen3"]
-    reasoning_syntax:
-      type: "chat_template_kwargs"
-      parameter: "enable_thinking"
-
-  - name: "gpt-oss"
-    patterns: ["gpt-oss", "gpt_oss"]
-    reasoning_syntax:
-      type: "reasoning_effort"
-      parameter: "reasoning_effort"
-
-  - name: "gpt"
-    patterns: ["gpt"]
-    reasoning_syntax:
-      type: "reasoning_effort"
-      parameter: "reasoning_effort"
+# Reasoning family configurations - define how different model families handle reasoning syntax
+reasoning_families:
+  deepseek:
+    type: "chat_template_kwargs"
+    parameter: "thinking"
+
+  qwen3:
+    type: "chat_template_kwargs"
+    parameter: "enable_thinking"
+
+  gpt-oss:
+    type: "reasoning_effort"
+    parameter: "reasoning_effort"
+
+  gpt:
+    type: "reasoning_effort"
+    parameter: "reasoning_effort"

 # Global default reasoning effort level (when not specified per category)
 default_reasoning_effort: "medium"
@@ -364,46 +356,40 @@ default_reasoning_effort: "medium"

 **Configuration Structure:**

-- `name`: A unique identifier for the model family
-- `patterns`: Array of patterns to match against model names
-- `reasoning_syntax.type`: How the model expects reasoning mode to be specified
+- `name`: A unique identifier for the model family (e.g., "deepseek", "qwen3")
+- `type`: How the model expects reasoning mode to be specified
   - `"chat_template_kwargs"`: Use chat template parameters (for models like DeepSeek, Qwen3)
   - `"reasoning_effort"`: Use OpenAI-compatible reasoning_effort field (for GPT models)
-- `reasoning_syntax.parameter`: The specific parameter name the model uses
+- `parameter`: The specific parameter name the model uses

 **Pattern Matching:**

-The system supports both simple string patterns and regular expressions for flexible model matching:
+The system supports model family names that are matched against model configurations:

-- **Simple string matches**: `"deepseek"` matches any model containing "deepseek"
-- **Prefix patterns**: `"ds-"` matches models starting with "ds-" or exactly "ds"
-- **Regular expressions**: `"^gpt-4.*"` matches models starting with "gpt-4"
-- **Wildcard**: `"*"` matches all models (use for fallback configurations)
-- **Multiple patterns**: `["deepseek", "ds-", "^phi.*"]` matches any of these patterns
+- **Family names**: `"deepseek"`, `"qwen3"`, `"gpt-oss"`, `"gpt"`
+- Models are assigned to families via `model_config[model_name].reasoning_family`
+- Unknown models will have no reasoning fields applied when reasoning mode is enabled

-**Regex Pattern Examples:**
+**Adding New Models:**
+To support a new model family (e.g., Claude), simply add a new configuration:

 ```yaml
-patterns:
-  - "^gpt-4.*"      # Models starting with "gpt-4"
-  - ".*-instruct$"  # Models ending with "-instruct"
-  - "phi[0-9]+"     # Models like "phi3", "phi4", etc.
- - "^(llama|mistral)" # Models starting with "llama" or "mistral" +reasoning_families: + claude: + type: "chat_template_kwargs" + parameter: "enable_reasoning" ``` -**Adding New Models:** -To support a new model family (e.g., Claude), simply add a new configuration: +Then assign your model to the family: ```yaml -model_reasoning_configs: - - name: "claude" - patterns: ["claude"] - reasoning_syntax: - type: "chat_template_kwargs" - parameter: "enable_reasoning" +model_config: + "claude-3-opus": + reasoning_family: "claude" + preferred_endpoints: ["endpoint1"] ``` **Unknown Models:** -Models that don't match any configured pattern will have no reasoning fields applied when reasoning mode is enabled. This prevents issues with models that don't support reasoning syntax. +Models that don't have a `reasoning_family` assigned will have no reasoning fields applied when reasoning mode is enabled. This prevents issues with models that don't support reasoning syntax. **Default Reasoning Effort:** Set the global default reasoning effort level used when categories don't specify their own effort level: @@ -505,7 +491,7 @@ tools: # BERT Model for Similarity bert_model: - model_id: sentence-transformers/all-MiniLM-L12-v2 + model_id: models/all-MiniLM-L12-v2 threshold: 0.6 # Similarity threshold use_cpu: true # CPU-only inference @@ -974,27 +960,31 @@ make test-prompt-guard # Jailbreak protection **Model not getting reasoning fields:** -- Check that the model name matches a pattern in `model_reasoning_configs` -- Verify the pattern syntax (exact matches vs prefixes) -- Unknown models will have no reasoning fields applied (this is by design) +- Check that the model has a `reasoning_family` assigned in `model_config` +- Verify the reasoning family exists in `reasoning_families` configuration +- Unknown models (without `reasoning_family`) will have no reasoning fields applied (this is by design) **Wrong reasoning syntax applied:** -- Ensure the `reasoning_syntax.type` matches your model's expected format -- Check the `reasoning_syntax.parameter` name is correct +- Ensure the `type` field in `reasoning_families` matches your model's expected format +- Check the `parameter` name is correct for your model family - DeepSeek models typically use `chat_template_kwargs` with `"thinking"` - GPT models typically use `reasoning_effort` **Adding support for new models:** ```yaml -# Add a new model configuration -model_reasoning_configs: - - name: "my-new-model" - patterns: ["my-model"] - reasoning_syntax: - type: "chat_template_kwargs" # or "reasoning_effort" - parameter: "custom_parameter" +# Add a new reasoning family +reasoning_families: + my-new-family: + type: "chat_template_kwargs" # or "reasoning_effort" + parameter: "custom_parameter" + +# Assign model to the family +model_config: + "my-model": + reasoning_family: "my-new-family" + preferred_endpoints: ["endpoint1"] ``` **Testing model reasoning configuration:** diff --git a/website/docs/overview/categories/configuration.md b/website/docs/overview/categories/configuration.md index 6ed2ad3cc..b985d59c8 100644 --- a/website/docs/overview/categories/configuration.md +++ b/website/docs/overview/categories/configuration.md @@ -13,12 +13,10 @@ categories: - name: "category_name" description: "Optional description" system_prompt: "Category-specific system prompt" - use_reasoning: true|false - reasoning_description: "Why reasoning is needed" - reasoning_effort: "low|medium|high" model_scores: - model: "model_name" score: 0.0-1.0 + use_reasoning: true|false # Per-model 
reasoning setting ``` ## Configuration Parameters @@ -148,58 +146,42 @@ categories: - **0.6-0.8**: Standard categories (general queries) - **0.4-0.6**: Technical categories (code generation, development tools) -#### `use_reasoning` (Required) - -- **Type**: Boolean -- **Description**: Whether to enable reasoning mode for this category -- **Default**: `false` -- **Impact**: Enables step-by-step problem solving - -```yaml -categories: - - name: "math" - use_reasoning: true # Enable reasoning for math problems -``` +### Model Scoring -#### `reasoning_description` (Optional) +#### `model_scores` (Required) -- **Type**: String -- **Description**: Explanation of why reasoning is needed -- **Purpose**: Documentation and model context -- **Best Practice**: Provide clear justification +- **Type**: Array of model-score pairs +- **Description**: Defines model preferences and reasoning settings for this category +- **Purpose**: Intelligent model selection based on domain expertise ```yaml categories: - - name: "chemistry" - use_reasoning: true - reasoning_description: "Chemical reactions require systematic analysis" + - name: "math" + model_scores: + - model: "phi4" + score: 1.0 # Highest preference + use_reasoning: true # Enable reasoning for this model on math + - model: "mistral-small3.1" + score: 0.8 + use_reasoning: false # No reasoning for this model ``` -#### `reasoning_effort` (Optional) +#### `use_reasoning` (Model-Level, Required) -- **Type**: String -- **Valid Values**: `"low"`, `"medium"`, `"high"` -- **Default**: `"medium"` -- **Description**: Controls the depth of reasoning +- **Type**: Boolean +- **Location**: Within each `model_scores` entry +- **Description**: Whether to enable reasoning mode for this specific model in this category +- **Default**: `false` +- **Impact**: Enables step-by-step problem solving for that model ```yaml categories: - name: "math" - use_reasoning: true - reasoning_effort: "high" # Maximum reasoning depth + model_scores: + - model: "phi4" + score: 1.0 + use_reasoning: true # Enable reasoning for phi4 on math problems ``` - -**Reasoning Effort Levels**: - -- **Low**: Basic step-by-step thinking (1-3 steps) -- **Medium**: Moderate analysis (3-7 steps) -- **High**: Deep reasoning (7-15 steps) - -### Model Scoring - -#### `model_scores` (Required) - -- **Type**: Array of model-score pairs - **Description**: Defines model preferences for this category - **Purpose**: Intelligent model selection based on domain expertise @@ -231,16 +213,16 @@ categories: categories: - name: "math" description: "Mathematical problems requiring step-by-step reasoning" - use_reasoning: true - reasoning_description: "Mathematical problems require systematic analysis" - reasoning_effort: "high" model_scores: - model: "phi4" score: 1.0 + use_reasoning: true # Enable reasoning for phi4 on math - model: "mistral-small3.1" score: 0.8 + use_reasoning: true # Enable reasoning for mistral on math - model: "gemma3:27b" score: 0.6 + use_reasoning: false # No reasoning for gemma on math ``` ### Example 2: Professional Category (Reasoning Disabled) @@ -249,16 +231,16 @@ categories: categories: - name: "business" description: "Business strategy and management discussions" - use_reasoning: false - reasoning_description: "Business content is typically conversational" - reasoning_effort: "low" model_scores: - model: "phi4" score: 0.8 + use_reasoning: false # Business doesn't need reasoning - model: "gemma3:27b" score: 0.4 + use_reasoning: false - model: "mistral-small3.1" score: 0.2 + use_reasoning: 
false ``` ### Example 3: Security-Focused Configuration (Jailbreak Protection) @@ -270,36 +252,38 @@ categories: description: "Customer support and general inquiries" jailbreak_enabled: true # Strict jailbreak protection jailbreak_threshold: 0.9 # High threshold for public-facing - use_reasoning: false model_scores: - model: "phi4" score: 0.9 + use_reasoning: false - model: "mistral-small3.1" score: 0.7 + use_reasoning: false # Technical category with relaxed threshold - name: "code_generation" description: "Code generation for developers" jailbreak_enabled: true # Keep enabled jailbreak_threshold: 0.5 # Lower threshold to reduce false positives on code - use_reasoning: true - reasoning_effort: "medium" model_scores: - model: "gemma3:27b" score: 0.9 + use_reasoning: true # Enable reasoning for code - model: "phi4" score: 0.7 + use_reasoning: true # General category using global default - name: "general" description: "General queries" # jailbreak_enabled not specified - inherits from global prompt_guard.enabled - use_reasoning: false model_scores: - model: "phi4" score: 0.6 + use_reasoning: false - model: "mistral-small3.1" score: 0.6 + use_reasoning: false ``` ### Example 4: Multi-Category Configuration @@ -308,51 +292,53 @@ categories: categories: # Technical categories with reasoning - name: "computer science" - use_reasoning: true - reasoning_description: "Programming requires logical analysis" - reasoning_effort: "medium" model_scores: - model: "gemma3:27b" score: 0.6 + use_reasoning: true # Enable reasoning for coding - model: "mistral-small3.1" score: 0.6 + use_reasoning: true - model: "phi4" score: 0.0 + use_reasoning: false - name: "physics" - use_reasoning: true - reasoning_description: "Physics concepts need systematic thinking" - reasoning_effort: "medium" model_scores: - model: "gemma3:27b" score: 0.4 + use_reasoning: true # Enable reasoning for physics - model: "phi4" score: 0.4 + use_reasoning: true - model: "mistral-small3.1" score: 0.4 + use_reasoning: true # General categories without reasoning - name: "history" - use_reasoning: false - reasoning_description: "Historical content is narrative-based" model_scores: - model: "mistral-small3.1" score: 0.8 + use_reasoning: false # History is narrative-based - model: "phi4" score: 0.6 + use_reasoning: false - model: "gemma3:27b" score: 0.4 + use_reasoning: false - name: "other" - use_reasoning: false - reasoning_description: "General content doesn't require reasoning" model_scores: - model: "gemma3:27b" score: 0.8 + use_reasoning: false # General content doesn't need reasoning - model: "phi4" score: 0.6 + use_reasoning: false - model: "mistral-small3.1" score: 0.6 + use_reasoning: false ``` ## Configuration Best Practices @@ -393,24 +379,36 @@ categories: # Reasoning recommended for: categories: - name: "math" - use_reasoning: true - reasoning_effort: "high" + model_scores: + - model: "phi4" + score: 1.0 + use_reasoning: true # Enable reasoning for math - name: "computer science" - use_reasoning: true - reasoning_effort: "medium" + model_scores: + - model: "gemma3:27b" + score: 0.6 + use_reasoning: true # Enable reasoning for coding - name: "chemistry" - use_reasoning: true - reasoning_effort: "high" + model_scores: + - model: "phi4" + score: 0.6 + use_reasoning: true # Enable reasoning for chemistry # Reasoning not needed for: categories: - name: "business" - use_reasoning: false + model_scores: + - model: "phi4" + score: 0.7 + use_reasoning: false # Business doesn't need reasoning - name: "history" - use_reasoning: false + 
model_scores: + - model: "mistral-small3.1" + score: 0.7 + use_reasoning: false # History is narrative ``` ### 3. Performance Tuning @@ -421,24 +419,24 @@ categories: # High-performance setup (lower latency) categories: - name: "math" - use_reasoning: true - reasoning_effort: "medium" # Reduced from "high" model_scores: - model: "phi4" score: 1.0 + use_reasoning: false # Disable reasoning for speed - model: "mistral-small3.1" score: 0.6 # Larger gap for faster selection + use_reasoning: false # High-accuracy setup (higher latency) categories: - name: "math" - use_reasoning: true - reasoning_effort: "high" # Maximum reasoning model_scores: - model: "phi4" score: 1.0 + use_reasoning: true # Enable reasoning for accuracy - model: "mistral-small3.1" score: 0.9 # Close scores for better fallback + use_reasoning: true ``` ## Classifier Configuration @@ -569,11 +567,10 @@ routing_rules: categories: - name: "math" system_prompt: "You are a mathematics expert. Provide step-by-step solutions." - use_reasoning: true - reasoning_effort: "high" model_scores: - model: "phi4" score: 1.0 + use_reasoning: true # Per-model reasoning setting ``` ## Complete Configuration Example @@ -583,31 +580,26 @@ categories: - name: "math" description: "Mathematical problems and calculations" system_prompt: "You are a mathematics expert. Provide step-by-step solutions, show your work clearly, and explain mathematical concepts in an understandable way." - use_reasoning: true - reasoning_effort: "high" model_scores: - model: "openai/gpt-oss-20b" score: 0.9 - use_reasoning: true + use_reasoning: true # Enable reasoning for this model - name: "computer science" description: "Programming and software engineering" system_prompt: "You are a computer science expert. Provide clear, practical solutions with code examples when helpful." - use_reasoning: true - reasoning_effort: "medium" model_scores: - model: "openai/gpt-oss-20b" score: 0.8 - use_reasoning: true + use_reasoning: true # Enable reasoning for coding - name: "business" description: "Business strategy and management" system_prompt: "You are a professional business consultant. Provide practical, actionable advice." 
-    use_reasoning: false
     model_scores:
       - model: "openai/gpt-oss-20b"
         score: 0.7
-        use_reasoning: false
+        use_reasoning: false  # Business doesn't need reasoning
 ```

 ## Next Steps
diff --git a/website/docs/overview/categories/technical-details.md b/website/docs/overview/categories/technical-details.md
index 54c169dc8..3e3264266 100644
--- a/website/docs/overview/categories/technical-details.md
+++ b/website/docs/overview/categories/technical-details.md
@@ -431,10 +431,10 @@ func (c *Classifier) HealthCheck() error {
 ```yaml
 categories:
   - name: "new_category"
-    use_reasoning: false
     model_scores:
       - model: "best-model-for-category"
         score: 1.0
+        use_reasoning: false  # Set per-model reasoning
 ```

 ### Custom Classification Models
diff --git a/website/docs/proposals/nvidia-dynamo-integration.md b/website/docs/proposals/nvidia-dynamo-integration.md
index a6933ec37..281d65d1e 100644
--- a/website/docs/proposals/nvidia-dynamo-integration.md
+++ b/website/docs/proposals/nvidia-dynamo-integration.md
@@ -336,7 +336,7 @@ The Semantic Router leverages **four specialized deep learning models** for inte
 ```yaml
 bert_model:
-  model_id: sentence-transformers/all-MiniLM-L12-v2
+  model_id: models/all-MiniLM-L12-v2
   threshold: 0.6
   use_cpu: true
 ```
diff --git a/website/docs/training/model-performance-eval.md b/website/docs/training/model-performance-eval.md
index 529c54815..2f9ef86dd 100644
--- a/website/docs/training/model-performance-eval.md
+++ b/website/docs/training/model-performance-eval.md
@@ -271,7 +271,7 @@ see more about config at [configuration](https://vllm-semantic-router.com/docs/i
 ```yaml
 bert_model:
-  model_id: sentence-transformers/all-MiniLM-L12-v2
+  model_id: models/all-MiniLM-L12-v2
   threshold: 0.6
   use_cpu: true
 semantic_cache:
@@ -310,35 +310,32 @@ classifier:
     pii_mapping_path: models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json
 categories:
 - name: business
-  use_reasoning: false
-  reasoning_description: Business content is typically conversational
-  reasoning_effort: low
   model_scores:
   - model: phi4
     score: 0.2
+    use_reasoning: false  # Business doesn't need reasoning
   - model: qwen3-0.6B
     score: 0.0
+    use_reasoning: false
 - name: law
-  use_reasoning: false
-  reasoning_description: Legal content is typically explanatory
-  reasoning_effort: medium
   model_scores:
   - model: phi4
     score: 0.8
+    use_reasoning: false  # Legal content is explanatory
   - model: qwen3-0.6B
     score: 0.2
+    use_reasoning: false
 # Ignore some categories here
 - name: engineering
-  use_reasoning: true
-  reasoning_description: Engineering problems require systematic problem-solving
-  reasoning_effort: high
   model_scores:
   - model: phi4
     score: 0.6
+    use_reasoning: true  # Engineering requires problem-solving
   - model: qwen3-0.6B
     score: 0.2
+    use_reasoning: true
 default_reasoning_effort: medium
 default_model: phi4
 ```
diff --git a/website/docs/tutorials/intelligent-route/reasoning.md b/website/docs/tutorials/intelligent-route/reasoning.md
index 29b4774cb..719bbb98c 100644
--- a/website/docs/tutorials/intelligent-route/reasoning.md
+++ b/website/docs/tutorials/intelligent-route/reasoning.md
@@ -66,19 +66,19 @@ model_config:
     reasoning_family: "gpt-oss"
     preferred_endpoints: ["endpoint1"]

-# Categories: which kinds of queries require reasoning and at what effort
+# Categories: which models to use for each type of query, with per-model reasoning settings
 categories:
   - name: math
-    use_reasoning: true
-    reasoning_effort: high  # overrides default_reasoning_effort
-    reasoning_description: "Mathematical problems require step-by-step reasoning"
     model_scores:
       - model: openai/gpt-oss-20b
        score: 1.0
+        use_reasoning: true  # Enable reasoning for this model on math
       - model: deepseek-v31
         score: 0.8
+        use_reasoning: true
       - model: qwen3-30b
         score: 0.8
+        use_reasoning: true

 # A safe default when no category is confidently selected
@@ -87,11 +87,10 @@ default_model: qwen3-30b

 Notes

-- Reasoning is controlled by categories.use_reasoning and optionally categories.reasoning_effort.
-- A model only gets reasoning fields if it has a model_config.<MODEL>.reasoning_family that maps to a reasoning_families entry.
+- Reasoning is controlled per-model within `model_scores` using the `use_reasoning` field.
+- A model only gets reasoning fields if it has a `model_config.<MODEL>.reasoning_family` that maps to a `reasoning_families` entry.
 - DeepSeek/Qwen3 (chat_template_kwargs): the router injects chat_template_kwargs only when reasoning is enabled. When disabled, no chat_template_kwargs are added.
-- GPT/GPT-OSS (reasoning_effort): when reasoning is enabled, the router sets reasoning_effort based on the category (fallback to default_reasoning_effort). When reasoning is disabled, if the request already contains reasoning_effort and the model’s family type is reasoning_effort, the router preserves the original value; otherwise it is absent.
-- Category descriptions (for example, description and reasoning_description) are informational only today; they do not affect routing or classification.
+- GPT/GPT-OSS (reasoning_effort): when reasoning is enabled, the router sets reasoning_effort based on the global `default_reasoning_effort`. When reasoning is disabled, if the request already contains reasoning_effort and the model's family type is reasoning_effort, the router preserves the original value; otherwise it is absent.
 - Categories must be from MMLU-Pro at the moment; avoid free-form categories like "general". If you want generic categories, consider opening an issue to map them to MMLU-Pro.

 2) Start the router