feat(kubevirt): add VM troubleshooting tool with diagnostic plan template

lyarwood · lyarwood · commit 2e488730e855 · 2025-11-04T17:53:17.000Z
Implements a new troubleshoot tool for VirtualMachines in the kubevirt toolset.
The tool renders a step-by-step diagnostic plan for the named VM from a template,
covering the VM status, conditions, and common issues to check with the other
tools, helping users identify and resolve VM problems efficiently.

Adds:
- Troubleshoot tool implementation with structured diagnostic output
- Template-based diagnostic plan generation
- Comprehensive test coverage for the troubleshoot functionality

Assisted-By: Claude <noreply@anthropic.com>
Signed-off-by: Lee Yarwood <lyarwood@redhat.com>
diff --git a/pkg/toolsets/kubevirt/toolset.go b/pkg/toolsets/kubevirt/toolset.go
@@ -7,6 +7,7 @@ import (
 	internalk8s "github.com/containers/kubernetes-mcp-server/pkg/kubernetes"
 	"github.com/containers/kubernetes-mcp-server/pkg/toolsets"
 	vm_create "github.com/containers/kubernetes-mcp-server/pkg/toolsets/kubevirt/vm/create"
+	vm_troubleshoot "github.com/containers/kubernetes-mcp-server/pkg/toolsets/kubevirt/vm/troubleshoot"
 )
 
 type Toolset struct{}
@@ -24,6 +25,7 @@ func (t *Toolset) GetDescription() string {
 func (t *Toolset) GetTools(o internalk8s.Openshift) []api.ServerTool {
 	return slices.Concat(
 		vm_create.Tools(),
+		vm_troubleshoot.Tools(),
 	)
 }
 
diff --git a/pkg/toolsets/kubevirt/vm/troubleshoot/plan.tmpl b/pkg/toolsets/kubevirt/vm/troubleshoot/plan.tmpl
@@ -0,0 +1,151 @@
+# VirtualMachine Troubleshooting Guide
+
+## VM: {{.Name}} (namespace: {{.Namespace}})
+
+Follow these steps to diagnose issues with the VirtualMachine:
+
+---
+
+## Step 1: Check VirtualMachine Status
+
+Use the `resources_get` tool to inspect the VirtualMachine:
+- **apiVersion**: `kubevirt.io/v1`
+- **kind**: `VirtualMachine`
+- **namespace**: `{{.Namespace}}`
+- **name**: `{{.Name}}`
+
+**What to look for:**
+- `status.printableStatus` - Should be "Running" for a healthy VM
+- `status.ready` - Should be `true`
+- `status.conditions` - Look for conditions with `status: "False"` or error messages
+- `spec.runStrategy` - Check if it's "Always", "Manual", "Halted", or "RerunOnFailure"
+
+---
+
+## Step 2: Check VirtualMachineInstance Status
+
+If the VM exists but isn't running, check if a VirtualMachineInstance was created:
+
+Use the `resources_get` tool:
+- **apiVersion**: `kubevirt.io/v1`
+- **kind**: `VirtualMachineInstance`
+- **namespace**: `{{.Namespace}}`
+- **name**: `{{.Name}}`
+
+**What to look for:**
+- `status.phase` - Should be "Running" for a healthy VMI
+- `status.conditions` - Check for "Ready" condition with `status: "True"`
+- `status.guestOSInfo` - Confirms guest agent is running
+- If VMI doesn't exist and VM runStrategy is "Always", this indicates a problem
+
+---
+
+## Step 3: Check DataVolume Status (if applicable)
+
+If the VM uses DataVolumeTemplates, check their status:
+
+Use the `resources_list` tool:
+- **apiVersion**: `cdi.kubevirt.io/v1beta1`
+- **kind**: `DataVolume`
+- **namespace**: `{{.Namespace}}`
+
+Look for the DataVolumes named in the VM's `spec.dataVolumeTemplates[].metadata.name` (these often start with `{{.Name}}-`)
+
+**What to look for:**
+- `status.phase` - Should be "Succeeded" when ready
+- `status.progress` - Shows import/clone progress (e.g., "100.0%")
+- Common issues:
+  - Phase "Pending" - Waiting for resources
+  - Phase "ImportScheduled" or "ImportInProgress" - Still importing
+  - Phase "Failed" - Check `status.conditions` for error details
+
+---
+
+## Step 4: Check virt-launcher Pod
+
+The virt-launcher pod runs the actual VM. Find and inspect it:
+
+Use the `pods_list_in_namespace` tool:
+- **namespace**: `{{.Namespace}}`
+- **labelSelector**: `kubevirt.io=virt-launcher,vm.kubevirt.io/name={{.Name}}`
+
+**What to look for:**
+- Pod should be in "Running" phase
+- All containers should be ready (e.g., "2/2")
+- Check pod events and conditions for errors
+
+If pod exists, get detailed status with `pods_get`:
+- **namespace**: `{{.Namespace}}`
+- **name**: `virt-launcher-{{.Name}}-xxxxx` (use actual pod name from list)
+
+Get pod logs with `pods_log`:
+- **namespace**: `{{.Namespace}}`
+- **name**: `virt-launcher-{{.Name}}-xxxxx`
+- **container**: `compute` (main VM container)
+
+---
+
+## Step 5: Check Events
+
+Events provide crucial diagnostic information:
+
+Use the `events_list` tool:
+- **namespace**: `{{.Namespace}}`
+
+Filter output for events related to `{{.Name}}` - look for warnings or errors.
+
+---
+
+## Step 6: Check Instance Type and Preference (if used)
+
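+If the VM uses instance types or preferences, verify they exist. The examples below use the cluster-wide kinds; if the VM's `spec.instancetype.kind` or `spec.preference.kind` names the namespaced `VirtualMachineInstancetype` or `VirtualMachinePreference`, use that kind instead and also pass **namespace**: `{{.Namespace}}`: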
+
+For instance types, use `resources_get`:
+- **apiVersion**: `instancetype.kubevirt.io/v1beta1`
+- **kind**: `VirtualMachineClusterInstancetype`
+- **name**: (check VM spec for instancetype name)
+
+For preferences, use `resources_get`:
+- **apiVersion**: `instancetype.kubevirt.io/v1beta1`
+- **kind**: `VirtualMachineClusterPreference`
+- **name**: (check VM spec for preference name)
+
+---
+
+## Common Issues and Solutions
+
+### VM stuck in "Stopped" or "Halted"
+- Check `spec.runStrategy` - if "Halted", the VM is intentionally stopped
+- Change runStrategy to "Always" to start the VM
+
+### VMI doesn't exist
+- Check VM conditions for admission errors
+- Verify instance type and preference exist
+- Check resource quotas in the namespace
+
+### DataVolume stuck in "ImportInProgress"
+- Check CDI controller pods in `cdi` namespace
+- Verify source image is accessible
+- Check PVC storage class exists and has available capacity
+
+### virt-launcher pod in CrashLoopBackOff
+- Check pod logs for container `compute`
+- Common causes:
+  - Insufficient resources (CPU/memory)
+  - Invalid VM configuration
+  - Storage issues (PVC not available)
+
+### VM starts but guest doesn't boot
+- Check virt-launcher logs for QEMU errors
+- Verify boot disk is properly configured
+- Check if guest agent is installed (for cloud images)
+- Ensure correct architecture (amd64 vs arm64)
+
+---
+
+## Additional Resources
+
+For more detailed diagnostics:
+- Check KubeVirt components: `pods_list` in `kubevirt` namespace
+- Check CDI components: `pods_list` in `cdi` namespace (if using DataVolumes)
+- Review resource consumption: `pods_top` for the virt-launcher pod
diff --git a/pkg/toolsets/kubevirt/vm/troubleshoot/tool.go b/pkg/toolsets/kubevirt/vm/troubleshoot/tool.go
@@ -0,0 +1,98 @@
+package troubleshoot
+
+import (
+	_ "embed"
+	"fmt"
+	"strings"
+	"text/template"
+
+	"github.com/containers/kubernetes-mcp-server/pkg/api"
+	"github.com/google/jsonschema-go/jsonschema"
+	"k8s.io/utils/ptr"
+)
+
+// planTemplate holds the contents of plan.tmpl, the text/template source for
+// the guide produced by the vm_troubleshoot tool.
+//
+//go:embed plan.tmpl
+var planTemplate string
+
+// Tools returns the server tools provided by the troubleshoot package: a
+// single "vm_troubleshoot" tool that takes the required string arguments
+// "namespace" and "name" and is handled by troubleshoot.
+func Tools() []api.ServerTool {
+	// Both arguments are plain strings and both are mandatory.
+	properties := map[string]*jsonschema.Schema{
+		"namespace": {
+			Type:        "string",
+			Description: "The namespace of the virtual machine",
+		},
+		"name": {
+			Type:        "string",
+			Description: "The name of the virtual machine",
+		},
+	}
+	inputSchema := &jsonschema.Schema{
+		Type:       "object",
+		Properties: properties,
+		Required:   []string{"namespace", "name"},
+	}
+
+	// The tool is advertised as read-only, non-destructive, idempotent and
+	// closed-world.
+	annotations := api.ToolAnnotations{
+		Title:           "Virtual Machine: Troubleshoot",
+		ReadOnlyHint:    ptr.To(true),
+		DestructiveHint: ptr.To(false),
+		IdempotentHint:  ptr.To(true),
+		OpenWorldHint:   ptr.To(false),
+	}
+
+	vmTroubleshoot := api.ServerTool{
+		Tool: api.Tool{
+			Name:        "vm_troubleshoot",
+			Description: "Generate a comprehensive troubleshooting guide for a VirtualMachine, providing step-by-step instructions to diagnose common issues",
+			InputSchema: inputSchema,
+			Annotations: annotations,
+		},
+		Handler: troubleshoot,
+	}
+	return []api.ServerTool{vmTroubleshoot}
+}
+
+// troubleshootParams carries the values substituted into the plan template.
+type troubleshootParams struct {
+	// Name is rendered wherever the template references .Name.
+	Name string
+	// Namespace is rendered wherever the template references .Namespace.
+	Namespace string
+}
+
+// planTmpl is the parsed form of planTemplate. The template is static, so it
+// is parsed once at package initialization instead of on every tool call;
+// template.Must panics if the embedded plan.tmpl is malformed, surfacing a
+// broken template at startup. A parsed template may be executed concurrently.
+var planTmpl = template.Must(template.New("troubleshoot").Parse(planTemplate))
+
+// troubleshoot handles the vm_troubleshoot tool call. It reads the required
+// "namespace" and "name" arguments and renders the troubleshooting plan
+// template with them.
+//
+// Argument and rendering failures are reported inside the returned
+// ToolCallResult; the Go error return value is always nil.
+func troubleshoot(params api.ToolHandlerParams) (*api.ToolCallResult, error) {
+	namespace, err := getRequiredString(params, "namespace")
+	if err != nil {
+		return api.NewToolCallResult("", err), nil
+	}
+
+	name, err := getRequiredString(params, "name")
+	if err != nil {
+		return api.NewToolCallResult("", err), nil
+	}
+
+	templateParams := troubleshootParams{
+		Namespace: namespace,
+		Name:      name,
+	}
+
+	var result strings.Builder
+	if err := planTmpl.Execute(&result, templateParams); err != nil {
+		return api.NewToolCallResult("", fmt.Errorf("failed to render template: %w", err)), nil
+	}
+
+	return api.NewToolCallResult(result.String(), nil), nil
+}
+
+// getRequiredString returns the tool call argument stored under key.
+//
+// It returns an error when the key is absent, when the value is not a string,
+// or when the string is empty.
+func getRequiredString(params api.ToolHandlerParams, key string) (string, error) {
+	val, ok := params.GetArguments()[key]
+	if !ok {
+		return "", fmt.Errorf("%s parameter required", key)
+	}
+	str, ok := val.(string)
+	if !ok {
+		return "", fmt.Errorf("%s parameter must be a string", key)
+	}
+	// An empty value passes the type check but would render a guide with a
+	// blank name or namespace, so treat it like a missing argument.
+	if str == "" {
+		return "", fmt.Errorf("%s parameter must not be empty", key)
+	}
+	return str, nil
+}
diff --git a/pkg/toolsets/kubevirt/vm/troubleshoot/tool_test.go b/pkg/toolsets/kubevirt/vm/troubleshoot/tool_test.go
@@ -0,0 +1,110 @@
+package troubleshoot
+
+import (
+	"context"
+	"strings"
+	"testing"
+
+	"github.com/containers/kubernetes-mcp-server/pkg/api"
+	internalk8s "github.com/containers/kubernetes-mcp-server/pkg/kubernetes"
+)
+
+// mockToolCallRequest is a test double that serves a fixed argument map
+// through GetArguments.
+type mockToolCallRequest struct {
+	arguments map[string]any
+}
+
+// GetArguments returns the argument map the mock was constructed with.
+func (req *mockToolCallRequest) GetArguments() map[string]any {
+	return req.arguments
+}
+
+// TestTroubleshoot drives the troubleshoot handler with a mocked tool call
+// request. It checks that a complete request yields a guide containing the
+// expected sections, and that a request missing either required argument is
+// reported through result.Error rather than through the Go error return.
+func TestTroubleshoot(t *testing.T) {
+	// guideExpectations pairs each substring that must appear in the rendered
+	// guide with the failure message reported when it is missing.
+	guideExpectations := []struct {
+		substr  string
+		failure string
+	}{
+		{"VirtualMachine Troubleshooting Guide", "Expected troubleshooting guide header"},
+		{"test-vm", "Expected VM name in guide"},
+		{"test-ns", "Expected namespace in guide"},
+		{"Step 1: Check VirtualMachine Status", "Expected step 1 header"},
+		{"resources_get", "Expected resources_get tool reference"},
+		{"VirtualMachineInstance", "Expected VMI section"},
+		{"virt-launcher", "Expected virt-launcher pod section"},
+	}
+	checkGuide := func(t *testing.T, result string) {
+		for _, want := range guideExpectations {
+			if !strings.Contains(result, want.substr) {
+				t.Error(want.failure)
+			}
+		}
+	}
+
+	tests := []struct {
+		name      string
+		args      map[string]any
+		wantErr   bool
+		checkFunc func(t *testing.T, result string)
+	}{
+		{
+			name:      "generates troubleshooting guide",
+			args:      map[string]any{"namespace": "test-ns", "name": "test-vm"},
+			wantErr:   false,
+			checkFunc: checkGuide,
+		},
+		{
+			name:    "missing namespace",
+			args:    map[string]any{"name": "test-vm"},
+			wantErr: true,
+		},
+		{
+			name:    "missing name",
+			args:    map[string]any{"namespace": "test-ns"},
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result, err := troubleshoot(api.ToolHandlerParams{
+				Context:         context.Background(),
+				Kubernetes:      &internalk8s.Kubernetes{},
+				ToolCallRequest: &mockToolCallRequest{arguments: tt.args},
+			})
+			if err != nil {
+				t.Fatalf("troubleshoot() unexpected Go error: %v", err)
+			}
+			if result == nil {
+				t.Fatal("Expected non-nil result")
+			}
+
+			// Failure cases only need the error to be carried in the result.
+			if tt.wantErr {
+				if result.Error == nil {
+					t.Error("Expected error in result.Error, got nil")
+				}
+				return
+			}
+
+			if result.Error != nil {
+				t.Errorf("Expected no error in result, got: %v", result.Error)
+			}
+			if result.Content == "" {
+				t.Error("Expected non-empty result content")
+			}
+			if tt.checkFunc != nil {
+				tt.checkFunc(t, result.Content)
+			}
+		})
+	}
+}

Original file line number	Diff line number	Diff line change
`@@ -7,6 +7,7 @@ import (`
`7`	`7`	`internalk8s "github.com/containers/kubernetes-mcp-server/pkg/kubernetes"`
`8`	`8`	`"github.com/containers/kubernetes-mcp-server/pkg/toolsets"`
`9`	`9`	`vm_create "github.com/containers/kubernetes-mcp-server/pkg/toolsets/kubevirt/vm/create"`
	`10`	`+ vm_troubleshoot "github.com/containers/kubernetes-mcp-server/pkg/toolsets/kubevirt/vm/troubleshoot"`
`10`	`11`	`)`
`11`	`12`
`12`	`13`	`type Toolset struct{}`
`@@ -24,6 +25,7 @@ func (t *Toolset) GetDescription() string {`
`24`	`25`	`func (t *Toolset) GetTools(o internalk8s.Openshift) []api.ServerTool {`
`25`	`26`	`return slices.Concat(`
`26`	`27`	`vm_create.Tools(),`
	`28`	`+ vm_troubleshoot.Tools(),`
`27`	`29`	`)`
`28`	`30`	`}`
`29`	`31`