From e310902fded1b67102d1c0a97a5d360f949dbe85 Mon Sep 17 00:00:00 2001 From: Rohit Patil Date: Tue, 4 Nov 2025 10:41:32 +0530 Subject: [PATCH 1/4] check cluster health Signed-off-by: Rohit Patil --- README.md | 5 + pkg/toolsets/core/health_check.go | 964 ++++++++++++++++++++++++++++++ pkg/toolsets/core/toolset.go | 1 + 3 files changed, 970 insertions(+) create mode 100644 pkg/toolsets/core/health_check.go diff --git a/README.md b/README.md index ee592bd5..c760d9cb 100644 --- a/README.md +++ b/README.md @@ -240,6 +240,11 @@ In case multi-cluster support is enabled (default) and you have access to multip - **events_list** - List all the Kubernetes events in the current cluster from all namespaces - `namespace` (`string`) - Optional Namespace to retrieve the events from. If not provided, will list events from all namespaces +- **cluster_health_check** - Perform comprehensive health check on Kubernetes/OpenShift cluster and report issues. Examines cluster operators (OpenShift), nodes, deployments, pods, persistent volumes, and events to identify problems affecting cluster stability or workload availability. + - `check_events` (`boolean`) - Include recent warning events in the health check (may increase execution time) + - `output_format` (`string`) - Output format for results: 'text' (human-readable) or 'json' (machine-readable) + - `verbose` (`boolean`) - Enable detailed output with additional context and resource-level details + - **namespaces_list** - List all the Kubernetes namespaces in the current cluster - **projects_list** - List all the OpenShift projects in the current cluster diff --git a/pkg/toolsets/core/health_check.go b/pkg/toolsets/core/health_check.go new file mode 100644 index 00000000..805ff642 --- /dev/null +++ b/pkg/toolsets/core/health_check.go @@ -0,0 +1,964 @@ +package core + +import ( + "encoding/json" + "fmt" + "strings" + "time" + + "github.com/google/jsonschema-go/jsonschema" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/utils/ptr" + + "github.com/containers/kubernetes-mcp-server/pkg/api" + internalk8s "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" +) + +const ( + // HighRestartThreshold defines the number of restarts above which a pod is considered to have high restarts + HighRestartThreshold = 5 + // EventsTimeWindow defines how far back to look for warning events (in minutes) + EventsTimeWindow = 30 * time.Minute + // MaxRecentWarnings defines the maximum number of recent warning events to collect + MaxRecentWarnings = 20 + // MaxVerboseWarnings defines the maximum number of warning events to display in verbose mode + MaxVerboseWarnings = 10 +) + +// HealthCheckResult represents the overall health check result +type HealthCheckResult struct { + Cluster ClusterInfo `json:"cluster"` + Summary HealthSummary `json:"summary"` + Checks HealthChecks `json:"checks"` + Timestamp string `json:"timestamp"` + Details []string `json:"details,omitempty"` +} + +// ClusterInfo contains basic cluster information +type ClusterInfo struct { + Type string `json:"type"` + Version string `json:"version,omitempty"` +} + +// HealthSummary provides a high-level summary of health status +type HealthSummary struct { + CriticalIssues int `json:"criticalIssues"` + Warnings int `json:"warnings"` + Healthy bool `json:"healthy"` + Status string `json:"status"` +} + +// HealthChecks contains detailed health check results +type HealthChecks struct { + ClusterOperators *ClusterOperatorHealth 
`json:"clusterOperators,omitempty"` + Nodes NodeHealth `json:"nodes"` + Pods PodHealth `json:"pods"` + Workloads WorkloadHealth `json:"workloads"` + Storage StorageHealth `json:"storage"` + Events EventHealth `json:"events,omitempty"` +} + +// ClusterOperatorHealth represents OpenShift cluster operator health +type ClusterOperatorHealth struct { + Status string `json:"status"` + Total int `json:"total"` + Degraded []string `json:"degraded,omitempty"` + Unavailable []string `json:"unavailable,omitempty"` + Progressing []string `json:"progressing,omitempty"` +} + +// NodeHealth represents node health status +type NodeHealth struct { + Status string `json:"status"` + Total int `json:"total"` + NotReady []string `json:"notReady,omitempty"` + Unschedulable []string `json:"unschedulable,omitempty"` + UnderPressure []string `json:"underPressure,omitempty"` +} + +// PodHealth represents pod health status +type PodHealth struct { + Status string `json:"status"` + Total int `json:"total"` + Failed []string `json:"failed,omitempty"` + CrashLooping []string `json:"crashLooping,omitempty"` + ImagePullErrors []string `json:"imagePullErrors,omitempty"` + HighRestarts []string `json:"highRestarts,omitempty"` +} + +// WorkloadHealth represents workload controller health +type WorkloadHealth struct { + UnhealthyDeployments []string `json:"unhealthyDeployments,omitempty"` + UnhealthyStatefulSets []string `json:"unhealthyStatefulSets,omitempty"` + UnhealthyDaemonSets []string `json:"unhealthyDaemonSets,omitempty"` +} + +// StorageHealth represents storage health status +type StorageHealth struct { + PendingPVCs []string `json:"pendingPVCs,omitempty"` +} + +// EventHealth represents recent event analysis +type EventHealth struct { + RecentWarnings []string `json:"recentWarnings,omitempty"` +} + +func initHealthCheck(o internalk8s.Openshift) []api.ServerTool { + return []api.ServerTool{ + {Tool: api.Tool{ + Name: "cluster_health_check", + Description: "Perform comprehensive health check on Kubernetes/OpenShift cluster and report issues. 
Examines cluster operators (OpenShift), nodes, deployments, pods, persistent volumes, and events to identify problems affecting cluster stability or workload availability.", + InputSchema: &jsonschema.Schema{ + Type: "object", + Properties: map[string]*jsonschema.Schema{ + "verbose": { + Type: "boolean", + Description: "Enable detailed output with additional context and resource-level details", + Default: api.ToRawMessage(false), + }, + "output_format": { + Type: "string", + Description: "Output format for results: 'text' (human-readable) or 'json' (machine-readable)", + Enum: []interface{}{"text", "json"}, + Default: api.ToRawMessage("text"), + }, + "check_events": { + Type: "boolean", + Description: "Include recent warning events in the health check (may increase execution time)", + Default: api.ToRawMessage(true), + }, + }, + }, + Annotations: api.ToolAnnotations{ + Title: "Cluster: Health Check", + ReadOnlyHint: ptr.To(true), + DestructiveHint: ptr.To(false), + IdempotentHint: ptr.To(true), + OpenWorldHint: ptr.To(true), + }, + }, Handler: createHealthCheckHandler(o)}, + } +} + +func createHealthCheckHandler(o internalk8s.Openshift) api.ToolHandlerFunc { + return func(params api.ToolHandlerParams) (*api.ToolCallResult, error) { + verbose := false + if v, ok := params.GetArguments()["verbose"].(bool); ok { + verbose = v + } + + outputFormat := "text" + if v, ok := params.GetArguments()["output_format"].(string); ok { + outputFormat = v + } + + checkEvents := true + if v, ok := params.GetArguments()["check_events"].(bool); ok { + checkEvents = v + } + + result := &HealthCheckResult{ + Timestamp: time.Now().UTC().Format(time.RFC3339), + Details: []string{}, + } + + // Determine cluster type + isOpenShift := o.IsOpenShift(params.Context) + result.Cluster.Type = "Kubernetes" + if isOpenShift { + result.Cluster.Type = "OpenShift" + } + + // Get cluster version + version, err := getClusterVersion(params, isOpenShift) + if err == nil { + result.Cluster.Version = version + } + + // Run health checks + criticalCount := 0 + warningCount := 0 + + // 1. Check Cluster Operators (OpenShift only) + if isOpenShift { + coHealth, details, critical, warning := checkClusterOperators(params, verbose) + result.Checks.ClusterOperators = coHealth + result.Details = append(result.Details, details...) + criticalCount += critical + warningCount += warning + } + + // 2. Check Node Health + nodeHealth, details, critical, warning := checkNodeHealth(params, verbose) + result.Checks.Nodes = *nodeHealth + result.Details = append(result.Details, details...) + criticalCount += critical + warningCount += warning + + // 3. Check Pod Health + podHealth, details, critical, warning := checkPodHealth(params, verbose) + result.Checks.Pods = *podHealth + result.Details = append(result.Details, details...) + criticalCount += critical + warningCount += warning + + // 4. Check Workload Controllers + workloadHealth, details, warning := checkWorkloadHealth(params, verbose) + result.Checks.Workloads = *workloadHealth + result.Details = append(result.Details, details...) + warningCount += warning + + // 5. Check Storage + storageHealth, details, warning := checkStorageHealth(params, verbose) + result.Checks.Storage = *storageHealth + result.Details = append(result.Details, details...) + warningCount += warning + + // 6. 
Check Recent Events (optional) + if checkEvents { + eventHealth, details := checkRecentEvents(params, verbose) + if eventHealth != nil { + result.Checks.Events = *eventHealth + result.Details = append(result.Details, details...) + } + } + + // Build summary + result.Summary.CriticalIssues = criticalCount + result.Summary.Warnings = warningCount + result.Summary.Healthy = criticalCount == 0 + + if criticalCount > 0 { + result.Summary.Status = "critical" + } else if warningCount > 0 { + result.Summary.Status = "warning" + } else { + result.Summary.Status = "healthy" + } + + // Format output + var output string + if outputFormat == "json" { + jsonBytes, err := json.MarshalIndent(result, "", " ") + if err != nil { + return api.NewToolCallResult("", fmt.Errorf("failed to marshal health check result to JSON: %v", err)), nil + } + output = string(jsonBytes) + } else { + output = formatTextOutput(result, verbose) + } + + return api.NewToolCallResult(output, nil), nil + } +} + +// getClusterVersion retrieves the cluster version information. +// For OpenShift clusters, it attempts to get the version from ClusterVersion resource. +// Returns the version string or an error if unable to determine. +func getClusterVersion(params api.ToolHandlerParams, isOpenShift bool) (string, error) { + if isOpenShift { + // Try to get OpenShift cluster version + gvk := &schema.GroupVersionKind{ + Group: "config.openshift.io", + Version: "v1", + Kind: "ClusterVersion", + } + versions, err := params.ResourcesList(params, gvk, "", internalk8s.ResourceListOptions{}) + if err == nil && versions != nil { + versionList := versions.UnstructuredContent() + if items, ok := versionList["items"].([]interface{}); ok && len(items) > 0 { + if item, ok := items[0].(map[string]interface{}); ok { + if status, ok := item["status"].(map[string]interface{}); ok { + if desired, ok := status["desired"].(map[string]interface{}); ok { + if ver, ok := desired["version"].(string); ok { + return ver, nil + } + } + } + } + } + } + } + + // Fallback: Get server version + // Note: This would require access to discovery client which isn't exposed in params + // For now, return empty string + return "", fmt.Errorf("unable to determine cluster version") +} + +// checkClusterOperators examines OpenShift cluster operator health status. +// It checks for degraded, unavailable, or progressing operators. +// Returns: health status, detail messages, critical count, warning count. 
+func checkClusterOperators(params api.ToolHandlerParams, verbose bool) (*ClusterOperatorHealth, []string, int, int) { + health := &ClusterOperatorHealth{ + Status: "healthy", + Degraded: []string{}, + Unavailable: []string{}, + Progressing: []string{}, + } + details := []string{"Checking Cluster Operators..."} + criticalCount := 0 + warningCount := 0 + + gvk := &schema.GroupVersionKind{ + Group: "config.openshift.io", + Version: "v1", + Kind: "ClusterOperator", + } + + operators, err := params.ResourcesList(params, gvk, "", internalk8s.ResourceListOptions{}) + if err != nil { + details = append(details, fmt.Sprintf("⚠️ WARNING: Unable to check cluster operators: %v", err)) + warningCount++ + return health, details, criticalCount, warningCount + } + + if operators == nil { + details = append(details, "⚠️ WARNING: No cluster operator data available") + warningCount++ + return health, details, criticalCount, warningCount + } + + operatorList := operators.UnstructuredContent() + items, ok := operatorList["items"].([]interface{}) + if !ok { + details = append(details, "⚠️ WARNING: Unable to parse cluster operator data") + warningCount++ + return health, details, criticalCount, warningCount + } + + health.Total = len(items) + + for _, item := range items { + op, ok := item.(map[string]interface{}) + if !ok { + continue + } + + metadata, _ := op["metadata"].(map[string]interface{}) + name, _ := metadata["name"].(string) + + status, ok := op["status"].(map[string]interface{}) + if !ok { + continue + } + + conditions, ok := status["conditions"].([]interface{}) + if !ok { + continue + } + + for _, cond := range conditions { + condition, ok := cond.(map[string]interface{}) + if !ok { + continue + } + + condType, _ := condition["type"].(string) + condStatus, _ := condition["status"].(string) + + if condType == "Degraded" && condStatus == "True" { + health.Degraded = append(health.Degraded, name) + criticalCount++ + health.Status = "critical" + if verbose { + reason, _ := condition["reason"].(string) + message, _ := condition["message"].(string) + details = append(details, fmt.Sprintf(" ❌ %s: Degraded - %s: %s", name, reason, message)) + } + } else if condType == "Available" && condStatus == "False" { + health.Unavailable = append(health.Unavailable, name) + criticalCount++ + health.Status = "critical" + } else if condType == "Progressing" && condStatus == "True" { + health.Progressing = append(health.Progressing, name) + warningCount++ + if health.Status != "critical" { + health.Status = "warning" + } + } + } + } + + if len(health.Degraded) > 0 { + details = append(details, fmt.Sprintf("❌ CRITICAL: %d degraded cluster operator(s): %s", len(health.Degraded), strings.Join(health.Degraded, ", "))) + } + if len(health.Unavailable) > 0 { + details = append(details, fmt.Sprintf("❌ CRITICAL: %d unavailable cluster operator(s): %s", len(health.Unavailable), strings.Join(health.Unavailable, ", "))) + } + if len(health.Progressing) > 0 { + details = append(details, fmt.Sprintf("⚠️ WARNING: %d cluster operator(s) in progress: %s", len(health.Progressing), strings.Join(health.Progressing, ", "))) + } + if len(health.Degraded) == 0 && len(health.Unavailable) == 0 && len(health.Progressing) == 0 { + details = append(details, fmt.Sprintf("✅ All cluster operators healthy (%d/%d)", health.Total, health.Total)) + } + + return health, details, criticalCount, warningCount +} + +// checkNodeHealth examines the health status of all nodes in the cluster. 
+// It checks for nodes that are not ready, unschedulable, or under resource pressure. +// Returns: health status, detail messages, critical count, warning count. +func checkNodeHealth(params api.ToolHandlerParams, verbose bool) (*NodeHealth, []string, int, int) { + health := &NodeHealth{ + Status: "healthy", + NotReady: []string{}, + Unschedulable: []string{}, + UnderPressure: []string{}, + } + details := []string{"Checking Node Health..."} + criticalCount := 0 + warningCount := 0 + + gvk := &schema.GroupVersionKind{ + Group: "", + Version: "v1", + Kind: "Node", + } + + nodes, err := params.ResourcesList(params, gvk, "", internalk8s.ResourceListOptions{}) + if err != nil { + details = append(details, fmt.Sprintf("❌ CRITICAL: Unable to check nodes: %v", err)) + criticalCount++ + return health, details, criticalCount, warningCount + } + + if nodes == nil { + details = append(details, "❌ CRITICAL: No node data available") + criticalCount++ + return health, details, criticalCount, warningCount + } + + nodeList := nodes.UnstructuredContent() + items, ok := nodeList["items"].([]interface{}) + if !ok { + details = append(details, "❌ CRITICAL: Unable to parse node data") + criticalCount++ + return health, details, criticalCount, warningCount + } + + health.Total = len(items) + + for _, item := range items { + node, ok := item.(map[string]interface{}) + if !ok { + continue + } + + metadata, _ := node["metadata"].(map[string]interface{}) + name, _ := metadata["name"].(string) + + // Check unschedulable + spec, _ := node["spec"].(map[string]interface{}) + if unschedulable, ok := spec["unschedulable"].(bool); ok && unschedulable { + health.Unschedulable = append(health.Unschedulable, name) + warningCount++ + if health.Status == "healthy" { + health.Status = "warning" + } + } + + status, ok := node["status"].(map[string]interface{}) + if !ok { + continue + } + + conditions, ok := status["conditions"].([]interface{}) + if !ok { + continue + } + + nodeReady := true + pressureTypes := []string{} + + for _, cond := range conditions { + condition, ok := cond.(map[string]interface{}) + if !ok { + continue + } + + condType, _ := condition["type"].(string) + condStatus, _ := condition["status"].(string) + + if condType == "Ready" && condStatus != "True" { + health.NotReady = append(health.NotReady, name) + criticalCount++ + health.Status = "critical" + nodeReady = false + if verbose { + message, _ := condition["message"].(string) + details = append(details, fmt.Sprintf(" ❌ %s: Not Ready - %s", name, message)) + } + } else if (condType == "MemoryPressure" || condType == "DiskPressure" || condType == "PIDPressure") && condStatus == "True" { + pressureTypes = append(pressureTypes, condType) + } + } + + if len(pressureTypes) > 0 { + pressureInfo := fmt.Sprintf("%s (%s)", name, strings.Join(pressureTypes, ", ")) + health.UnderPressure = append(health.UnderPressure, pressureInfo) + warningCount++ + if health.Status == "healthy" { + health.Status = "warning" + } + if verbose && nodeReady { + details = append(details, fmt.Sprintf(" ⚠️ %s: Under pressure - %s", name, strings.Join(pressureTypes, ", "))) + } + } + } + + if len(health.NotReady) > 0 { + details = append(details, fmt.Sprintf("❌ CRITICAL: %d node(s) not ready: %s", len(health.NotReady), strings.Join(health.NotReady, ", "))) + } + if len(health.Unschedulable) > 0 { + details = append(details, fmt.Sprintf("⚠️ WARNING: %d node(s) unschedulable: %s", len(health.Unschedulable), strings.Join(health.Unschedulable, ", "))) + } + if len(health.UnderPressure) > 0 { + 
details = append(details, fmt.Sprintf("⚠️ WARNING: %d node(s) under pressure: %s", len(health.UnderPressure), strings.Join(health.UnderPressure, ", "))) + } + if len(health.NotReady) == 0 && len(health.Unschedulable) == 0 && len(health.UnderPressure) == 0 { + details = append(details, fmt.Sprintf("✅ All nodes healthy (%d)", health.Total)) + } + + return health, details, criticalCount, warningCount +} + +// checkPodHealth examines the health status of all pods across all namespaces. +// It identifies failed pods, crash looping pods, image pull errors, and pods with high restart counts. +// Returns: health status, detail messages, critical count, warning count. +func checkPodHealth(params api.ToolHandlerParams, verbose bool) (*PodHealth, []string, int, int) { + health := &PodHealth{ + Status: "healthy", + Failed: []string{}, + CrashLooping: []string{}, + ImagePullErrors: []string{}, + HighRestarts: []string{}, + } + details := []string{"Checking Pod Health..."} + criticalCount := 0 + warningCount := 0 + + pods, err := params.PodsListInAllNamespaces(params, internalk8s.ResourceListOptions{}) + if err != nil { + details = append(details, fmt.Sprintf("❌ CRITICAL: Unable to check pods: %v", err)) + criticalCount++ + return health, details, criticalCount, warningCount + } + + if pods == nil { + details = append(details, "❌ CRITICAL: No pod data available") + criticalCount++ + return health, details, criticalCount, warningCount + } + + podList := pods.UnstructuredContent() + items, ok := podList["items"].([]interface{}) + if !ok { + details = append(details, "❌ CRITICAL: Unable to parse pod data") + criticalCount++ + return health, details, criticalCount, warningCount + } + + health.Total = len(items) + + for _, item := range items { + pod, ok := item.(map[string]interface{}) + if !ok { + continue + } + + metadata, _ := pod["metadata"].(map[string]interface{}) + name, _ := metadata["name"].(string) + namespace, _ := metadata["namespace"].(string) + podName := fmt.Sprintf("%s/%s", namespace, name) + + status, ok := pod["status"].(map[string]interface{}) + if !ok { + continue + } + + phase, _ := status["phase"].(string) + + // Check for failed/pending pods + if phase != "Running" && phase != "Succeeded" { + health.Failed = append(health.Failed, fmt.Sprintf("%s [%s]", podName, phase)) + criticalCount++ + health.Status = "critical" + } + + // Check container statuses + containerStatuses, ok := status["containerStatuses"].([]interface{}) + if !ok { + continue + } + + for _, cs := range containerStatuses { + containerStatus, ok := cs.(map[string]interface{}) + if !ok { + continue + } + + // Check restart count + restartCount, _ := containerStatus["restartCount"].(float64) + if restartCount > HighRestartThreshold { + health.HighRestarts = append(health.HighRestarts, fmt.Sprintf("%s [%d restarts]", podName, int(restartCount))) + warningCount++ + if health.Status == "healthy" { + health.Status = "warning" + } + } + + // Check for CrashLoopBackOff + state, ok := containerStatus["state"].(map[string]interface{}) + if ok { + if waiting, ok := state["waiting"].(map[string]interface{}); ok { + reason, _ := waiting["reason"].(string) + switch reason { + case "CrashLoopBackOff": + health.CrashLooping = append(health.CrashLooping, podName) + criticalCount++ + health.Status = "critical" + case "ImagePullBackOff", "ErrImagePull": + health.ImagePullErrors = append(health.ImagePullErrors, podName) + criticalCount++ + health.Status = "critical" + } + } + } + } + } + + if len(health.Failed) > 0 { + details = 
append(details, fmt.Sprintf("❌ CRITICAL: %d pod(s) in failed/pending state", len(health.Failed))) + if verbose { + for _, pod := range health.Failed { + details = append(details, fmt.Sprintf(" - %s", pod)) + } + } + } + if len(health.CrashLooping) > 0 { + details = append(details, fmt.Sprintf("❌ CRITICAL: %d pod(s) in CrashLoopBackOff: %s", len(health.CrashLooping), strings.Join(health.CrashLooping, ", "))) + } + if len(health.ImagePullErrors) > 0 { + details = append(details, fmt.Sprintf("❌ CRITICAL: %d pod(s) with image pull errors: %s", len(health.ImagePullErrors), strings.Join(health.ImagePullErrors, ", "))) + } + if len(health.HighRestarts) > 0 { + details = append(details, fmt.Sprintf("⚠️ WARNING: %d pod(s) with high restart count (>%d)", len(health.HighRestarts), HighRestartThreshold)) + if verbose { + for _, pod := range health.HighRestarts { + details = append(details, fmt.Sprintf(" - %s", pod)) + } + } + } + if len(health.Failed) == 0 && len(health.CrashLooping) == 0 && len(health.ImagePullErrors) == 0 && len(health.HighRestarts) == 0 { + details = append(details, fmt.Sprintf("✅ All pods healthy (%d)", health.Total)) + } + + return health, details, criticalCount, warningCount +} + +// checkWorkloadHealth examines the health of workload controllers (Deployments, StatefulSets, DaemonSets). +// It identifies controllers with unavailable or not-ready replicas. +// Returns: health status, detail messages, warning count. +func checkWorkloadHealth(params api.ToolHandlerParams, verbose bool) (*WorkloadHealth, []string, int) { + health := &WorkloadHealth{ + UnhealthyDeployments: []string{}, + UnhealthyStatefulSets: []string{}, + UnhealthyDaemonSets: []string{}, + } + details := []string{"Checking Workload Controllers..."} + warningCount := 0 + + // Check Deployments + deploymentsGVK := &schema.GroupVersionKind{ + Group: "apps", + Version: "v1", + Kind: "Deployment", + } + deployments, err := params.ResourcesList(params, deploymentsGVK, "", internalk8s.ResourceListOptions{}) + if err == nil && deployments != nil { + depList := deployments.UnstructuredContent() + if items, ok := depList["items"].([]interface{}); ok { + for _, item := range items { + dep, _ := item.(map[string]interface{}) + metadata, _ := dep["metadata"].(map[string]interface{}) + name, _ := metadata["name"].(string) + namespace, _ := metadata["namespace"].(string) + + status, _ := dep["status"].(map[string]interface{}) + spec, _ := dep["spec"].(map[string]interface{}) + + replicas, _ := spec["replicas"].(float64) + readyReplicas, _ := status["readyReplicas"].(float64) + unavailableReplicas, _ := status["unavailableReplicas"].(float64) + + if unavailableReplicas > 0 || readyReplicas != replicas { + health.UnhealthyDeployments = append(health.UnhealthyDeployments, + fmt.Sprintf("%s/%s [Ready: %d/%d]", namespace, name, int(readyReplicas), int(replicas))) + warningCount++ + } + } + } + } + + // Check StatefulSets + stsGVK := &schema.GroupVersionKind{ + Group: "apps", + Version: "v1", + Kind: "StatefulSet", + } + statefulsets, err := params.ResourcesList(params, stsGVK, "", internalk8s.ResourceListOptions{}) + if err == nil && statefulsets != nil { + stsList := statefulsets.UnstructuredContent() + if items, ok := stsList["items"].([]interface{}); ok { + for _, item := range items { + sts, _ := item.(map[string]interface{}) + metadata, _ := sts["metadata"].(map[string]interface{}) + name, _ := metadata["name"].(string) + namespace, _ := metadata["namespace"].(string) + + status, _ := sts["status"].(map[string]interface{}) + 
spec, _ := sts["spec"].(map[string]interface{}) + + replicas, _ := spec["replicas"].(float64) + readyReplicas, _ := status["readyReplicas"].(float64) + + if readyReplicas != replicas { + health.UnhealthyStatefulSets = append(health.UnhealthyStatefulSets, + fmt.Sprintf("%s/%s [Ready: %d/%d]", namespace, name, int(readyReplicas), int(replicas))) + warningCount++ + } + } + } + } + + // Check DaemonSets + dsGVK := &schema.GroupVersionKind{ + Group: "apps", + Version: "v1", + Kind: "DaemonSet", + } + daemonsets, err := params.ResourcesList(params, dsGVK, "", internalk8s.ResourceListOptions{}) + if err == nil && daemonsets != nil { + dsList := daemonsets.UnstructuredContent() + if items, ok := dsList["items"].([]interface{}); ok { + for _, item := range items { + ds, _ := item.(map[string]interface{}) + metadata, _ := ds["metadata"].(map[string]interface{}) + name, _ := metadata["name"].(string) + namespace, _ := metadata["namespace"].(string) + + status, _ := ds["status"].(map[string]interface{}) + + desiredNumberScheduled, _ := status["desiredNumberScheduled"].(float64) + numberReady, _ := status["numberReady"].(float64) + + if numberReady != desiredNumberScheduled { + health.UnhealthyDaemonSets = append(health.UnhealthyDaemonSets, + fmt.Sprintf("%s/%s [Ready: %d/%d]", namespace, name, int(numberReady), int(desiredNumberScheduled))) + warningCount++ + } + } + } + } + + if len(health.UnhealthyDeployments) > 0 { + details = append(details, fmt.Sprintf("⚠️ WARNING: %d deployment(s) with unavailable replicas", len(health.UnhealthyDeployments))) + if verbose { + for _, dep := range health.UnhealthyDeployments { + details = append(details, fmt.Sprintf(" - %s", dep)) + } + } + } else { + details = append(details, "✅ All deployments healthy") + } + + if len(health.UnhealthyStatefulSets) > 0 { + details = append(details, fmt.Sprintf("⚠️ WARNING: %d statefulset(s) with unavailable replicas", len(health.UnhealthyStatefulSets))) + if verbose { + for _, sts := range health.UnhealthyStatefulSets { + details = append(details, fmt.Sprintf(" - %s", sts)) + } + } + } else { + details = append(details, "✅ All statefulsets healthy") + } + + if len(health.UnhealthyDaemonSets) > 0 { + details = append(details, fmt.Sprintf("⚠️ WARNING: %d daemonset(s) with unavailable pods", len(health.UnhealthyDaemonSets))) + if verbose { + for _, ds := range health.UnhealthyDaemonSets { + details = append(details, fmt.Sprintf(" - %s", ds)) + } + } + } else { + details = append(details, "✅ All daemonsets healthy") + } + + return health, details, warningCount +} + +// checkStorageHealth examines the health of persistent volume claims (PVCs). +// It identifies PVCs that are not in Bound state. +// Returns: health status, detail messages, warning count. 
+func checkStorageHealth(params api.ToolHandlerParams, verbose bool) (*StorageHealth, []string, int) { + health := &StorageHealth{ + PendingPVCs: []string{}, + } + details := []string{"Checking Storage..."} + warningCount := 0 + + pvcGVK := &schema.GroupVersionKind{ + Group: "", + Version: "v1", + Kind: "PersistentVolumeClaim", + } + + pvcs, err := params.ResourcesList(params, pvcGVK, "", internalk8s.ResourceListOptions{}) + if err != nil { + if !errors.IsNotFound(err) { + details = append(details, fmt.Sprintf("⚠️ WARNING: Unable to check PVCs: %v", err)) + warningCount++ + } + return health, details, warningCount + } + + if pvcs != nil { + pvcList := pvcs.UnstructuredContent() + if items, ok := pvcList["items"].([]interface{}); ok { + for _, item := range items { + pvc, _ := item.(map[string]interface{}) + metadata, _ := pvc["metadata"].(map[string]interface{}) + name, _ := metadata["name"].(string) + namespace, _ := metadata["namespace"].(string) + + status, _ := pvc["status"].(map[string]interface{}) + phase, _ := status["phase"].(string) + + if phase != "Bound" { + health.PendingPVCs = append(health.PendingPVCs, fmt.Sprintf("%s/%s [%s]", namespace, name, phase)) + warningCount++ + } + } + } + } + + if len(health.PendingPVCs) > 0 { + details = append(details, fmt.Sprintf("⚠️ WARNING: %d PVC(s) not bound", len(health.PendingPVCs))) + if verbose { + for _, pvc := range health.PendingPVCs { + details = append(details, fmt.Sprintf(" - %s", pvc)) + } + } + } else { + details = append(details, "✅ All PVCs bound") + } + + return health, details, warningCount +} + +// checkRecentEvents analyzes recent warning events in the cluster. +// It collects warning events from the last 30 minutes across all namespaces. +// Returns: event health status, detail messages. +func checkRecentEvents(params api.ToolHandlerParams, verbose bool) (*EventHealth, []string) { + health := &EventHealth{ + RecentWarnings: []string{}, + } + details := []string{"Checking Recent Events..."} + + // Get events from all namespaces + eventList, err := params.EventsList(params, "") + if err != nil { + details = append(details, fmt.Sprintf("⚠️ Unable to check events: %v", err)) + return health, details + } + + // Parse events + thirtyMinutesAgo := time.Now().Add(-EventsTimeWindow) + + for _, event := range eventList { + eventType, _ := event["Type"].(string) + if eventType != string(corev1.EventTypeWarning) { + continue + } + + // Check timestamp + timestamp, _ := event["Timestamp"].(string) + if timestamp != "" { + eventTime, err := time.Parse(time.RFC3339, timestamp) + if err == nil && eventTime.After(thirtyMinutesAgo) { + involvedObject, _ := event["InvolvedObject"].(map[string]string) + namespace, _ := event["Namespace"].(string) + name := involvedObject["Name"] + message, _ := event["Message"].(string) + + eventStr := fmt.Sprintf("%s [%s/%s]: %s", timestamp, namespace, name, message) + health.RecentWarnings = append(health.RecentWarnings, eventStr) + + if len(health.RecentWarnings) >= MaxRecentWarnings { + break + } + } + } + } + + if len(health.RecentWarnings) > 0 { + details = append(details, fmt.Sprintf("⚠️ %d warning event(s) in last %d minutes", len(health.RecentWarnings), int(EventsTimeWindow.Minutes()))) + if verbose { + for i, event := range health.RecentWarnings { + if i >= MaxVerboseWarnings { + details = append(details, fmt.Sprintf(" ... 
and %d more", len(health.RecentWarnings)-MaxVerboseWarnings)) + break + } + details = append(details, fmt.Sprintf(" - %s", event)) + } + } + } else { + details = append(details, "✅ No recent warning events") + } + + return health, details +} + +// formatTextOutput formats the health check result into a human-readable text report. +// The report includes cluster information, detailed check results, and a summary. +// Returns: formatted text output. +func formatTextOutput(result *HealthCheckResult, verbose bool) string { + var output strings.Builder + + output.WriteString("===============================================\n") + output.WriteString("Cluster Health Check Report\n") + output.WriteString("===============================================\n") + output.WriteString(fmt.Sprintf("Cluster Type: %s\n", result.Cluster.Type)) + if result.Cluster.Version != "" { + output.WriteString(fmt.Sprintf("Cluster Version: %s\n", result.Cluster.Version)) + } + output.WriteString(fmt.Sprintf("Check Time: %s\n", result.Timestamp)) + output.WriteString("\n") + + // Write details + for _, detail := range result.Details { + output.WriteString(detail) + output.WriteString("\n") + } + + output.WriteString("\n") + output.WriteString("===============================================\n") + output.WriteString("Summary\n") + output.WriteString("===============================================\n") + output.WriteString(fmt.Sprintf("Critical Issues: %d\n", result.Summary.CriticalIssues)) + output.WriteString(fmt.Sprintf("Warnings: %d\n", result.Summary.Warnings)) + output.WriteString("\n") + + if result.Summary.CriticalIssues == 0 && result.Summary.Warnings == 0 { + output.WriteString("✅ Cluster is healthy - no issues detected\n") + } else if result.Summary.CriticalIssues > 0 { + output.WriteString("❌ Cluster has CRITICAL issues requiring immediate attention\n") + } else { + output.WriteString("⚠️ Cluster has warnings - monitoring recommended\n") + } + + return output.String() +} diff --git a/pkg/toolsets/core/toolset.go b/pkg/toolsets/core/toolset.go index dfd61f42..1e30b5b3 100644 --- a/pkg/toolsets/core/toolset.go +++ b/pkg/toolsets/core/toolset.go @@ -23,6 +23,7 @@ func (t *Toolset) GetDescription() string { func (t *Toolset) GetTools(o internalk8s.Openshift) []api.ServerTool { return slices.Concat( initEvents(), + initHealthCheck(o), initNamespaces(o), initNodes(), initPods(), From 37b8e887f12739cb4e9a8b58e5d15c18da2dee8e Mon Sep 17 00:00:00 2001 From: Rohit Patil Date: Wed, 5 Nov 2025 20:35:21 +0530 Subject: [PATCH 2/4] implement with prompt type Signed-off-by: Rohit Patil --- pkg/api/prompts.go | 36 + pkg/config/config.go | 1 + pkg/mcp/mcp.go | 45 ++ pkg/mcp/mcp_test.go | 18 +- pkg/mcp/modules.go | 1 + pkg/mcp/prompts.go | 72 ++ pkg/mcp/prompts_test.go | 185 +++++ pkg/promptsets/core/health_check.go | 178 +++++ pkg/promptsets/core/health_check_test.go | 330 ++++++++ pkg/promptsets/core/promptset.go | 39 + pkg/promptsets/promptsets.go | 50 ++ pkg/promptsets/promptsets_test.go | 138 ++++ pkg/toolsets/core/health_check.go | 964 ----------------------- pkg/toolsets/core/toolset.go | 1 - 14 files changed, 1088 insertions(+), 970 deletions(-) create mode 100644 pkg/api/prompts.go create mode 100644 pkg/mcp/prompts.go create mode 100644 pkg/mcp/prompts_test.go create mode 100644 pkg/promptsets/core/health_check.go create mode 100644 pkg/promptsets/core/health_check_test.go create mode 100644 pkg/promptsets/core/promptset.go create mode 100644 pkg/promptsets/promptsets.go create mode 100644 
pkg/promptsets/promptsets_test.go delete mode 100644 pkg/toolsets/core/health_check.go diff --git a/pkg/api/prompts.go b/pkg/api/prompts.go new file mode 100644 index 00000000..5cd5b436 --- /dev/null +++ b/pkg/api/prompts.go @@ -0,0 +1,36 @@ +package api + +import ( + internalk8s "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" +) + +// ServerPrompt represents a prompt that can be provided to the MCP server +type ServerPrompt struct { + Name string + Description string + Arguments []PromptArgument + GetMessages func(arguments map[string]string) []PromptMessage +} + +// PromptArgument defines an argument that can be passed to a prompt +type PromptArgument struct { + Name string + Description string + Required bool +} + +// PromptMessage represents a message in a prompt +type PromptMessage struct { + Role string // "user" or "assistant" + Content string +} + +// PromptSet groups related prompts together +type PromptSet interface { + // GetName returns the name of the prompt set + GetName() string + // GetDescription returns a description of what this prompt set provides + GetDescription() string + // GetPrompts returns all prompts in this set + GetPrompts(o internalk8s.Openshift) []ServerPrompt +} diff --git a/pkg/config/config.go b/pkg/config/config.go index 81bec2b7..bddc2868 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -31,6 +31,7 @@ type StaticConfig struct { // When true, disable tools annotated with destructiveHint=true DisableDestructive bool `toml:"disable_destructive,omitempty"` Toolsets []string `toml:"toolsets,omitempty"` + Promptsets []string `toml:"promptsets,omitempty"` EnabledTools []string `toml:"enabled_tools,omitempty"` DisabledTools []string `toml:"disabled_tools,omitempty"` diff --git a/pkg/mcp/mcp.go b/pkg/mcp/mcp.go index f64d4104..140c46e0 100644 --- a/pkg/mcp/mcp.go +++ b/pkg/mcp/mcp.go @@ -17,6 +17,7 @@ import ( "github.com/containers/kubernetes-mcp-server/pkg/config" internalk8s "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" "github.com/containers/kubernetes-mcp-server/pkg/output" + "github.com/containers/kubernetes-mcp-server/pkg/promptsets" "github.com/containers/kubernetes-mcp-server/pkg/toolsets" "github.com/containers/kubernetes-mcp-server/pkg/version" ) @@ -29,6 +30,7 @@ type Configuration struct { *config.StaticConfig listOutput output.Output toolsets []api.Toolset + promptsets []api.PromptSet } func (c *Configuration) Toolsets() []api.Toolset { @@ -40,6 +42,23 @@ func (c *Configuration) Toolsets() []api.Toolset { return c.toolsets } +func (c *Configuration) Promptsets() []api.PromptSet { + if c.promptsets == nil { + // Default to core if no promptsets configured + promptsetNames := c.StaticConfig.Promptsets + if len(promptsetNames) == 0 { + promptsetNames = []string{"core"} + } + for _, promptset := range promptsetNames { + ps := promptsets.PromptSetFromString(promptset) + if ps != nil { + c.promptsets = append(c.promptsets, ps) + } + } + } + return c.promptsets +} + func (c *Configuration) ListOutput() output.Output { if c.listOutput == nil { c.listOutput = output.FromString(c.StaticConfig.ListOutput) @@ -148,11 +167,37 @@ func (s *Server) reloadKubernetesClusterProvider() error { s.server.SetTools(m3labsServerTools...) 
+ // Register prompts + if err := s.registerPrompts(p); err != nil { + klog.Warningf("Failed to register prompts: %v", err) + // Don't fail the whole reload if prompts fail + } + // start new watch s.p.WatchTargets(s.reloadKubernetesClusterProvider) return nil } +// registerPrompts loads and registers all prompts with the MCP server +func (s *Server) registerPrompts(p internalk8s.Provider) error { + allPrompts := make([]api.ServerPrompt, 0) + for _, ps := range s.configuration.Promptsets() { + prompts := ps.GetPrompts(p) + allPrompts = append(allPrompts, prompts...) + klog.V(5).Infof("Loaded %d prompts from promptset '%s'", len(prompts), ps.GetName()) + } + + m3labsPrompts, err := ServerPromptToM3LabsPrompt(allPrompts) + if err != nil { + return fmt.Errorf("failed to convert prompts: %v", err) + } + + s.server.SetPrompts(m3labsPrompts...) + klog.V(3).Infof("Registered %d prompts", len(m3labsPrompts)) + + return nil +} + func (s *Server) ServeStdio() error { return server.ServeStdio(s.server) } diff --git a/pkg/mcp/mcp_test.go b/pkg/mcp/mcp_test.go index 9dca88e4..c187c31d 100644 --- a/pkg/mcp/mcp_test.go +++ b/pkg/mcp/mcp_test.go @@ -32,15 +32,21 @@ func (s *WatchKubeConfigSuite) TestNotifiesToolsChange() { s.InitMcpClient() withTimeout, cancel := context.WithTimeout(s.T().Context(), 5*time.Second) defer cancel() - var notification *mcp.JSONRPCNotification + var toolsNotification *mcp.JSONRPCNotification + var promptsNotification *mcp.JSONRPCNotification s.OnNotification(func(n mcp.JSONRPCNotification) { - notification = &n + if n.Method == "notifications/tools/list_changed" { + toolsNotification = &n + } + if n.Method == "notifications/prompts/list_changed" { + promptsNotification = &n + } }) // When f, _ := os.OpenFile(s.Cfg.KubeConfig, os.O_APPEND|os.O_WRONLY, 0644) _, _ = f.WriteString("\n") _ = f.Close() - for notification == nil { + for toolsNotification == nil || promptsNotification == nil { select { case <-withTimeout.Done(): s.FailNow("timeout waiting for WatchKubeConfig notification") @@ -49,8 +55,10 @@ func (s *WatchKubeConfigSuite) TestNotifiesToolsChange() { } } // Then - s.NotNil(notification, "WatchKubeConfig did not notify") - s.Equal("notifications/tools/list_changed", notification.Method, "WatchKubeConfig did not notify tools change") + s.NotNil(toolsNotification, "WatchKubeConfig did not notify tools change") + s.Equal("notifications/tools/list_changed", toolsNotification.Method, "WatchKubeConfig did not notify tools change") + s.NotNil(promptsNotification, "WatchKubeConfig did not notify prompts change") + s.Equal("notifications/prompts/list_changed", promptsNotification.Method, "WatchKubeConfig did not notify prompts change") } func TestWatchKubeConfig(t *testing.T) { diff --git a/pkg/mcp/modules.go b/pkg/mcp/modules.go index 3295d72b..af7bcea0 100644 --- a/pkg/mcp/modules.go +++ b/pkg/mcp/modules.go @@ -1,5 +1,6 @@ package mcp +import _ "github.com/containers/kubernetes-mcp-server/pkg/promptsets/core" import _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/config" import _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/core" import _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/helm" diff --git a/pkg/mcp/prompts.go b/pkg/mcp/prompts.go new file mode 100644 index 00000000..8c42659e --- /dev/null +++ b/pkg/mcp/prompts.go @@ -0,0 +1,72 @@ +package mcp + +import ( + "context" + + "github.com/mark3labs/mcp-go/mcp" + "github.com/mark3labs/mcp-go/server" + + "github.com/containers/kubernetes-mcp-server/pkg/api" +) + +// 
ServerPromptToM3LabsPrompt converts our internal ServerPrompt to mcp-go ServerPrompt format +func ServerPromptToM3LabsPrompt(prompts []api.ServerPrompt) ([]server.ServerPrompt, error) { + m3labsPrompts := make([]server.ServerPrompt, 0, len(prompts)) + + for _, prompt := range prompts { + // Convert arguments + arguments := make([]mcp.PromptArgument, 0, len(prompt.Arguments)) + for _, arg := range prompt.Arguments { + arguments = append(arguments, mcp.PromptArgument{ + Name: arg.Name, + Description: arg.Description, + Required: arg.Required, + }) + } + + // Create the prompt handler + handler := createPromptHandler(prompt) + + m3labsPrompts = append(m3labsPrompts, server.ServerPrompt{ + Prompt: mcp.Prompt{ + Name: prompt.Name, + Description: prompt.Description, + Arguments: arguments, + }, + Handler: handler, + }) + } + + return m3labsPrompts, nil +} + +// createPromptHandler creates a handler function for a prompt +func createPromptHandler(prompt api.ServerPrompt) server.PromptHandlerFunc { + return func(ctx context.Context, request mcp.GetPromptRequest) (*mcp.GetPromptResult, error) { + // Get arguments from the request (already a map[string]string) + arguments := request.Params.Arguments + if arguments == nil { + arguments = make(map[string]string) + } + + // Get messages from the prompt + promptMessages := prompt.GetMessages(arguments) + + // Convert to mcp-go format + messages := make([]mcp.PromptMessage, 0, len(promptMessages)) + for _, msg := range promptMessages { + messages = append(messages, mcp.PromptMessage{ + Role: mcp.Role(msg.Role), + Content: mcp.TextContent{ + Type: "text", + Text: msg.Content, + }, + }) + } + + return &mcp.GetPromptResult{ + Description: prompt.Description, + Messages: messages, + }, nil + } +} diff --git a/pkg/mcp/prompts_test.go b/pkg/mcp/prompts_test.go new file mode 100644 index 00000000..2be59ac9 --- /dev/null +++ b/pkg/mcp/prompts_test.go @@ -0,0 +1,185 @@ +package mcp + +import ( + "context" + "testing" + + "github.com/mark3labs/mcp-go/mcp" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/containers/kubernetes-mcp-server/pkg/api" +) + +func TestServerPromptToM3LabsPrompt(t *testing.T) { + t.Run("Converts empty prompt list", func(t *testing.T) { + // Given + prompts := []api.ServerPrompt{} + + // When + result, err := ServerPromptToM3LabsPrompt(prompts) + + // Then + require.NoError(t, err) + assert.Empty(t, result) + }) + + t.Run("Converts single prompt correctly", func(t *testing.T) { + // Given + prompts := []api.ServerPrompt{ + { + Name: "test_prompt", + Description: "Test prompt description", + Arguments: []api.PromptArgument{ + { + Name: "arg1", + Description: "Argument 1", + Required: true, + }, + }, + GetMessages: func(arguments map[string]string) []api.PromptMessage { + return []api.PromptMessage{ + {Role: "user", Content: "Hello"}, + {Role: "assistant", Content: "Hi there"}, + } + }, + }, + } + + // When + result, err := ServerPromptToM3LabsPrompt(prompts) + + // Then + require.NoError(t, err) + require.Len(t, result, 1) + + assert.Equal(t, "test_prompt", result[0].Prompt.Name) + assert.Equal(t, "Test prompt description", result[0].Prompt.Description) + require.Len(t, result[0].Prompt.Arguments, 1) + assert.Equal(t, "arg1", result[0].Prompt.Arguments[0].Name) + assert.Equal(t, "Argument 1", result[0].Prompt.Arguments[0].Description) + assert.True(t, result[0].Prompt.Arguments[0].Required) + }) + + t.Run("Converts multiple prompts correctly", func(t *testing.T) { + // Given + prompts := 
[]api.ServerPrompt{ + { + Name: "prompt1", + Description: "First prompt", + Arguments: []api.PromptArgument{}, + GetMessages: func(arguments map[string]string) []api.PromptMessage { + return []api.PromptMessage{{Role: "user", Content: "test1"}} + }, + }, + { + Name: "prompt2", + Description: "Second prompt", + Arguments: []api.PromptArgument{}, + GetMessages: func(arguments map[string]string) []api.PromptMessage { + return []api.PromptMessage{{Role: "user", Content: "test2"}} + }, + }, + } + + // When + result, err := ServerPromptToM3LabsPrompt(prompts) + + // Then + require.NoError(t, err) + assert.Len(t, result, 2) + assert.Equal(t, "prompt1", result[0].Prompt.Name) + assert.Equal(t, "prompt2", result[1].Prompt.Name) + }) +} + +func TestCreatePromptHandler(t *testing.T) { + t.Run("Handler returns correct messages", func(t *testing.T) { + // Given + prompt := api.ServerPrompt{ + Name: "test", + Description: "Test prompt", + Arguments: []api.PromptArgument{}, + GetMessages: func(arguments map[string]string) []api.PromptMessage { + return []api.PromptMessage{ + {Role: "user", Content: "Test message"}, + {Role: "assistant", Content: "Test response"}, + } + }, + } + + handler := createPromptHandler(prompt) + + // When + result, err := handler(context.Background(), mcp.GetPromptRequest{ + Params: mcp.GetPromptParams{ + Arguments: map[string]string{}, + }, + }) + + // Then + require.NoError(t, err) + assert.Equal(t, "Test prompt", result.Description) + require.Len(t, result.Messages, 2) + assert.Equal(t, mcp.Role("user"), result.Messages[0].Role) + assert.Equal(t, "Test message", result.Messages[0].Content.(mcp.TextContent).Text) + assert.Equal(t, mcp.Role("assistant"), result.Messages[1].Role) + assert.Equal(t, "Test response", result.Messages[1].Content.(mcp.TextContent).Text) + }) + + t.Run("Handler uses provided arguments", func(t *testing.T) { + // Given + prompt := api.ServerPrompt{ + Name: "test", + Description: "Test prompt", + Arguments: []api.PromptArgument{ + {Name: "param1", Description: "Parameter 1", Required: false}, + }, + GetMessages: func(arguments map[string]string) []api.PromptMessage { + value := arguments["param1"] + return []api.PromptMessage{ + {Role: "user", Content: "Value is: " + value}, + } + }, + } + + handler := createPromptHandler(prompt) + + // When + result, err := handler(context.Background(), mcp.GetPromptRequest{ + Params: mcp.GetPromptParams{ + Arguments: map[string]string{"param1": "test_value"}, + }, + }) + + // Then + require.NoError(t, err) + require.Len(t, result.Messages, 1) + assert.Equal(t, "Value is: test_value", result.Messages[0].Content.(mcp.TextContent).Text) + }) + + t.Run("Handler handles nil arguments", func(t *testing.T) { + // Given + prompt := api.ServerPrompt{ + Name: "test", + Description: "Test prompt", + Arguments: []api.PromptArgument{}, + GetMessages: func(arguments map[string]string) []api.PromptMessage { + return []api.PromptMessage{{Role: "user", Content: "test"}} + }, + } + + handler := createPromptHandler(prompt) + + // When + result, err := handler(context.Background(), mcp.GetPromptRequest{ + Params: mcp.GetPromptParams{ + Arguments: nil, + }, + }) + + // Then + require.NoError(t, err) + require.Len(t, result.Messages, 1) + }) +} diff --git a/pkg/promptsets/core/health_check.go b/pkg/promptsets/core/health_check.go new file mode 100644 index 00000000..a2e69a2c --- /dev/null +++ b/pkg/promptsets/core/health_check.go @@ -0,0 +1,178 @@ +package core + +import ( + "fmt" + + "github.com/containers/kubernetes-mcp-server/pkg/api" + 
internalk8s "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" +) + +func initHealthCheckPrompts(o internalk8s.Openshift) []api.ServerPrompt { + return []api.ServerPrompt{ + { + Name: "cluster_health_check", + Description: "Guide for performing comprehensive health check on Kubernetes/OpenShift clusters. Provides step-by-step instructions for examining cluster operators, nodes, pods, workloads, storage, and events to identify issues affecting cluster stability.", + Arguments: []api.PromptArgument{ + { + Name: "verbose", + Description: "Whether to include detailed diagnostics and resource-level information", + Required: false, + }, + { + Name: "namespace", + Description: "Limit health check to specific namespace (optional, defaults to all namespaces)", + Required: false, + }, + }, + GetMessages: func(arguments map[string]string) []api.PromptMessage { + verbose := arguments["verbose"] == "true" + namespace := arguments["namespace"] + + return buildHealthCheckPromptMessages(verbose, namespace) + }, + }, + } +} + +func buildHealthCheckPromptMessages(verbose bool, namespace string) []api.PromptMessage { + scopeMsg := "across all namespaces" + namespaceFilter := "" + if namespace != "" { + scopeMsg = fmt.Sprintf("in namespace '%s'", namespace) + namespaceFilter = fmt.Sprintf(" in namespace '%s'", namespace) + } + + verboseMsg := "" + if verbose { + verboseMsg = "\n\nFor verbose mode, include additional details such as:\n" + + "- Specific error messages from conditions\n" + + "- Resource-level details (CPU/memory pressure types)\n" + + "- Individual pod and deployment names\n" + + "- Event messages and timestamps" + } + + userMessage := fmt.Sprintf(`Please perform a comprehensive health check on the Kubernetes cluster %s. + +Follow these steps systematically: + +## 1. Check Cluster-Level Components + +### For OpenShift Clusters: +- Use resources_list with kind=ClusterOperator to check cluster operator health +- Look for operators with: + * Degraded=True (CRITICAL) + * Available=False (CRITICAL) + * Progressing=True (WARNING) + +### For All Kubernetes Clusters: +- Verify if this is an OpenShift cluster by checking for OpenShift-specific resources +- Note the cluster type in your report + +## 2. Check Node Health +- Use resources_list with kind=Node to examine all nodes +- Check each node for: + * Ready condition != True (CRITICAL) + * Unschedulable spec field = true (WARNING) + * MemoryPressure, DiskPressure, or PIDPressure conditions = True (WARNING) +- Count total nodes and categorize issues + +## 3. Check Pod Health +- Use pods_list to get all pods%s +- Identify problematic pods: + * Phase = Failed or Pending (CRITICAL) + * Container state waiting with reason: + - CrashLoopBackOff (CRITICAL) + - ImagePullBackOff or ErrImagePull (CRITICAL) + * RestartCount > 5 (WARNING - configurable threshold) +- Group issues by type and count occurrences + +## 4. Check Workload Controllers +- Use resources_list for each workload type: + * kind=Deployment (apps/v1) + * kind=StatefulSet (apps/v1) + * kind=DaemonSet (apps/v1) +- For each controller, compare: + * spec.replicas vs status.readyReplicas (Deployment/StatefulSet) + * status.desiredNumberScheduled vs status.numberReady (DaemonSet) + * Report mismatches as WARNINGs + +## 5. Check Storage +- Use resources_list with kind=PersistentVolumeClaim +- Identify PVCs not in Bound phase (WARNING) +- Note namespace and PVC name for each issue + +## 6. 
Check Recent Events (Optional) +- Use events_list to get cluster events +- Filter for: + * Type = Warning + * Timestamp within last 30 minutes +- Limit to 10-20 most recent warnings +- Include event message and involved object%s + +## Output Format + +Structure your health check report as follows: + +`+"```"+` +================================================ +Cluster Health Check Report +================================================ +Cluster Type: [Kubernetes/OpenShift] +Cluster Version: [if determinable] +Check Time: [current timestamp] +Scope: [all namespaces / specific namespace] + +### Cluster Operators (OpenShift only) +[Status with counts and specific issues] + +### Node Health +[Status with counts: total, not ready, unschedulable, under pressure] + +### Pod Health +[Status with counts: total, failed, crash looping, image pull errors, high restarts] + +### Workload Controllers +[Status for Deployments, StatefulSets, DaemonSets] + +### Storage +[PVC status: total, bound, pending/other] + +### Recent Events +[Warning events from last 30 minutes] + +================================================ +Summary +================================================ +Critical Issues: [count] +Warnings: [count] + +[Overall assessment: healthy / has warnings / has critical issues] +`+"```"+` + +## Health Status Definitions + +- **CRITICAL**: Issues requiring immediate attention (e.g., pods failing, nodes not ready, degraded operators) +- **WARNING**: Issues that should be monitored (e.g., high restarts, progressing operators, resource pressure) +- **HEALTHY**: No issues detected + +## Important Notes + +- Use the existing tools (resources_list, pods_list, events_list, etc.) +- Be efficient: don't call the same tool multiple times unnecessarily +- If a resource type doesn't exist (e.g., ClusterOperator on vanilla K8s), skip it gracefully +- Provide clear, actionable insights in your summary +- Use emojis for visual clarity: ✅ (healthy), ⚠️ (warning), ❌ (critical)`, scopeMsg, namespaceFilter, verboseMsg) + + assistantMessage := `I'll perform a comprehensive cluster health check following the systematic approach outlined. 
Let me start by gathering information about the cluster components.` + + return []api.PromptMessage{ + { + Role: "user", + Content: userMessage, + }, + { + Role: "assistant", + Content: assistantMessage, + }, + } +} diff --git a/pkg/promptsets/core/health_check_test.go b/pkg/promptsets/core/health_check_test.go new file mode 100644 index 00000000..2d272dcd --- /dev/null +++ b/pkg/promptsets/core/health_check_test.go @@ -0,0 +1,330 @@ +package core + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestInitHealthCheckPrompts(t *testing.T) { + // When + prompts := initHealthCheckPrompts(nil) + + // Then + require.Len(t, prompts, 1) + assert.Equal(t, "cluster_health_check", prompts[0].Name) + assert.Contains(t, prompts[0].Description, "comprehensive health check") + assert.Len(t, prompts[0].Arguments, 2) + + // Check arguments + assert.Equal(t, "verbose", prompts[0].Arguments[0].Name) + assert.False(t, prompts[0].Arguments[0].Required) + + assert.Equal(t, "namespace", prompts[0].Arguments[1].Name) + assert.False(t, prompts[0].Arguments[1].Required) +} + +func TestBuildHealthCheckPromptMessages(t *testing.T) { + t.Run("Default messages with no arguments", func(t *testing.T) { + // When + messages := buildHealthCheckPromptMessages(false, "") + + // Then + require.Len(t, messages, 2) + assert.Equal(t, "user", messages[0].Role) + assert.Equal(t, "assistant", messages[1].Role) + + // Check user message content + userContent := messages[0].Content + assert.Contains(t, userContent, "across all namespaces") + assert.Contains(t, userContent, "pods_list") + assert.Contains(t, userContent, "resources_list") + assert.Contains(t, userContent, "events_list") + assert.NotContains(t, userContent, "pods_list_in_all_namespaces") + + // Check assistant message + assert.Contains(t, messages[1].Content, "comprehensive cluster health check") + }) + + t.Run("Messages with namespace filter", func(t *testing.T) { + // When + messages := buildHealthCheckPromptMessages(false, "test-namespace") + + // Then + require.Len(t, messages, 2) + + userContent := messages[0].Content + assert.Contains(t, userContent, "in namespace 'test-namespace'") + assert.NotContains(t, userContent, "across all namespaces") + }) + + t.Run("Messages with verbose mode", func(t *testing.T) { + // When + messages := buildHealthCheckPromptMessages(true, "") + + // Then + require.Len(t, messages, 2) + + userContent := messages[0].Content + assert.Contains(t, userContent, "For verbose mode") + assert.Contains(t, userContent, "Specific error messages") + assert.Contains(t, userContent, "Resource-level details") + assert.Contains(t, userContent, "Individual pod and deployment names") + }) + + t.Run("Messages with both verbose and namespace", func(t *testing.T) { + // When + messages := buildHealthCheckPromptMessages(true, "prod") + + // Then + require.Len(t, messages, 2) + + userContent := messages[0].Content + assert.Contains(t, userContent, "in namespace 'prod'") + assert.Contains(t, userContent, "For verbose mode") + }) + + t.Run("User message contains all required sections", func(t *testing.T) { + // When + messages := buildHealthCheckPromptMessages(false, "") + + // Then + userContent := messages[0].Content + + // Check for all main sections + sections := []string{ + "## 1. Check Cluster-Level Components", + "## 2. Check Node Health", + "## 3. Check Pod Health", + "## 4. Check Workload Controllers", + "## 5. Check Storage", + "## 6. 
Check Recent Events", + "## Output Format", + "## Health Status Definitions", + "## Important Notes", + } + + for _, section := range sections { + assert.Contains(t, userContent, section, "Missing section: %s", section) + } + }) + + t.Run("User message contains critical tool references", func(t *testing.T) { + // When + messages := buildHealthCheckPromptMessages(false, "") + + // Then + userContent := messages[0].Content + + // Check for tool names + tools := []string{ + "resources_list", + "pods_list", + "events_list", + } + + for _, tool := range tools { + assert.Contains(t, userContent, tool, "Missing tool reference: %s", tool) + } + }) + + t.Run("User message contains health check criteria", func(t *testing.T) { + // When + messages := buildHealthCheckPromptMessages(false, "") + + // Then + userContent := messages[0].Content + + // Check for critical conditions + criteria := []string{ + "Degraded=True (CRITICAL)", + "Available=False (CRITICAL)", + "Ready condition != True (CRITICAL)", + "CrashLoopBackOff (CRITICAL)", + "ImagePullBackOff", + "RestartCount > 5 (WARNING", + "MemoryPressure", + "DiskPressure", + } + + for _, criterion := range criteria { + assert.Contains(t, userContent, criterion, "Missing criterion: %s", criterion) + } + }) + + t.Run("User message contains workload types", func(t *testing.T) { + // When + messages := buildHealthCheckPromptMessages(false, "") + + // Then + userContent := messages[0].Content + + workloadTypes := []string{ + "kind=Deployment", + "kind=StatefulSet", + "kind=DaemonSet", + "kind=ClusterOperator", + "kind=Node", + "kind=PersistentVolumeClaim", + } + + for _, wl := range workloadTypes { + assert.Contains(t, userContent, wl, "Missing workload type: %s", wl) + } + }) + + t.Run("User message contains output format template", func(t *testing.T) { + // When + messages := buildHealthCheckPromptMessages(false, "") + + // Then + userContent := messages[0].Content + + // Check for report structure + reportElements := []string{ + "Cluster Health Check Report", + "Cluster Type:", + "### Cluster Operators", + "### Node Health", + "### Pod Health", + "### Workload Controllers", + "### Storage", + "### Recent Events", + "Summary", + "Critical Issues:", + "Warnings:", + } + + for _, element := range reportElements { + assert.Contains(t, userContent, element, "Missing report element: %s", element) + } + }) + + t.Run("User message does not reference non-existent tools", func(t *testing.T) { + // When + messages := buildHealthCheckPromptMessages(false, "") + + // Then + userContent := messages[0].Content + + // Make sure we're not referencing the old tool name + assert.NotContains(t, userContent, "pods_list_in_all_namespaces") + }) +} + +func TestGetMessagesWithArguments(t *testing.T) { + // Given + prompts := initHealthCheckPrompts(nil) + require.Len(t, prompts, 1) + + getMessages := prompts[0].GetMessages + + t.Run("With no arguments", func(t *testing.T) { + // When + messages := getMessages(map[string]string{}) + + // Then + require.Len(t, messages, 2) + userContent := messages[0].Content + assert.Contains(t, userContent, "across all namespaces") + assert.NotContains(t, userContent, "For verbose mode") + }) + + t.Run("With verbose=true", func(t *testing.T) { + // When + messages := getMessages(map[string]string{"verbose": "true"}) + + // Then + require.Len(t, messages, 2) + userContent := messages[0].Content + assert.Contains(t, userContent, "For verbose mode") + }) + + t.Run("With verbose=false", func(t *testing.T) { + // When + messages := 
getMessages(map[string]string{"verbose": "false"}) + + // Then + require.Len(t, messages, 2) + userContent := messages[0].Content + assert.NotContains(t, userContent, "For verbose mode") + }) + + t.Run("With namespace", func(t *testing.T) { + // When + messages := getMessages(map[string]string{"namespace": "kube-system"}) + + // Then + require.Len(t, messages, 2) + userContent := messages[0].Content + assert.Contains(t, userContent, "in namespace 'kube-system'") + }) + + t.Run("With both arguments", func(t *testing.T) { + // When + messages := getMessages(map[string]string{ + "verbose": "true", + "namespace": "default", + }) + + // Then + require.Len(t, messages, 2) + userContent := messages[0].Content + assert.Contains(t, userContent, "For verbose mode") + assert.Contains(t, userContent, "in namespace 'default'") + }) +} + +func TestHealthCheckPromptCompleteness(t *testing.T) { + // This test ensures the prompt covers all essential aspects + + messages := buildHealthCheckPromptMessages(false, "") + userContent := messages[0].Content + + t.Run("Covers all Kubernetes resource types", func(t *testing.T) { + resourceTypes := []string{ + "Node", + "Pod", + "Deployment", + "StatefulSet", + "DaemonSet", + "PersistentVolumeClaim", + "ClusterOperator", // OpenShift specific + } + + for _, rt := range resourceTypes { + assert.Contains(t, userContent, rt, "Missing resource type: %s", rt) + } + }) + + t.Run("Provides clear severity levels", func(t *testing.T) { + assert.Contains(t, userContent, "CRITICAL") + assert.Contains(t, userContent, "WARNING") + assert.Contains(t, userContent, "HEALTHY") + }) + + t.Run("Includes efficiency guidelines", func(t *testing.T) { + assert.Contains(t, userContent, "Be efficient") + assert.Contains(t, userContent, "don't call the same tool multiple times unnecessarily") + }) + + t.Run("Handles OpenShift gracefully", func(t *testing.T) { + assert.Contains(t, userContent, "For OpenShift Clusters") + assert.Contains(t, userContent, "For All Kubernetes Clusters") + assert.Contains(t, userContent, "skip it gracefully") + }) + + t.Run("Instructions are clear and actionable", func(t *testing.T) { + // Check that the prompt uses imperative language + imperativeVerbs := []string{"Use", "Check", "Look for", "Verify", "Identify", "Compare"} + foundVerbs := 0 + for _, verb := range imperativeVerbs { + if strings.Contains(userContent, verb) { + foundVerbs++ + } + } + assert.Greater(t, foundVerbs, 3, "Prompt should use clear imperative language") + }) +} diff --git a/pkg/promptsets/core/promptset.go b/pkg/promptsets/core/promptset.go new file mode 100644 index 00000000..f9e61e2a --- /dev/null +++ b/pkg/promptsets/core/promptset.go @@ -0,0 +1,39 @@ +package core + +import ( + "github.com/containers/kubernetes-mcp-server/pkg/api" + internalk8s "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" + "github.com/containers/kubernetes-mcp-server/pkg/promptsets" +) + +const ( + Name = "core" + Description = "Core prompts for common Kubernetes/OpenShift operations including cluster health diagnostics" +) + +type PromptSet struct{} + +func (t *PromptSet) GetName() string { + return Name +} + +func (t *PromptSet) GetDescription() string { + return Description +} + +func (t *PromptSet) GetPrompts(o internalk8s.Openshift) []api.ServerPrompt { + prompts := make([]api.ServerPrompt, 0) + + // Health check prompts + prompts = append(prompts, initHealthCheckPrompts(o)...) + + // Future: Add more prompts here + // prompts = append(prompts, initTroubleshootingPrompts(o)...) 
+ // prompts = append(prompts, initDeploymentPrompts(o)...) + + return prompts +} + +func init() { + promptsets.Register(&PromptSet{}) +} diff --git a/pkg/promptsets/promptsets.go b/pkg/promptsets/promptsets.go new file mode 100644 index 00000000..e140aa0d --- /dev/null +++ b/pkg/promptsets/promptsets.go @@ -0,0 +1,50 @@ +package promptsets + +import ( + "slices" + "strings" + + "github.com/containers/kubernetes-mcp-server/pkg/api" +) + +var promptsets []api.PromptSet + +// Clear removes all registered promptsets, TESTING PURPOSES ONLY. +func Clear() { + promptsets = []api.PromptSet{} +} + +// Register adds a promptset to the registry +func Register(promptset api.PromptSet) { + promptsets = append(promptsets, promptset) +} + +// PromptSets returns all registered promptsets +func PromptSets() []api.PromptSet { + return promptsets +} + +// PromptSetFromString returns a PromptSet by name, or nil if not found +func PromptSetFromString(name string) api.PromptSet { + for _, ps := range PromptSets() { + if ps.GetName() == strings.TrimSpace(name) { + return ps + } + } + return nil +} + +// AllPromptSets returns all available promptsets +func AllPromptSets() []api.PromptSet { + return PromptSets() +} + +// GetPromptSetNames returns names of all registered promptsets +func GetPromptSetNames() []string { + names := make([]string, 0, len(promptsets)) + for _, ps := range promptsets { + names = append(names, ps.GetName()) + } + slices.Sort(names) + return names +} diff --git a/pkg/promptsets/promptsets_test.go b/pkg/promptsets/promptsets_test.go new file mode 100644 index 00000000..31764361 --- /dev/null +++ b/pkg/promptsets/promptsets_test.go @@ -0,0 +1,138 @@ +package promptsets + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/suite" + + "github.com/containers/kubernetes-mcp-server/pkg/api" + internalk8s "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" +) + +type PromptSetsSuite struct { + suite.Suite +} + +func (s *PromptSetsSuite) SetupTest() { + // Clear the registry before each test + Clear() +} + +func (s *PromptSetsSuite) TestRegister() { + // Given + testPS := &testPromptSet{name: "test"} + + // When + Register(testPS) + + // Then + assert.Equal(s.T(), 1, len(PromptSets())) + assert.Equal(s.T(), testPS, PromptSets()[0]) +} + +func (s *PromptSetsSuite) TestPromptSetFromString() { + s.Run("Returns nil if promptset not found", func() { + // When + ps := PromptSetFromString("nonexistent") + + // Then + assert.Nil(s.T(), ps) + }) + + s.Run("Returns the correct promptset if found", func() { + // Given + testPS := &testPromptSet{name: "test"} + Register(testPS) + + // When + ps := PromptSetFromString("test") + + // Then + assert.Equal(s.T(), testPS, ps) + assert.Equal(s.T(), "test", ps.GetName()) + }) + + s.Run("Returns the correct promptset if found after trimming spaces", func() { + // Given + testPS := &testPromptSet{name: "test"} + Register(testPS) + + // When + ps := PromptSetFromString(" test ") + + // Then + assert.Equal(s.T(), testPS, ps) + }) +} + +func (s *PromptSetsSuite) TestAllPromptSets() { + // Given + testPS1 := &testPromptSet{name: "test1"} + testPS2 := &testPromptSet{name: "test2"} + Register(testPS1) + Register(testPS2) + + // When + all := AllPromptSets() + + // Then + assert.Equal(s.T(), 2, len(all)) + assert.Contains(s.T(), all, testPS1) + assert.Contains(s.T(), all, testPS2) +} + +func (s *PromptSetsSuite) TestGetPromptSetNames() { + s.Run("Returns empty slice when no promptsets registered", func() { + // When + names 
:= GetPromptSetNames() + + // Then + assert.Empty(s.T(), names) + }) + + s.Run("Returns sorted names of all registered promptsets", func() { + // Given + Register(&testPromptSet{name: "zebra"}) + Register(&testPromptSet{name: "alpha"}) + Register(&testPromptSet{name: "beta"}) + + // When + names := GetPromptSetNames() + + // Then + assert.Equal(s.T(), []string{"alpha", "beta", "zebra"}, names) + }) +} + +func TestPromptSets(t *testing.T) { + suite.Run(t, new(PromptSetsSuite)) +} + +// Test helper +type testPromptSet struct { + name string +} + +func (t *testPromptSet) GetName() string { + return t.name +} + +func (t *testPromptSet) GetDescription() string { + return "Test promptset" +} + +func (t *testPromptSet) GetPrompts(o internalk8s.Openshift) []api.ServerPrompt { + return []api.ServerPrompt{ + { + Name: "test_prompt", + Description: "Test prompt", + Arguments: []api.PromptArgument{}, + GetMessages: func(arguments map[string]string) []api.PromptMessage { + return []api.PromptMessage{ + {Role: "user", Content: "test"}, + } + }, + }, + } +} diff --git a/pkg/toolsets/core/health_check.go b/pkg/toolsets/core/health_check.go deleted file mode 100644 index 805ff642..00000000 --- a/pkg/toolsets/core/health_check.go +++ /dev/null @@ -1,964 +0,0 @@ -package core - -import ( - "encoding/json" - "fmt" - "strings" - "time" - - "github.com/google/jsonschema-go/jsonschema" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/utils/ptr" - - "github.com/containers/kubernetes-mcp-server/pkg/api" - internalk8s "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" -) - -const ( - // HighRestartThreshold defines the number of restarts above which a pod is considered to have high restarts - HighRestartThreshold = 5 - // EventsTimeWindow defines how far back to look for warning events (in minutes) - EventsTimeWindow = 30 * time.Minute - // MaxRecentWarnings defines the maximum number of recent warning events to collect - MaxRecentWarnings = 20 - // MaxVerboseWarnings defines the maximum number of warning events to display in verbose mode - MaxVerboseWarnings = 10 -) - -// HealthCheckResult represents the overall health check result -type HealthCheckResult struct { - Cluster ClusterInfo `json:"cluster"` - Summary HealthSummary `json:"summary"` - Checks HealthChecks `json:"checks"` - Timestamp string `json:"timestamp"` - Details []string `json:"details,omitempty"` -} - -// ClusterInfo contains basic cluster information -type ClusterInfo struct { - Type string `json:"type"` - Version string `json:"version,omitempty"` -} - -// HealthSummary provides a high-level summary of health status -type HealthSummary struct { - CriticalIssues int `json:"criticalIssues"` - Warnings int `json:"warnings"` - Healthy bool `json:"healthy"` - Status string `json:"status"` -} - -// HealthChecks contains detailed health check results -type HealthChecks struct { - ClusterOperators *ClusterOperatorHealth `json:"clusterOperators,omitempty"` - Nodes NodeHealth `json:"nodes"` - Pods PodHealth `json:"pods"` - Workloads WorkloadHealth `json:"workloads"` - Storage StorageHealth `json:"storage"` - Events EventHealth `json:"events,omitempty"` -} - -// ClusterOperatorHealth represents OpenShift cluster operator health -type ClusterOperatorHealth struct { - Status string `json:"status"` - Total int `json:"total"` - Degraded []string `json:"degraded,omitempty"` - Unavailable []string `json:"unavailable,omitempty"` - Progressing []string `json:"progressing,omitempty"` 
-} - -// NodeHealth represents node health status -type NodeHealth struct { - Status string `json:"status"` - Total int `json:"total"` - NotReady []string `json:"notReady,omitempty"` - Unschedulable []string `json:"unschedulable,omitempty"` - UnderPressure []string `json:"underPressure,omitempty"` -} - -// PodHealth represents pod health status -type PodHealth struct { - Status string `json:"status"` - Total int `json:"total"` - Failed []string `json:"failed,omitempty"` - CrashLooping []string `json:"crashLooping,omitempty"` - ImagePullErrors []string `json:"imagePullErrors,omitempty"` - HighRestarts []string `json:"highRestarts,omitempty"` -} - -// WorkloadHealth represents workload controller health -type WorkloadHealth struct { - UnhealthyDeployments []string `json:"unhealthyDeployments,omitempty"` - UnhealthyStatefulSets []string `json:"unhealthyStatefulSets,omitempty"` - UnhealthyDaemonSets []string `json:"unhealthyDaemonSets,omitempty"` -} - -// StorageHealth represents storage health status -type StorageHealth struct { - PendingPVCs []string `json:"pendingPVCs,omitempty"` -} - -// EventHealth represents recent event analysis -type EventHealth struct { - RecentWarnings []string `json:"recentWarnings,omitempty"` -} - -func initHealthCheck(o internalk8s.Openshift) []api.ServerTool { - return []api.ServerTool{ - {Tool: api.Tool{ - Name: "cluster_health_check", - Description: "Perform comprehensive health check on Kubernetes/OpenShift cluster and report issues. Examines cluster operators (OpenShift), nodes, deployments, pods, persistent volumes, and events to identify problems affecting cluster stability or workload availability.", - InputSchema: &jsonschema.Schema{ - Type: "object", - Properties: map[string]*jsonschema.Schema{ - "verbose": { - Type: "boolean", - Description: "Enable detailed output with additional context and resource-level details", - Default: api.ToRawMessage(false), - }, - "output_format": { - Type: "string", - Description: "Output format for results: 'text' (human-readable) or 'json' (machine-readable)", - Enum: []interface{}{"text", "json"}, - Default: api.ToRawMessage("text"), - }, - "check_events": { - Type: "boolean", - Description: "Include recent warning events in the health check (may increase execution time)", - Default: api.ToRawMessage(true), - }, - }, - }, - Annotations: api.ToolAnnotations{ - Title: "Cluster: Health Check", - ReadOnlyHint: ptr.To(true), - DestructiveHint: ptr.To(false), - IdempotentHint: ptr.To(true), - OpenWorldHint: ptr.To(true), - }, - }, Handler: createHealthCheckHandler(o)}, - } -} - -func createHealthCheckHandler(o internalk8s.Openshift) api.ToolHandlerFunc { - return func(params api.ToolHandlerParams) (*api.ToolCallResult, error) { - verbose := false - if v, ok := params.GetArguments()["verbose"].(bool); ok { - verbose = v - } - - outputFormat := "text" - if v, ok := params.GetArguments()["output_format"].(string); ok { - outputFormat = v - } - - checkEvents := true - if v, ok := params.GetArguments()["check_events"].(bool); ok { - checkEvents = v - } - - result := &HealthCheckResult{ - Timestamp: time.Now().UTC().Format(time.RFC3339), - Details: []string{}, - } - - // Determine cluster type - isOpenShift := o.IsOpenShift(params.Context) - result.Cluster.Type = "Kubernetes" - if isOpenShift { - result.Cluster.Type = "OpenShift" - } - - // Get cluster version - version, err := getClusterVersion(params, isOpenShift) - if err == nil { - result.Cluster.Version = version - } - - // Run health checks - criticalCount := 0 - warningCount 
:= 0 - - // 1. Check Cluster Operators (OpenShift only) - if isOpenShift { - coHealth, details, critical, warning := checkClusterOperators(params, verbose) - result.Checks.ClusterOperators = coHealth - result.Details = append(result.Details, details...) - criticalCount += critical - warningCount += warning - } - - // 2. Check Node Health - nodeHealth, details, critical, warning := checkNodeHealth(params, verbose) - result.Checks.Nodes = *nodeHealth - result.Details = append(result.Details, details...) - criticalCount += critical - warningCount += warning - - // 3. Check Pod Health - podHealth, details, critical, warning := checkPodHealth(params, verbose) - result.Checks.Pods = *podHealth - result.Details = append(result.Details, details...) - criticalCount += critical - warningCount += warning - - // 4. Check Workload Controllers - workloadHealth, details, warning := checkWorkloadHealth(params, verbose) - result.Checks.Workloads = *workloadHealth - result.Details = append(result.Details, details...) - warningCount += warning - - // 5. Check Storage - storageHealth, details, warning := checkStorageHealth(params, verbose) - result.Checks.Storage = *storageHealth - result.Details = append(result.Details, details...) - warningCount += warning - - // 6. Check Recent Events (optional) - if checkEvents { - eventHealth, details := checkRecentEvents(params, verbose) - if eventHealth != nil { - result.Checks.Events = *eventHealth - result.Details = append(result.Details, details...) - } - } - - // Build summary - result.Summary.CriticalIssues = criticalCount - result.Summary.Warnings = warningCount - result.Summary.Healthy = criticalCount == 0 - - if criticalCount > 0 { - result.Summary.Status = "critical" - } else if warningCount > 0 { - result.Summary.Status = "warning" - } else { - result.Summary.Status = "healthy" - } - - // Format output - var output string - if outputFormat == "json" { - jsonBytes, err := json.MarshalIndent(result, "", " ") - if err != nil { - return api.NewToolCallResult("", fmt.Errorf("failed to marshal health check result to JSON: %v", err)), nil - } - output = string(jsonBytes) - } else { - output = formatTextOutput(result, verbose) - } - - return api.NewToolCallResult(output, nil), nil - } -} - -// getClusterVersion retrieves the cluster version information. -// For OpenShift clusters, it attempts to get the version from ClusterVersion resource. -// Returns the version string or an error if unable to determine. 
-func getClusterVersion(params api.ToolHandlerParams, isOpenShift bool) (string, error) { - if isOpenShift { - // Try to get OpenShift cluster version - gvk := &schema.GroupVersionKind{ - Group: "config.openshift.io", - Version: "v1", - Kind: "ClusterVersion", - } - versions, err := params.ResourcesList(params, gvk, "", internalk8s.ResourceListOptions{}) - if err == nil && versions != nil { - versionList := versions.UnstructuredContent() - if items, ok := versionList["items"].([]interface{}); ok && len(items) > 0 { - if item, ok := items[0].(map[string]interface{}); ok { - if status, ok := item["status"].(map[string]interface{}); ok { - if desired, ok := status["desired"].(map[string]interface{}); ok { - if ver, ok := desired["version"].(string); ok { - return ver, nil - } - } - } - } - } - } - } - - // Fallback: Get server version - // Note: This would require access to discovery client which isn't exposed in params - // For now, return empty string - return "", fmt.Errorf("unable to determine cluster version") -} - -// checkClusterOperators examines OpenShift cluster operator health status. -// It checks for degraded, unavailable, or progressing operators. -// Returns: health status, detail messages, critical count, warning count. -func checkClusterOperators(params api.ToolHandlerParams, verbose bool) (*ClusterOperatorHealth, []string, int, int) { - health := &ClusterOperatorHealth{ - Status: "healthy", - Degraded: []string{}, - Unavailable: []string{}, - Progressing: []string{}, - } - details := []string{"Checking Cluster Operators..."} - criticalCount := 0 - warningCount := 0 - - gvk := &schema.GroupVersionKind{ - Group: "config.openshift.io", - Version: "v1", - Kind: "ClusterOperator", - } - - operators, err := params.ResourcesList(params, gvk, "", internalk8s.ResourceListOptions{}) - if err != nil { - details = append(details, fmt.Sprintf("⚠️ WARNING: Unable to check cluster operators: %v", err)) - warningCount++ - return health, details, criticalCount, warningCount - } - - if operators == nil { - details = append(details, "⚠️ WARNING: No cluster operator data available") - warningCount++ - return health, details, criticalCount, warningCount - } - - operatorList := operators.UnstructuredContent() - items, ok := operatorList["items"].([]interface{}) - if !ok { - details = append(details, "⚠️ WARNING: Unable to parse cluster operator data") - warningCount++ - return health, details, criticalCount, warningCount - } - - health.Total = len(items) - - for _, item := range items { - op, ok := item.(map[string]interface{}) - if !ok { - continue - } - - metadata, _ := op["metadata"].(map[string]interface{}) - name, _ := metadata["name"].(string) - - status, ok := op["status"].(map[string]interface{}) - if !ok { - continue - } - - conditions, ok := status["conditions"].([]interface{}) - if !ok { - continue - } - - for _, cond := range conditions { - condition, ok := cond.(map[string]interface{}) - if !ok { - continue - } - - condType, _ := condition["type"].(string) - condStatus, _ := condition["status"].(string) - - if condType == "Degraded" && condStatus == "True" { - health.Degraded = append(health.Degraded, name) - criticalCount++ - health.Status = "critical" - if verbose { - reason, _ := condition["reason"].(string) - message, _ := condition["message"].(string) - details = append(details, fmt.Sprintf(" ❌ %s: Degraded - %s: %s", name, reason, message)) - } - } else if condType == "Available" && condStatus == "False" { - health.Unavailable = append(health.Unavailable, name) - 
criticalCount++ - health.Status = "critical" - } else if condType == "Progressing" && condStatus == "True" { - health.Progressing = append(health.Progressing, name) - warningCount++ - if health.Status != "critical" { - health.Status = "warning" - } - } - } - } - - if len(health.Degraded) > 0 { - details = append(details, fmt.Sprintf("❌ CRITICAL: %d degraded cluster operator(s): %s", len(health.Degraded), strings.Join(health.Degraded, ", "))) - } - if len(health.Unavailable) > 0 { - details = append(details, fmt.Sprintf("❌ CRITICAL: %d unavailable cluster operator(s): %s", len(health.Unavailable), strings.Join(health.Unavailable, ", "))) - } - if len(health.Progressing) > 0 { - details = append(details, fmt.Sprintf("⚠️ WARNING: %d cluster operator(s) in progress: %s", len(health.Progressing), strings.Join(health.Progressing, ", "))) - } - if len(health.Degraded) == 0 && len(health.Unavailable) == 0 && len(health.Progressing) == 0 { - details = append(details, fmt.Sprintf("✅ All cluster operators healthy (%d/%d)", health.Total, health.Total)) - } - - return health, details, criticalCount, warningCount -} - -// checkNodeHealth examines the health status of all nodes in the cluster. -// It checks for nodes that are not ready, unschedulable, or under resource pressure. -// Returns: health status, detail messages, critical count, warning count. -func checkNodeHealth(params api.ToolHandlerParams, verbose bool) (*NodeHealth, []string, int, int) { - health := &NodeHealth{ - Status: "healthy", - NotReady: []string{}, - Unschedulable: []string{}, - UnderPressure: []string{}, - } - details := []string{"Checking Node Health..."} - criticalCount := 0 - warningCount := 0 - - gvk := &schema.GroupVersionKind{ - Group: "", - Version: "v1", - Kind: "Node", - } - - nodes, err := params.ResourcesList(params, gvk, "", internalk8s.ResourceListOptions{}) - if err != nil { - details = append(details, fmt.Sprintf("❌ CRITICAL: Unable to check nodes: %v", err)) - criticalCount++ - return health, details, criticalCount, warningCount - } - - if nodes == nil { - details = append(details, "❌ CRITICAL: No node data available") - criticalCount++ - return health, details, criticalCount, warningCount - } - - nodeList := nodes.UnstructuredContent() - items, ok := nodeList["items"].([]interface{}) - if !ok { - details = append(details, "❌ CRITICAL: Unable to parse node data") - criticalCount++ - return health, details, criticalCount, warningCount - } - - health.Total = len(items) - - for _, item := range items { - node, ok := item.(map[string]interface{}) - if !ok { - continue - } - - metadata, _ := node["metadata"].(map[string]interface{}) - name, _ := metadata["name"].(string) - - // Check unschedulable - spec, _ := node["spec"].(map[string]interface{}) - if unschedulable, ok := spec["unschedulable"].(bool); ok && unschedulable { - health.Unschedulable = append(health.Unschedulable, name) - warningCount++ - if health.Status == "healthy" { - health.Status = "warning" - } - } - - status, ok := node["status"].(map[string]interface{}) - if !ok { - continue - } - - conditions, ok := status["conditions"].([]interface{}) - if !ok { - continue - } - - nodeReady := true - pressureTypes := []string{} - - for _, cond := range conditions { - condition, ok := cond.(map[string]interface{}) - if !ok { - continue - } - - condType, _ := condition["type"].(string) - condStatus, _ := condition["status"].(string) - - if condType == "Ready" && condStatus != "True" { - health.NotReady = append(health.NotReady, name) - criticalCount++ - 
health.Status = "critical" - nodeReady = false - if verbose { - message, _ := condition["message"].(string) - details = append(details, fmt.Sprintf(" ❌ %s: Not Ready - %s", name, message)) - } - } else if (condType == "MemoryPressure" || condType == "DiskPressure" || condType == "PIDPressure") && condStatus == "True" { - pressureTypes = append(pressureTypes, condType) - } - } - - if len(pressureTypes) > 0 { - pressureInfo := fmt.Sprintf("%s (%s)", name, strings.Join(pressureTypes, ", ")) - health.UnderPressure = append(health.UnderPressure, pressureInfo) - warningCount++ - if health.Status == "healthy" { - health.Status = "warning" - } - if verbose && nodeReady { - details = append(details, fmt.Sprintf(" ⚠️ %s: Under pressure - %s", name, strings.Join(pressureTypes, ", "))) - } - } - } - - if len(health.NotReady) > 0 { - details = append(details, fmt.Sprintf("❌ CRITICAL: %d node(s) not ready: %s", len(health.NotReady), strings.Join(health.NotReady, ", "))) - } - if len(health.Unschedulable) > 0 { - details = append(details, fmt.Sprintf("⚠️ WARNING: %d node(s) unschedulable: %s", len(health.Unschedulable), strings.Join(health.Unschedulable, ", "))) - } - if len(health.UnderPressure) > 0 { - details = append(details, fmt.Sprintf("⚠️ WARNING: %d node(s) under pressure: %s", len(health.UnderPressure), strings.Join(health.UnderPressure, ", "))) - } - if len(health.NotReady) == 0 && len(health.Unschedulable) == 0 && len(health.UnderPressure) == 0 { - details = append(details, fmt.Sprintf("✅ All nodes healthy (%d)", health.Total)) - } - - return health, details, criticalCount, warningCount -} - -// checkPodHealth examines the health status of all pods across all namespaces. -// It identifies failed pods, crash looping pods, image pull errors, and pods with high restart counts. -// Returns: health status, detail messages, critical count, warning count. 
-func checkPodHealth(params api.ToolHandlerParams, verbose bool) (*PodHealth, []string, int, int) { - health := &PodHealth{ - Status: "healthy", - Failed: []string{}, - CrashLooping: []string{}, - ImagePullErrors: []string{}, - HighRestarts: []string{}, - } - details := []string{"Checking Pod Health..."} - criticalCount := 0 - warningCount := 0 - - pods, err := params.PodsListInAllNamespaces(params, internalk8s.ResourceListOptions{}) - if err != nil { - details = append(details, fmt.Sprintf("❌ CRITICAL: Unable to check pods: %v", err)) - criticalCount++ - return health, details, criticalCount, warningCount - } - - if pods == nil { - details = append(details, "❌ CRITICAL: No pod data available") - criticalCount++ - return health, details, criticalCount, warningCount - } - - podList := pods.UnstructuredContent() - items, ok := podList["items"].([]interface{}) - if !ok { - details = append(details, "❌ CRITICAL: Unable to parse pod data") - criticalCount++ - return health, details, criticalCount, warningCount - } - - health.Total = len(items) - - for _, item := range items { - pod, ok := item.(map[string]interface{}) - if !ok { - continue - } - - metadata, _ := pod["metadata"].(map[string]interface{}) - name, _ := metadata["name"].(string) - namespace, _ := metadata["namespace"].(string) - podName := fmt.Sprintf("%s/%s", namespace, name) - - status, ok := pod["status"].(map[string]interface{}) - if !ok { - continue - } - - phase, _ := status["phase"].(string) - - // Check for failed/pending pods - if phase != "Running" && phase != "Succeeded" { - health.Failed = append(health.Failed, fmt.Sprintf("%s [%s]", podName, phase)) - criticalCount++ - health.Status = "critical" - } - - // Check container statuses - containerStatuses, ok := status["containerStatuses"].([]interface{}) - if !ok { - continue - } - - for _, cs := range containerStatuses { - containerStatus, ok := cs.(map[string]interface{}) - if !ok { - continue - } - - // Check restart count - restartCount, _ := containerStatus["restartCount"].(float64) - if restartCount > HighRestartThreshold { - health.HighRestarts = append(health.HighRestarts, fmt.Sprintf("%s [%d restarts]", podName, int(restartCount))) - warningCount++ - if health.Status == "healthy" { - health.Status = "warning" - } - } - - // Check for CrashLoopBackOff - state, ok := containerStatus["state"].(map[string]interface{}) - if ok { - if waiting, ok := state["waiting"].(map[string]interface{}); ok { - reason, _ := waiting["reason"].(string) - switch reason { - case "CrashLoopBackOff": - health.CrashLooping = append(health.CrashLooping, podName) - criticalCount++ - health.Status = "critical" - case "ImagePullBackOff", "ErrImagePull": - health.ImagePullErrors = append(health.ImagePullErrors, podName) - criticalCount++ - health.Status = "critical" - } - } - } - } - } - - if len(health.Failed) > 0 { - details = append(details, fmt.Sprintf("❌ CRITICAL: %d pod(s) in failed/pending state", len(health.Failed))) - if verbose { - for _, pod := range health.Failed { - details = append(details, fmt.Sprintf(" - %s", pod)) - } - } - } - if len(health.CrashLooping) > 0 { - details = append(details, fmt.Sprintf("❌ CRITICAL: %d pod(s) in CrashLoopBackOff: %s", len(health.CrashLooping), strings.Join(health.CrashLooping, ", "))) - } - if len(health.ImagePullErrors) > 0 { - details = append(details, fmt.Sprintf("❌ CRITICAL: %d pod(s) with image pull errors: %s", len(health.ImagePullErrors), strings.Join(health.ImagePullErrors, ", "))) - } - if len(health.HighRestarts) > 0 { - details = 
append(details, fmt.Sprintf("⚠️ WARNING: %d pod(s) with high restart count (>%d)", len(health.HighRestarts), HighRestartThreshold)) - if verbose { - for _, pod := range health.HighRestarts { - details = append(details, fmt.Sprintf(" - %s", pod)) - } - } - } - if len(health.Failed) == 0 && len(health.CrashLooping) == 0 && len(health.ImagePullErrors) == 0 && len(health.HighRestarts) == 0 { - details = append(details, fmt.Sprintf("✅ All pods healthy (%d)", health.Total)) - } - - return health, details, criticalCount, warningCount -} - -// checkWorkloadHealth examines the health of workload controllers (Deployments, StatefulSets, DaemonSets). -// It identifies controllers with unavailable or not-ready replicas. -// Returns: health status, detail messages, warning count. -func checkWorkloadHealth(params api.ToolHandlerParams, verbose bool) (*WorkloadHealth, []string, int) { - health := &WorkloadHealth{ - UnhealthyDeployments: []string{}, - UnhealthyStatefulSets: []string{}, - UnhealthyDaemonSets: []string{}, - } - details := []string{"Checking Workload Controllers..."} - warningCount := 0 - - // Check Deployments - deploymentsGVK := &schema.GroupVersionKind{ - Group: "apps", - Version: "v1", - Kind: "Deployment", - } - deployments, err := params.ResourcesList(params, deploymentsGVK, "", internalk8s.ResourceListOptions{}) - if err == nil && deployments != nil { - depList := deployments.UnstructuredContent() - if items, ok := depList["items"].([]interface{}); ok { - for _, item := range items { - dep, _ := item.(map[string]interface{}) - metadata, _ := dep["metadata"].(map[string]interface{}) - name, _ := metadata["name"].(string) - namespace, _ := metadata["namespace"].(string) - - status, _ := dep["status"].(map[string]interface{}) - spec, _ := dep["spec"].(map[string]interface{}) - - replicas, _ := spec["replicas"].(float64) - readyReplicas, _ := status["readyReplicas"].(float64) - unavailableReplicas, _ := status["unavailableReplicas"].(float64) - - if unavailableReplicas > 0 || readyReplicas != replicas { - health.UnhealthyDeployments = append(health.UnhealthyDeployments, - fmt.Sprintf("%s/%s [Ready: %d/%d]", namespace, name, int(readyReplicas), int(replicas))) - warningCount++ - } - } - } - } - - // Check StatefulSets - stsGVK := &schema.GroupVersionKind{ - Group: "apps", - Version: "v1", - Kind: "StatefulSet", - } - statefulsets, err := params.ResourcesList(params, stsGVK, "", internalk8s.ResourceListOptions{}) - if err == nil && statefulsets != nil { - stsList := statefulsets.UnstructuredContent() - if items, ok := stsList["items"].([]interface{}); ok { - for _, item := range items { - sts, _ := item.(map[string]interface{}) - metadata, _ := sts["metadata"].(map[string]interface{}) - name, _ := metadata["name"].(string) - namespace, _ := metadata["namespace"].(string) - - status, _ := sts["status"].(map[string]interface{}) - spec, _ := sts["spec"].(map[string]interface{}) - - replicas, _ := spec["replicas"].(float64) - readyReplicas, _ := status["readyReplicas"].(float64) - - if readyReplicas != replicas { - health.UnhealthyStatefulSets = append(health.UnhealthyStatefulSets, - fmt.Sprintf("%s/%s [Ready: %d/%d]", namespace, name, int(readyReplicas), int(replicas))) - warningCount++ - } - } - } - } - - // Check DaemonSets - dsGVK := &schema.GroupVersionKind{ - Group: "apps", - Version: "v1", - Kind: "DaemonSet", - } - daemonsets, err := params.ResourcesList(params, dsGVK, "", internalk8s.ResourceListOptions{}) - if err == nil && daemonsets != nil { - dsList := 
daemonsets.UnstructuredContent() - if items, ok := dsList["items"].([]interface{}); ok { - for _, item := range items { - ds, _ := item.(map[string]interface{}) - metadata, _ := ds["metadata"].(map[string]interface{}) - name, _ := metadata["name"].(string) - namespace, _ := metadata["namespace"].(string) - - status, _ := ds["status"].(map[string]interface{}) - - desiredNumberScheduled, _ := status["desiredNumberScheduled"].(float64) - numberReady, _ := status["numberReady"].(float64) - - if numberReady != desiredNumberScheduled { - health.UnhealthyDaemonSets = append(health.UnhealthyDaemonSets, - fmt.Sprintf("%s/%s [Ready: %d/%d]", namespace, name, int(numberReady), int(desiredNumberScheduled))) - warningCount++ - } - } - } - } - - if len(health.UnhealthyDeployments) > 0 { - details = append(details, fmt.Sprintf("⚠️ WARNING: %d deployment(s) with unavailable replicas", len(health.UnhealthyDeployments))) - if verbose { - for _, dep := range health.UnhealthyDeployments { - details = append(details, fmt.Sprintf(" - %s", dep)) - } - } - } else { - details = append(details, "✅ All deployments healthy") - } - - if len(health.UnhealthyStatefulSets) > 0 { - details = append(details, fmt.Sprintf("⚠️ WARNING: %d statefulset(s) with unavailable replicas", len(health.UnhealthyStatefulSets))) - if verbose { - for _, sts := range health.UnhealthyStatefulSets { - details = append(details, fmt.Sprintf(" - %s", sts)) - } - } - } else { - details = append(details, "✅ All statefulsets healthy") - } - - if len(health.UnhealthyDaemonSets) > 0 { - details = append(details, fmt.Sprintf("⚠️ WARNING: %d daemonset(s) with unavailable pods", len(health.UnhealthyDaemonSets))) - if verbose { - for _, ds := range health.UnhealthyDaemonSets { - details = append(details, fmt.Sprintf(" - %s", ds)) - } - } - } else { - details = append(details, "✅ All daemonsets healthy") - } - - return health, details, warningCount -} - -// checkStorageHealth examines the health of persistent volume claims (PVCs). -// It identifies PVCs that are not in Bound state. -// Returns: health status, detail messages, warning count. 
-func checkStorageHealth(params api.ToolHandlerParams, verbose bool) (*StorageHealth, []string, int) { - health := &StorageHealth{ - PendingPVCs: []string{}, - } - details := []string{"Checking Storage..."} - warningCount := 0 - - pvcGVK := &schema.GroupVersionKind{ - Group: "", - Version: "v1", - Kind: "PersistentVolumeClaim", - } - - pvcs, err := params.ResourcesList(params, pvcGVK, "", internalk8s.ResourceListOptions{}) - if err != nil { - if !errors.IsNotFound(err) { - details = append(details, fmt.Sprintf("⚠️ WARNING: Unable to check PVCs: %v", err)) - warningCount++ - } - return health, details, warningCount - } - - if pvcs != nil { - pvcList := pvcs.UnstructuredContent() - if items, ok := pvcList["items"].([]interface{}); ok { - for _, item := range items { - pvc, _ := item.(map[string]interface{}) - metadata, _ := pvc["metadata"].(map[string]interface{}) - name, _ := metadata["name"].(string) - namespace, _ := metadata["namespace"].(string) - - status, _ := pvc["status"].(map[string]interface{}) - phase, _ := status["phase"].(string) - - if phase != "Bound" { - health.PendingPVCs = append(health.PendingPVCs, fmt.Sprintf("%s/%s [%s]", namespace, name, phase)) - warningCount++ - } - } - } - } - - if len(health.PendingPVCs) > 0 { - details = append(details, fmt.Sprintf("⚠️ WARNING: %d PVC(s) not bound", len(health.PendingPVCs))) - if verbose { - for _, pvc := range health.PendingPVCs { - details = append(details, fmt.Sprintf(" - %s", pvc)) - } - } - } else { - details = append(details, "✅ All PVCs bound") - } - - return health, details, warningCount -} - -// checkRecentEvents analyzes recent warning events in the cluster. -// It collects warning events from the last 30 minutes across all namespaces. -// Returns: event health status, detail messages. -func checkRecentEvents(params api.ToolHandlerParams, verbose bool) (*EventHealth, []string) { - health := &EventHealth{ - RecentWarnings: []string{}, - } - details := []string{"Checking Recent Events..."} - - // Get events from all namespaces - eventList, err := params.EventsList(params, "") - if err != nil { - details = append(details, fmt.Sprintf("⚠️ Unable to check events: %v", err)) - return health, details - } - - // Parse events - thirtyMinutesAgo := time.Now().Add(-EventsTimeWindow) - - for _, event := range eventList { - eventType, _ := event["Type"].(string) - if eventType != string(corev1.EventTypeWarning) { - continue - } - - // Check timestamp - timestamp, _ := event["Timestamp"].(string) - if timestamp != "" { - eventTime, err := time.Parse(time.RFC3339, timestamp) - if err == nil && eventTime.After(thirtyMinutesAgo) { - involvedObject, _ := event["InvolvedObject"].(map[string]string) - namespace, _ := event["Namespace"].(string) - name := involvedObject["Name"] - message, _ := event["Message"].(string) - - eventStr := fmt.Sprintf("%s [%s/%s]: %s", timestamp, namespace, name, message) - health.RecentWarnings = append(health.RecentWarnings, eventStr) - - if len(health.RecentWarnings) >= MaxRecentWarnings { - break - } - } - } - } - - if len(health.RecentWarnings) > 0 { - details = append(details, fmt.Sprintf("⚠️ %d warning event(s) in last %d minutes", len(health.RecentWarnings), int(EventsTimeWindow.Minutes()))) - if verbose { - for i, event := range health.RecentWarnings { - if i >= MaxVerboseWarnings { - details = append(details, fmt.Sprintf(" ... 
and %d more", len(health.RecentWarnings)-MaxVerboseWarnings)) - break - } - details = append(details, fmt.Sprintf(" - %s", event)) - } - } - } else { - details = append(details, "✅ No recent warning events") - } - - return health, details -} - -// formatTextOutput formats the health check result into a human-readable text report. -// The report includes cluster information, detailed check results, and a summary. -// Returns: formatted text output. -func formatTextOutput(result *HealthCheckResult, verbose bool) string { - var output strings.Builder - - output.WriteString("===============================================\n") - output.WriteString("Cluster Health Check Report\n") - output.WriteString("===============================================\n") - output.WriteString(fmt.Sprintf("Cluster Type: %s\n", result.Cluster.Type)) - if result.Cluster.Version != "" { - output.WriteString(fmt.Sprintf("Cluster Version: %s\n", result.Cluster.Version)) - } - output.WriteString(fmt.Sprintf("Check Time: %s\n", result.Timestamp)) - output.WriteString("\n") - - // Write details - for _, detail := range result.Details { - output.WriteString(detail) - output.WriteString("\n") - } - - output.WriteString("\n") - output.WriteString("===============================================\n") - output.WriteString("Summary\n") - output.WriteString("===============================================\n") - output.WriteString(fmt.Sprintf("Critical Issues: %d\n", result.Summary.CriticalIssues)) - output.WriteString(fmt.Sprintf("Warnings: %d\n", result.Summary.Warnings)) - output.WriteString("\n") - - if result.Summary.CriticalIssues == 0 && result.Summary.Warnings == 0 { - output.WriteString("✅ Cluster is healthy - no issues detected\n") - } else if result.Summary.CriticalIssues > 0 { - output.WriteString("❌ Cluster has CRITICAL issues requiring immediate attention\n") - } else { - output.WriteString("⚠️ Cluster has warnings - monitoring recommended\n") - } - - return output.String() -} diff --git a/pkg/toolsets/core/toolset.go b/pkg/toolsets/core/toolset.go index 1e30b5b3..dfd61f42 100644 --- a/pkg/toolsets/core/toolset.go +++ b/pkg/toolsets/core/toolset.go @@ -23,7 +23,6 @@ func (t *Toolset) GetDescription() string { func (t *Toolset) GetTools(o internalk8s.Openshift) []api.ServerTool { return slices.Concat( initEvents(), - initHealthCheck(o), initNamespaces(o), initNodes(), initPods(), From afb7d6382b6ec37810314028c374d90b18635722 Mon Sep 17 00:00:00 2001 From: Rohit Patil Date: Wed, 5 Nov 2025 21:32:22 +0530 Subject: [PATCH 3/4] update issue Signed-off-by: Rohit Patil --- pkg/promptsets/core/health_check.go | 9 +++++---- pkg/promptsets/core/health_check_test.go | 6 ++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/pkg/promptsets/core/health_check.go b/pkg/promptsets/core/health_check.go index a2e69a2c..1a45e184 100644 --- a/pkg/promptsets/core/health_check.go +++ b/pkg/promptsets/core/health_check.go @@ -36,10 +36,11 @@ func initHealthCheckPrompts(o internalk8s.Openshift) []api.ServerPrompt { func buildHealthCheckPromptMessages(verbose bool, namespace string) []api.PromptMessage { scopeMsg := "across all namespaces" - namespaceFilter := "" + podListInstruction := "- Use pods_list to get all pods" + if namespace != "" { scopeMsg = fmt.Sprintf("in namespace '%s'", namespace) - namespaceFilter = fmt.Sprintf(" in namespace '%s'", namespace) + podListInstruction = fmt.Sprintf("- Use pods_list_in_namespace with namespace parameter set to '%s' to get all pods in namespace '%s'", namespace, namespace) } 
verboseMsg := "" @@ -77,7 +78,7 @@ Follow these steps systematically: - Count total nodes and categorize issues ## 3. Check Pod Health -- Use pods_list to get all pods%s +%s - Identify problematic pods: * Phase = Failed or Pending (CRITICAL) * Container state waiting with reason: @@ -161,7 +162,7 @@ Warnings: [count] - Be efficient: don't call the same tool multiple times unnecessarily - If a resource type doesn't exist (e.g., ClusterOperator on vanilla K8s), skip it gracefully - Provide clear, actionable insights in your summary -- Use emojis for visual clarity: ✅ (healthy), ⚠️ (warning), ❌ (critical)`, scopeMsg, namespaceFilter, verboseMsg) +- Use emojis for visual clarity: ✅ (healthy), ⚠️ (warning), ❌ (critical)`, scopeMsg, podListInstruction, verboseMsg) assistantMessage := `I'll perform a comprehensive cluster health check following the systematic approach outlined. Let me start by gathering information about the cluster components.` diff --git a/pkg/promptsets/core/health_check_test.go b/pkg/promptsets/core/health_check_test.go index 2d272dcd..2e6e0996 100644 --- a/pkg/promptsets/core/health_check_test.go +++ b/pkg/promptsets/core/health_check_test.go @@ -39,10 +39,10 @@ func TestBuildHealthCheckPromptMessages(t *testing.T) { // Check user message content userContent := messages[0].Content assert.Contains(t, userContent, "across all namespaces") - assert.Contains(t, userContent, "pods_list") + assert.Contains(t, userContent, "Use pods_list to get all pods") assert.Contains(t, userContent, "resources_list") assert.Contains(t, userContent, "events_list") - assert.NotContains(t, userContent, "pods_list_in_all_namespaces") + assert.NotContains(t, userContent, "pods_list_in_namespace") // Check assistant message assert.Contains(t, messages[1].Content, "comprehensive cluster health check") @@ -58,6 +58,8 @@ func TestBuildHealthCheckPromptMessages(t *testing.T) { userContent := messages[0].Content assert.Contains(t, userContent, "in namespace 'test-namespace'") assert.NotContains(t, userContent, "across all namespaces") + assert.Contains(t, userContent, "Use pods_list_in_namespace with namespace parameter set to 'test-namespace'") + assert.NotContains(t, userContent, "Use pods_list to get all pods") }) t.Run("Messages with verbose mode", func(t *testing.T) { From b7fde2ed2bc738ab12f24474b04e53f967dfda15 Mon Sep 17 00:00:00 2001 From: Rohit Patil Date: Fri, 7 Nov 2025 20:55:08 +0530 Subject: [PATCH 4/4] update few errors Signed-off-by: Rohit Patil --- pkg/promptsets/core/health_check.go | 66 ++++++++++++++++++------ pkg/promptsets/core/health_check_test.go | 65 ++++++++++++++++++----- pkg/promptsets/core/promptset.go | 2 +- 3 files changed, 104 insertions(+), 29 deletions(-) diff --git a/pkg/promptsets/core/health_check.go b/pkg/promptsets/core/health_check.go index 1a45e184..3050a26e 100644 --- a/pkg/promptsets/core/health_check.go +++ b/pkg/promptsets/core/health_check.go @@ -2,12 +2,32 @@ package core import ( "fmt" + "strings" "github.com/containers/kubernetes-mcp-server/pkg/api" - internalk8s "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" ) -func initHealthCheckPrompts(o internalk8s.Openshift) []api.ServerPrompt { +const ( + // Health check configuration constants + defaultRestartThreshold = 5 + eventLookbackMinutes = 30 + maxWarningEvents = 20 +) + +// isVerboseEnabled checks if the verbose flag is enabled. +// It accepts "true", "1", "yes", or "y" (case-insensitive) as truthy values. 
+func isVerboseEnabled(value string) bool { + switch strings.ToLower(value) { + case "true", "1", "yes", "y": + return true + default: + return false + } +} + +// initHealthCheckPrompts creates prompts for cluster health diagnostics. +// These prompts guide LLMs to systematically check cluster components using existing tools. +func initHealthCheckPrompts() []api.ServerPrompt { return []api.ServerPrompt{ { Name: "cluster_health_check", @@ -25,7 +45,7 @@ func initHealthCheckPrompts(o internalk8s.Openshift) []api.ServerPrompt { }, }, GetMessages: func(arguments map[string]string) []api.PromptMessage { - verbose := arguments["verbose"] == "true" + verbose := isVerboseEnabled(arguments["verbose"]) namespace := arguments["namespace"] return buildHealthCheckPromptMessages(verbose, namespace) @@ -34,13 +54,15 @@ func initHealthCheckPrompts(o internalk8s.Openshift) []api.ServerPrompt { } } +// buildHealthCheckPromptMessages constructs the prompt messages for cluster health checks. +// It adapts the instructions based on verbose mode and namespace filtering. func buildHealthCheckPromptMessages(verbose bool, namespace string) []api.PromptMessage { scopeMsg := "across all namespaces" podListInstruction := "- Use pods_list to get all pods" if namespace != "" { scopeMsg = fmt.Sprintf("in namespace '%s'", namespace) - podListInstruction = fmt.Sprintf("- Use pods_list_in_namespace with namespace parameter set to '%s' to get all pods in namespace '%s'", namespace, namespace) + podListInstruction = fmt.Sprintf("- Use pods_list_in_namespace with namespace '%s'", namespace) } verboseMsg := "" @@ -52,6 +74,9 @@ func buildHealthCheckPromptMessages(verbose bool, namespace string) []api.Prompt "- Event messages and timestamps" } + // Construct the event display range dynamically using maxWarningEvents + eventDisplayRange := fmt.Sprintf("10-%d", maxWarningEvents) + userMessage := fmt.Sprintf(`Please perform a comprehensive health check on the Kubernetes cluster %s. Follow these steps systematically: @@ -59,7 +84,7 @@ Follow these steps systematically: ## 1. Check Cluster-Level Components ### For OpenShift Clusters: -- Use resources_list with kind=ClusterOperator to check cluster operator health +- Use resources_list with apiVersion=config.openshift.io/v1 and kind=ClusterOperator to check cluster operator health - Look for operators with: * Degraded=True (CRITICAL) * Available=False (CRITICAL) @@ -70,7 +95,7 @@ Follow these steps systematically: - Note the cluster type in your report ## 2. Check Node Health -- Use resources_list with kind=Node to examine all nodes +- Use resources_list with apiVersion=v1 and kind=Node to examine all nodes - Check each node for: * Ready condition != True (CRITICAL) * Unschedulable spec field = true (WARNING) @@ -84,21 +109,21 @@ Follow these steps systematically: * Container state waiting with reason: - CrashLoopBackOff (CRITICAL) - ImagePullBackOff or ErrImagePull (CRITICAL) - * RestartCount > 5 (WARNING - configurable threshold) + * RestartCount > %d (WARNING - configurable threshold) - Group issues by type and count occurrences ## 4. 
Check Workload Controllers - Use resources_list for each workload type: - * kind=Deployment (apps/v1) - * kind=StatefulSet (apps/v1) - * kind=DaemonSet (apps/v1) + * apiVersion=apps/v1, kind=Deployment + * apiVersion=apps/v1, kind=StatefulSet + * apiVersion=apps/v1, kind=DaemonSet - For each controller, compare: * spec.replicas vs status.readyReplicas (Deployment/StatefulSet) * status.desiredNumberScheduled vs status.numberReady (DaemonSet) * Report mismatches as WARNINGs ## 5. Check Storage -- Use resources_list with kind=PersistentVolumeClaim +- Use resources_list with apiVersion=v1 and kind=PersistentVolumeClaim - Identify PVCs not in Bound phase (WARNING) - Note namespace and PVC name for each issue @@ -106,8 +131,8 @@ Follow these steps systematically: - Use events_list to get cluster events - Filter for: * Type = Warning - * Timestamp within last 30 minutes -- Limit to 10-20 most recent warnings + * Timestamp within last %d minutes +- Limit to %s most recent warnings - Include event message and involved object%s ## Output Format @@ -139,7 +164,7 @@ Scope: [all namespaces / specific namespace] [PVC status: total, bound, pending/other] ### Recent Events -[Warning events from last 30 minutes] +[Warning events from last %d minutes] ================================================ Summary @@ -162,7 +187,18 @@ Warnings: [count] - Be efficient: don't call the same tool multiple times unnecessarily - If a resource type doesn't exist (e.g., ClusterOperator on vanilla K8s), skip it gracefully - Provide clear, actionable insights in your summary -- Use emojis for visual clarity: ✅ (healthy), ⚠️ (warning), ❌ (critical)`, scopeMsg, podListInstruction, verboseMsg) +- Use emojis for visual clarity: ✅ (healthy), ⚠️ (warning), ❌ (critical) + +### Common apiVersion Values + +When using resources_list, specify the correct apiVersion for each resource type: +- Core resources: apiVersion=v1 (Pod, Service, Node, PersistentVolumeClaim, ConfigMap, Secret, Namespace) +- Apps: apiVersion=apps/v1 (Deployment, StatefulSet, DaemonSet, ReplicaSet) +- Batch: apiVersion=batch/v1 (Job, CronJob) +- RBAC: apiVersion=rbac.authorization.k8s.io/v1 (Role, RoleBinding, ClusterRole, ClusterRoleBinding) +- Networking: apiVersion=networking.k8s.io/v1 (Ingress, NetworkPolicy) +- OpenShift Config: apiVersion=config.openshift.io/v1 (ClusterOperator, ClusterVersion) +- OpenShift Routes: apiVersion=route.openshift.io/v1 (Route)`, scopeMsg, podListInstruction, defaultRestartThreshold, eventLookbackMinutes, eventDisplayRange, verboseMsg, eventLookbackMinutes) assistantMessage := `I'll perform a comprehensive cluster health check following the systematic approach outlined. 
Let me start by gathering information about the cluster components.` diff --git a/pkg/promptsets/core/health_check_test.go b/pkg/promptsets/core/health_check_test.go index 2e6e0996..2968ad29 100644 --- a/pkg/promptsets/core/health_check_test.go +++ b/pkg/promptsets/core/health_check_test.go @@ -8,9 +8,39 @@ import ( "github.com/stretchr/testify/require" ) +func TestIsVerboseEnabled(t *testing.T) { + tests := []struct { + name string + input string + expected bool + }{ + {"true lowercase", "true", true}, + {"true capitalized", "True", true}, + {"true uppercase", "TRUE", true}, + {"numeric 1", "1", true}, + {"yes lowercase", "yes", true}, + {"yes capitalized", "Yes", true}, + {"yes uppercase", "YES", true}, + {"y lowercase", "y", true}, + {"y uppercase", "Y", true}, + {"false", "false", false}, + {"0", "0", false}, + {"no", "no", false}, + {"empty string", "", false}, + {"random string", "random", false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := isVerboseEnabled(tt.input) + assert.Equal(t, tt.expected, result, "isVerboseEnabled(%q) should return %v", tt.input, tt.expected) + }) + } +} + func TestInitHealthCheckPrompts(t *testing.T) { // When - prompts := initHealthCheckPrompts(nil) + prompts := initHealthCheckPrompts() // Then require.Len(t, prompts, 1) @@ -58,7 +88,7 @@ func TestBuildHealthCheckPromptMessages(t *testing.T) { userContent := messages[0].Content assert.Contains(t, userContent, "in namespace 'test-namespace'") assert.NotContains(t, userContent, "across all namespaces") - assert.Contains(t, userContent, "Use pods_list_in_namespace with namespace parameter set to 'test-namespace'") + assert.Contains(t, userContent, "Use pods_list_in_namespace with namespace 'test-namespace'") assert.NotContains(t, userContent, "Use pods_list to get all pods") }) @@ -156,24 +186,25 @@ func TestBuildHealthCheckPromptMessages(t *testing.T) { } }) - t.Run("User message contains workload types", func(t *testing.T) { + t.Run("User message contains workload types with apiVersions", func(t *testing.T) { // When messages := buildHealthCheckPromptMessages(false, "") // Then userContent := messages[0].Content - workloadTypes := []string{ - "kind=Deployment", - "kind=StatefulSet", - "kind=DaemonSet", - "kind=ClusterOperator", - "kind=Node", - "kind=PersistentVolumeClaim", + // Check for apiVersion + kind pairs + resourceSpecs := []string{ + "apiVersion=apps/v1, kind=Deployment", + "apiVersion=apps/v1, kind=StatefulSet", + "apiVersion=apps/v1, kind=DaemonSet", + "apiVersion=config.openshift.io/v1 and kind=ClusterOperator", + "apiVersion=v1 and kind=Node", + "apiVersion=v1 and kind=PersistentVolumeClaim", } - for _, wl := range workloadTypes { - assert.Contains(t, userContent, wl, "Missing workload type: %s", wl) + for _, spec := range resourceSpecs { + assert.Contains(t, userContent, spec, "Missing resource spec: %s", spec) } }) @@ -218,7 +249,7 @@ func TestBuildHealthCheckPromptMessages(t *testing.T) { func TestGetMessagesWithArguments(t *testing.T) { // Given - prompts := initHealthCheckPrompts(nil) + prompts := initHealthCheckPrompts() require.Len(t, prompts, 1) getMessages := prompts[0].GetMessages @@ -329,4 +360,12 @@ func TestHealthCheckPromptCompleteness(t *testing.T) { } assert.Greater(t, foundVerbs, 3, "Prompt should use clear imperative language") }) + + t.Run("Includes apiVersion reference section", func(t *testing.T) { + assert.Contains(t, userContent, "Common apiVersion Values") + assert.Contains(t, userContent, "apiVersion=config.openshift.io/v1") + 
assert.Contains(t, userContent, "apiVersion=apps/v1") + assert.Contains(t, userContent, "apiVersion=v1") + assert.Contains(t, userContent, "ClusterOperator, ClusterVersion") + }) } diff --git a/pkg/promptsets/core/promptset.go b/pkg/promptsets/core/promptset.go index f9e61e2a..8bd35785 100644 --- a/pkg/promptsets/core/promptset.go +++ b/pkg/promptsets/core/promptset.go @@ -25,7 +25,7 @@ func (t *PromptSet) GetPrompts(o internalk8s.Openshift) []api.ServerPrompt { prompts := make([]api.ServerPrompt, 0) // Health check prompts - prompts = append(prompts, initHealthCheckPrompts(o)...) + prompts = append(prompts, initHealthCheckPrompts()...) // Future: Add more prompts here // prompts = append(prompts, initTroubleshootingPrompts(o)...)
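
Note (illustrative only, not part of the patches above): the following is a minimal sketch of how a caller might exercise the promptset registry and the cluster_health_check prompt introduced in this series. The main wrapper, the chosen namespace value, and the printing are assumptions made for illustration; the registry functions (Register via the core package's init, PromptSetFromString), the "core" promptset name, and the ServerPrompt/PromptMessage fields are taken from the diffs, and passing nil for the Openshift accessor mirrors the unit tests in this series.

package main

import (
	"fmt"

	"github.com/containers/kubernetes-mcp-server/pkg/promptsets"
	// Blank import triggers the core promptset's init() registration,
	// as added in pkg/promptsets/core/promptset.go above.
	_ "github.com/containers/kubernetes-mcp-server/pkg/promptsets/core"
)

func main() {
	// Look up the registered "core" promptset by name.
	ps := promptsets.PromptSetFromString("core")
	if ps == nil {
		fmt.Println("core promptset not registered")
		return
	}

	// Passing nil for the Openshift accessor follows the tests in this series;
	// a real server would pass its internalk8s.Openshift implementation.
	for _, prompt := range ps.GetPrompts(nil) {
		if prompt.Name != "cluster_health_check" {
			continue
		}
		// Render the prompt messages scoped to a namespace, with verbose output.
		// "yes" is accepted by isVerboseEnabled after the final patch.
		msgs := prompt.GetMessages(map[string]string{
			"namespace": "kube-system",
			"verbose":   "yes",
		})
		for _, m := range msgs {
			fmt.Printf("[%s]\n%s\n\n", m.Role, m.Content)
		}
	}
}

The sketch prints the user and assistant messages that an MCP client would receive for that prompt; swapping the arguments map changes the scope and verbosity exactly as the tests in health_check_test.go describe.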