Skip to content

Commit 915c1e5

Browse files
JAORMXclaude
andauthored
Fix MCPServer and MCPRemoteProxy operator spec change reconciliation (#2320)
Both MCPServer and MCPRemoteProxy operators were not properly reconciling spec changes, requiring users to delete and recreate resources to apply configuration updates. This fixes the issue by adding ConfigMap checksum tracking to pod template annotations. Changes: - Create shared RunConfigChecksumFetcher in pkg/controllerutil - Add RunConfig ConfigMap checksum to pod template annotations for both operators - Trigger automatic pod restarts when configuration changes - Add constants for checksum annotation keys - Refactor deploymentNeedsUpdate() in both controllers to check pod template annotations - Add comprehensive error handling and nil guards - Handle ConfigMap not found during initial creation with 5-second requeue delay - Update all tests to pass checksum parameter The fix ensures that any spec change (OIDC config, authz, telemetry, tool filters, image, etc.) properly propagates to running pods through Kubernetes rolling updates, enabling declarative GitOps workflows. Fixes #2314 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Claude <noreply@anthropic.com>
1 parent 0083fd7 commit 915c1e5

9 files changed

+324
-28
lines changed

cmd/thv-operator/controllers/mcpremoteproxy_controller.go

Lines changed: 132 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"fmt"
88
"maps"
99
"reflect"
10+
"time"
1011

1112
appsv1 "k8s.io/api/apps/v1"
1213
corev1 "k8s.io/api/core/v1"
@@ -24,6 +25,7 @@ import (
2425

2526
mcpv1alpha1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1alpha1"
2627
ctrlutil "github.com/stacklok/toolhive/cmd/thv-operator/pkg/controllerutil"
28+
"github.com/stacklok/toolhive/cmd/thv-operator/pkg/runconfig/configmap/checksum"
2729
)
2830

2931
// MCPRemoteProxyReconciler reconciles a MCPRemoteProxy object
@@ -175,17 +177,58 @@ func (r *MCPRemoteProxyReconciler) ensureAuthzConfigMapForProxy(ctx context.Cont
175177
)
176178
}
177179

178-
// ensureDeployment ensures the Deployment exists and is up to date
180+
// getRunConfigChecksum fetches the RunConfig ConfigMap checksum annotation for this proxy.
181+
// Uses the shared RunConfigChecksumFetcher to maintain consistency with MCPServer.
182+
func (r *MCPRemoteProxyReconciler) getRunConfigChecksum(
183+
ctx context.Context, proxy *mcpv1alpha1.MCPRemoteProxy,
184+
) (string, error) {
185+
if proxy == nil {
186+
return "", fmt.Errorf("proxy cannot be nil")
187+
}
188+
189+
fetcher := checksum.NewRunConfigChecksumFetcher(r.Client)
190+
return fetcher.GetRunConfigChecksum(ctx, proxy.Namespace, proxy.Name)
191+
}
192+
193+
// ensureDeployment ensures the Deployment exists and is up to date.
194+
//
195+
// This function coordinates deployment creation and updates, including:
196+
// - Fetching the RunConfig ConfigMap checksum for pod restart triggering
197+
// - Creating deployments when they don't exist
198+
// - Updating deployments when configuration changes
199+
// - Preserving replica counts for HPA compatibility
200+
//
201+
// If the RunConfig ConfigMap doesn't exist yet (e.g., during initial resource creation),
202+
// the function returns an error that will trigger reconciliation requeue, allowing the
203+
// ConfigMap to be created first in ensureAllResources().
179204
func (r *MCPRemoteProxyReconciler) ensureDeployment(
180205
ctx context.Context, proxy *mcpv1alpha1.MCPRemoteProxy,
181206
) (ctrl.Result, error) {
182207
ctxLogger := log.FromContext(ctx)
183208

209+
// Fetch RunConfig ConfigMap checksum to include in pod template annotations
210+
// This ensures pods restart when configuration changes
211+
runConfigChecksum, err := r.getRunConfigChecksum(ctx, proxy)
212+
if err != nil {
213+
if errors.IsNotFound(err) {
214+
// ConfigMap doesn't exist yet - it will be created by ensureRunConfigConfigMap
215+
// before this function is called. If we still hit this, it's likely a timing
216+
// issue with API server consistency. Requeue with a short delay to allow
217+
// API server propagation.
218+
ctxLogger.Info("RunConfig ConfigMap not found yet, will retry",
219+
"proxy", proxy.Name, "namespace", proxy.Namespace)
220+
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
221+
}
222+
// Other errors (missing annotation, empty checksum, etc.) are real problems
223+
ctxLogger.Error(err, "Failed to get RunConfig checksum")
224+
return ctrl.Result{}, err
225+
}
226+
184227
deployment := &appsv1.Deployment{}
185-
err := r.Get(ctx, types.NamespacedName{Name: proxy.Name, Namespace: proxy.Namespace}, deployment)
228+
err = r.Get(ctx, types.NamespacedName{Name: proxy.Name, Namespace: proxy.Namespace}, deployment)
186229

187230
if errors.IsNotFound(err) {
188-
dep := r.deploymentForMCPRemoteProxy(ctx, proxy)
231+
dep := r.deploymentForMCPRemoteProxy(ctx, proxy, runConfigChecksum)
189232
if dep == nil {
190233
return ctrl.Result{}, fmt.Errorf("failed to create Deployment object")
191234
}
@@ -201,8 +244,8 @@ func (r *MCPRemoteProxyReconciler) ensureDeployment(
201244
}
202245

203246
// Deployment exists - check if it needs to be updated
204-
if r.deploymentNeedsUpdate(ctx, deployment, proxy) {
205-
newDeployment := r.deploymentForMCPRemoteProxy(ctx, proxy)
247+
if r.deploymentNeedsUpdate(ctx, deployment, proxy, runConfigChecksum) {
248+
newDeployment := r.deploymentForMCPRemoteProxy(ctx, proxy, runConfigChecksum)
206249
if newDeployment == nil {
207250
return ctrl.Result{}, fmt.Errorf("failed to create updated Deployment object")
208251
}
@@ -531,16 +574,55 @@ func createProxyServiceURL(proxyName, namespace string, port int32) string {
531574
return fmt.Sprintf("http://%s.%s.svc.cluster.local:%d", serviceName, namespace, port)
532575
}
533576

534-
// deploymentNeedsUpdate checks if the deployment needs to be updated based on spec changes
577+
// deploymentNeedsUpdate checks if the deployment needs to be updated based on spec changes.
578+
//
579+
// This function compares the existing deployment with the desired state derived from the
580+
// MCPRemoteProxy spec. It checks container specs, deployment metadata, and pod template
581+
// metadata (including the RunConfig checksum annotation).
582+
//
583+
// Returns true if any aspect of the deployment differs from the desired state.
535584
func (r *MCPRemoteProxyReconciler) deploymentNeedsUpdate(
536585
ctx context.Context,
537586
deployment *appsv1.Deployment,
538587
proxy *mcpv1alpha1.MCPRemoteProxy,
588+
runConfigChecksum string,
539589
) bool {
590+
if deployment == nil || proxy == nil {
591+
return true
592+
}
593+
540594
if len(deployment.Spec.Template.Spec.Containers) == 0 {
541595
return true
542596
}
543597

598+
if r.containerNeedsUpdate(ctx, deployment, proxy) {
599+
return true
600+
}
601+
602+
if r.deploymentMetadataNeedsUpdate(deployment, proxy) {
603+
return true
604+
}
605+
606+
if r.podTemplateMetadataNeedsUpdate(deployment, proxy, runConfigChecksum) {
607+
return true
608+
}
609+
610+
return false
611+
}
612+
613+
// containerNeedsUpdate checks if the container specification has changed.
614+
//
615+
// Compares container image, ports, environment variables, resource requirements,
616+
// and service account between the existing deployment and desired state.
617+
func (r *MCPRemoteProxyReconciler) containerNeedsUpdate(
618+
ctx context.Context,
619+
deployment *appsv1.Deployment,
620+
proxy *mcpv1alpha1.MCPRemoteProxy,
621+
) bool {
622+
if deployment == nil || proxy == nil || len(deployment.Spec.Template.Spec.Containers) == 0 {
623+
return true
624+
}
625+
544626
container := deployment.Spec.Template.Spec.Containers[0]
545627

546628
// Check if runner image has changed
@@ -572,7 +654,21 @@ func (r *MCPRemoteProxyReconciler) deploymentNeedsUpdate(
572654
return true
573655
}
574656

575-
// Check if deployment metadata has changed
657+
return false
658+
}
659+
660+
// deploymentMetadataNeedsUpdate checks if deployment-level metadata has changed.
661+
//
662+
// Compares deployment labels and annotations, including any user-specified overrides
663+
// from ResourceOverrides.ProxyDeployment.
664+
func (*MCPRemoteProxyReconciler) deploymentMetadataNeedsUpdate(
665+
deployment *appsv1.Deployment,
666+
proxy *mcpv1alpha1.MCPRemoteProxy,
667+
) bool {
668+
if deployment == nil || proxy == nil {
669+
return true
670+
}
671+
576672
expectedLabels := labelsForMCPRemoteProxy(proxy.Name)
577673
expectedAnnotations := make(map[string]string)
578674

@@ -599,6 +695,35 @@ func (r *MCPRemoteProxyReconciler) deploymentNeedsUpdate(
599695
return false
600696
}
601697

698+
// podTemplateMetadataNeedsUpdate checks if pod template metadata has changed.
699+
//
700+
// Compares pod template labels and annotations, including the critical RunConfig
701+
// checksum annotation that triggers pod restarts when configuration changes.
702+
// Also includes any user-specified overrides from ResourceOverrides.PodTemplateMetadata.
703+
func (r *MCPRemoteProxyReconciler) podTemplateMetadataNeedsUpdate(
704+
deployment *appsv1.Deployment,
705+
proxy *mcpv1alpha1.MCPRemoteProxy,
706+
runConfigChecksum string,
707+
) bool {
708+
if deployment == nil || proxy == nil {
709+
return true
710+
}
711+
712+
expectedPodTemplateLabels, expectedPodTemplateAnnotations := r.buildPodTemplateMetadata(
713+
labelsForMCPRemoteProxy(proxy.Name), proxy, runConfigChecksum,
714+
)
715+
716+
if !maps.Equal(deployment.Spec.Template.Labels, expectedPodTemplateLabels) {
717+
return true
718+
}
719+
720+
if !maps.Equal(deployment.Spec.Template.Annotations, expectedPodTemplateAnnotations) {
721+
return true
722+
}
723+
724+
return false
725+
}
726+
602727
// serviceNeedsUpdate checks if the service needs to be updated
603728
func (*MCPRemoteProxyReconciler) serviceNeedsUpdate(service *corev1.Service, proxy *mcpv1alpha1.MCPRemoteProxy) bool {
604729
// Check if port has changed

cmd/thv-operator/controllers/mcpremoteproxy_deployment.go

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,13 @@ import (
1313

1414
mcpv1alpha1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1alpha1"
1515
ctrlutil "github.com/stacklok/toolhive/cmd/thv-operator/pkg/controllerutil"
16+
"github.com/stacklok/toolhive/cmd/thv-operator/pkg/runconfig/configmap/checksum"
1617
"github.com/stacklok/toolhive/pkg/container/kubernetes"
1718
)
1819

1920
// deploymentForMCPRemoteProxy returns a MCPRemoteProxy Deployment object
2021
func (r *MCPRemoteProxyReconciler) deploymentForMCPRemoteProxy(
21-
ctx context.Context, proxy *mcpv1alpha1.MCPRemoteProxy,
22+
ctx context.Context, proxy *mcpv1alpha1.MCPRemoteProxy, runConfigChecksum string,
2223
) *appsv1.Deployment {
2324
ls := labelsForMCPRemoteProxy(proxy.Name)
2425
replicas := int32(1)
@@ -29,7 +30,7 @@ func (r *MCPRemoteProxyReconciler) deploymentForMCPRemoteProxy(
2930
env := r.buildEnvVarsForProxy(ctx, proxy)
3031
resources := ctrlutil.BuildResourceRequirements(proxy.Spec.Resources)
3132
deploymentLabels, deploymentAnnotations := r.buildDeploymentMetadata(ls, proxy)
32-
deploymentTemplateLabels, deploymentTemplateAnnotations := r.buildPodTemplateMetadata(ls, proxy)
33+
deploymentTemplateLabels, deploymentTemplateAnnotations := r.buildPodTemplateMetadata(ls, proxy, runConfigChecksum)
3334
podSecurityContext, containerSecurityContext := r.buildSecurityContexts(ctx, proxy)
3435

3536
dep := &appsv1.Deployment{
@@ -184,13 +185,24 @@ func (*MCPRemoteProxyReconciler) buildDeploymentMetadata(
184185
return deploymentLabels, deploymentAnnotations
185186
}
186187

187-
// buildPodTemplateMetadata builds pod template labels and annotations
188+
// buildPodTemplateMetadata builds pod template labels and annotations.
189+
//
190+
// The runConfigChecksum parameter must be a non-empty SHA256 hash of the RunConfig.
191+
// This checksum is added as an annotation to the pod template, which triggers
192+
// Kubernetes to perform a rolling update when the configuration changes.
193+
//
194+
// User-specified overrides from ResourceOverrides.PodTemplateMetadataOverrides
195+
// are merged after the checksum annotation is set.
188196
func (*MCPRemoteProxyReconciler) buildPodTemplateMetadata(
189-
baseLabels map[string]string, proxy *mcpv1alpha1.MCPRemoteProxy,
197+
baseLabels map[string]string, proxy *mcpv1alpha1.MCPRemoteProxy, runConfigChecksum string,
190198
) (map[string]string, map[string]string) {
191199
templateLabels := baseLabels
192200
templateAnnotations := make(map[string]string)
193201

202+
// Add RunConfig checksum annotation to trigger pod rollout when config changes
203+
// This is critical for ensuring pods restart with updated configuration
204+
templateAnnotations = checksum.AddRunConfigChecksumToPodTemplate(templateAnnotations, runConfigChecksum)
205+
194206
if proxy.Spec.ResourceOverrides != nil &&
195207
proxy.Spec.ResourceOverrides.ProxyDeployment != nil &&
196208
proxy.Spec.ResourceOverrides.ProxyDeployment.PodTemplateMetadataOverrides != nil {

cmd/thv-operator/controllers/mcpremoteproxy_deployment_test.go

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ package controllers
1616

1717
import (
1818
"context"
19+
"fmt"
1920
"testing"
2021

2122
"github.com/stretchr/testify/assert"
@@ -175,7 +176,7 @@ func TestDeploymentForMCPRemoteProxy(t *testing.T) {
175176
PlatformDetector: ctrlutil.NewSharedPlatformDetector(),
176177
}
177178

178-
dep := reconciler.deploymentForMCPRemoteProxy(context.TODO(), tt.proxy)
179+
dep := reconciler.deploymentForMCPRemoteProxy(context.TODO(), tt.proxy, "test-checksum")
179180
require.NotNil(t, dep)
180181

181182
if tt.validate != nil {
@@ -406,6 +407,22 @@ func TestEnsureDeployment(t *testing.T) {
406407
objects = append(objects, tt.existingDeployment)
407408
}
408409

410+
// Add RunConfig ConfigMap with checksum annotation
411+
configMapName := fmt.Sprintf("%s-runconfig", tt.proxy.Name)
412+
runConfigCM := &corev1.ConfigMap{
413+
ObjectMeta: metav1.ObjectMeta{
414+
Name: configMapName,
415+
Namespace: tt.proxy.Namespace,
416+
Annotations: map[string]string{
417+
"toolhive.stacklok.dev/content-checksum": "test-checksum-123",
418+
},
419+
},
420+
Data: map[string]string{
421+
"runconfig.json": "{}",
422+
},
423+
}
424+
objects = append(objects, runConfigCM)
425+
409426
fakeClient := fake.NewClientBuilder().
410427
WithScheme(scheme).
411428
WithRuntimeObjects(objects...).

0 commit comments

Comments
 (0)