From 3e9e798eda7a2c785b76f2b311b239c1b9d979d5 Mon Sep 17 00:00:00 2001 From: Jakub Hrozek Date: Fri, 1 May 2026 09:49:19 +0100 Subject: [PATCH] Preserve runconfig-checksum on pod template overrides deploymentForMCPServer merged user-supplied PodTemplateMetadataOverrides.Annotations onto the wrong base map: it passed deploymentAnnotations (the Deployment-level overrides, typically empty) instead of deploymentTemplateAnnotations (which already had the runconfig-checksum stamped on it). Because ctrlutil.MergeAnnotations is "first-arg-wins on conflict", the checksum was silently dropped from the pod template and any Deployment-level annotations leaked onto pods. Two user-visible symptoms followed: 1. Proxy pods stopped rolling on RunConfig changes whenever PodTemplateMetadataOverrides.Annotations was set. Without the checksum on the pod template, Kubernetes had no signal to recreate pods, so telemetry / inline-OIDC / external- auth-ref edits landed in the ConfigMap on disk but never reached the running proxy. Vault Agent users were the canonical population affected, since vault.hashicorp.com/* keys live in this override field. 2. The drift-checker deploymentNeedsUpdate built the expected pod-template annotations correctly, so it disagreed with the constructor on every reconcile. The operator hot-looped on r.Update without converging. The fix flips one variable so the merge starts from deploymentTemplateAnnotations. Add three layers of test coverage in mcpserver_resource_overrides_test.go: - an expectedPodTemplateAnns column on the table-driven resource-overrides test so every case asserts on deployment.Spec.Template.Annotations - a focused regression test pinning the no-leakage contract (checksum survives, user override survives, no extras) - a parity test asserting deploymentNeedsUpdate reports no drift immediately after deploymentForMCPServer with the same checksum and overrides Why this is safe: - The comparator deploymentNeedsUpdate in the same file was already merging in this order, and the sibling controller MCPRemoteProxyReconciler.buildPodTemplateMetadata uses this exact pattern. The fix brings the buggy constructor in line with both. - Kubernetes convention treats Deployment-level and pod-template annotations as separate scopes; the CRD exposes them as separate user-facing fields (ProxyDeployment.Annotations vs ProxyDeployment.PodTemplateMetadataOverrides.Annotations). No documentation, example (including examples/operator/vault/mcpserver-github-with-vault.yaml), or test in this repository places sidecar-injection annotations at the deployment level, so no documented user is regressed. - MergeAnnotations keeps the first-argument map winning on conflict, so a user cannot accidentally overwrite the runconfig-checksum from their override map. Fixes #5148 --- .../controllers/mcpserver_controller.go | 2 +- .../mcpserver_resource_overrides_test.go | 101 ++++++++++++++++++ 2 files changed, 102 insertions(+), 1 deletion(-) diff --git a/cmd/thv-operator/controllers/mcpserver_controller.go b/cmd/thv-operator/controllers/mcpserver_controller.go index 55e9240095..5ee07f5071 100644 --- a/cmd/thv-operator/controllers/mcpserver_controller.go +++ b/cmd/thv-operator/controllers/mcpserver_controller.go @@ -1218,7 +1218,7 @@ func (r *MCPServerReconciler) deploymentForMCPServer( m.Spec.ResourceOverrides.ProxyDeployment.PodTemplateMetadataOverrides.Labels) } if m.Spec.ResourceOverrides.ProxyDeployment.PodTemplateMetadataOverrides.Annotations != nil { - deploymentTemplateAnnotations = ctrlutil.MergeAnnotations(deploymentAnnotations, + deploymentTemplateAnnotations = ctrlutil.MergeAnnotations(deploymentTemplateAnnotations, m.Spec.ResourceOverrides.ProxyDeployment.PodTemplateMetadataOverrides.Annotations) } } diff --git a/cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go b/cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go index d370b5f91f..403b5c3a0d 100644 --- a/cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go +++ b/cmd/thv-operator/controllers/mcpserver_resource_overrides_test.go @@ -26,6 +26,7 @@ import ( mcpv1beta1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1beta1" ctrlutil "github.com/stacklok/toolhive/cmd/thv-operator/pkg/controllerutil" + checksum "github.com/stacklok/toolhive/cmd/thv-operator/pkg/runconfig/configmap/checksum" "github.com/stacklok/toolhive/pkg/container/kubernetes" ) @@ -223,6 +224,7 @@ func TestResourceOverrides(t *testing.T) { mcpServer *mcpv1beta1.MCPServer expectedDeploymentLabels map[string]string expectedDeploymentAnns map[string]string + expectedPodTemplateAnns map[string]string expectedServiceLabels map[string]string expectedServiceAnns map[string]string }{ @@ -246,6 +248,9 @@ func TestResourceOverrides(t *testing.T) { "toolhive-name": "test-server", }, expectedDeploymentAnns: map[string]string{}, + expectedPodTemplateAnns: map[string]string{ + "toolhive.stacklok.dev/runconfig-checksum": "test-checksum", + }, expectedServiceLabels: map[string]string{ "app": "mcpserver", "app.kubernetes.io/name": "mcpserver", @@ -306,6 +311,9 @@ func TestResourceOverrides(t *testing.T) { "custom-annotation": "deployment-annotation", "monitoring/scrape": "true", }, + expectedPodTemplateAnns: map[string]string{ + "toolhive.stacklok.dev/runconfig-checksum": "test-checksum", + }, expectedServiceLabels: map[string]string{ "app": "mcpserver", "app.kubernetes.io/name": "mcpserver", @@ -364,6 +372,9 @@ func TestResourceOverrides(t *testing.T) { "environment": "test", }, expectedDeploymentAnns: map[string]string{}, + expectedPodTemplateAnns: map[string]string{ + "toolhive.stacklok.dev/runconfig-checksum": "test-checksum", + }, expectedServiceLabels: map[string]string{ "app": "mcpserver", "app.kubernetes.io/name": "mcpserver", @@ -400,6 +411,9 @@ func TestResourceOverrides(t *testing.T) { "toolhive-name": "test-server", }, expectedDeploymentAnns: map[string]string{}, + expectedPodTemplateAnns: map[string]string{ + "toolhive.stacklok.dev/runconfig-checksum": "test-checksum", + }, expectedServiceLabels: map[string]string{ "app": "mcpserver", "app.kubernetes.io/name": "mcpserver", @@ -463,6 +477,9 @@ func TestResourceOverrides(t *testing.T) { "monitoring/enabled": "true", "version": "v1.2.3", }, + expectedPodTemplateAnns: map[string]string{ + "toolhive.stacklok.dev/runconfig-checksum": "test-checksum", + }, expectedServiceLabels: map[string]string{ "app": "mcpserver", "app.kubernetes.io/name": "mcpserver", @@ -504,6 +521,11 @@ func TestResourceOverrides(t *testing.T) { "toolhive-name": "test-server", }, expectedDeploymentAnns: map[string]string{}, + expectedPodTemplateAnns: map[string]string{ + "vault.hashicorp.com/agent-inject": "true", + "vault.hashicorp.com/role": "toolhive-mcp-workloads", + "toolhive.stacklok.dev/runconfig-checksum": "test-checksum", + }, expectedServiceLabels: map[string]string{ "app": "mcpserver", "app.kubernetes.io/name": "mcpserver", @@ -529,6 +551,8 @@ func TestResourceOverrides(t *testing.T) { assert.Equal(t, tt.expectedDeploymentLabels, deployment.Labels) assert.Equal(t, tt.expectedDeploymentAnns, deployment.Annotations) + assert.Equal(t, tt.expectedPodTemplateAnns, deployment.Spec.Template.Annotations, + "pod template annotations must contain user overrides plus the runconfig-checksum") // Test service creation service := r.serviceForMCPServer(t.Context(), tt.mcpServer) @@ -593,6 +617,83 @@ func TestResourceOverrides(t *testing.T) { } } +func TestDeploymentForMCPServer_PodTemplateOverridesPreserveRunConfigChecksum(t *testing.T) { + t.Parallel() + + scheme := runtime.NewScheme() + require.NoError(t, mcpv1beta1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + client := fake.NewClientBuilder().WithScheme(scheme).Build() + r := newTestMCPServerReconciler(client, scheme, kubernetes.PlatformKubernetes) + + mcpServer := &mcpv1beta1.MCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: "test-server", Namespace: "default"}, + Spec: mcpv1beta1.MCPServerSpec{ + Image: "test:latest", + ResourceOverrides: &mcpv1beta1.ResourceOverrides{ + ProxyDeployment: &mcpv1beta1.ProxyDeploymentOverrides{ + PodTemplateMetadataOverrides: &mcpv1beta1.ResourceMetadataOverrides{ + Annotations: map[string]string{ + "user.example.com/some-key": "value", + }, + }, + }, + }, + }, + } + + deployment := r.deploymentForMCPServer(t.Context(), mcpServer, "C1") + require.NotNil(t, deployment) + + assert.Equal(t, "C1", + deployment.Spec.Template.Annotations[checksum.RunConfigChecksumAnnotation], + "runconfig-checksum must survive when PodTemplateMetadataOverrides.Annotations is set") + assert.Equal(t, "value", + deployment.Spec.Template.Annotations["user.example.com/some-key"], + "user override must survive") + assert.Len(t, deployment.Spec.Template.Annotations, 2, + "no extra keys should leak into the pod template") +} + +func TestDeploymentNeedsUpdate_StableAfterBuildWithPodTemplateOverrides(t *testing.T) { + t.Parallel() + + scheme := runtime.NewScheme() + require.NoError(t, mcpv1beta1.AddToScheme(scheme)) + require.NoError(t, corev1.AddToScheme(scheme)) + + client := fake.NewClientBuilder().WithScheme(scheme).Build() + r := newTestMCPServerReconciler(client, scheme, kubernetes.PlatformKubernetes) + + mcpServer := &mcpv1beta1.MCPServer{ + ObjectMeta: metav1.ObjectMeta{Name: "test-server", Namespace: "default"}, + Spec: mcpv1beta1.MCPServerSpec{ + Image: "test:latest", + ResourceOverrides: &mcpv1beta1.ResourceOverrides{ + ProxyDeployment: &mcpv1beta1.ProxyDeploymentOverrides{ + PodTemplateMetadataOverrides: &mcpv1beta1.ResourceMetadataOverrides{ + Annotations: map[string]string{ + "vault.hashicorp.com/agent-inject": "true", + "vault.hashicorp.com/role": "toolhive-mcp-workloads", + }, + }, + }, + }, + }, + } + + const runConfigChecksum = "stable-checksum" + built := r.deploymentForMCPServer(t.Context(), mcpServer, runConfigChecksum) + require.NotNil(t, built) + + // Constructor and comparator must agree on the same input — otherwise the + // operator gets stuck in a perpetual r.Update loop on every reconcile. + needsUpdate := r.deploymentNeedsUpdate(t.Context(), built, mcpServer, runConfigChecksum) + assert.False(t, needsUpdate, + "deploymentNeedsUpdate must report no drift immediately after deploymentForMCPServer with the same checksum and overrides") +} + func TestMergeStringMaps(t *testing.T) { t.Parallel()