diff --git a/cmd/thv-operator/api/v1beta1/embeddingserver_types.go b/cmd/thv-operator/api/v1beta1/embeddingserver_types.go index 2b14b22c34..e088b9876e 100644 --- a/cmd/thv-operator/api/v1beta1/embeddingserver_types.go +++ b/cmd/thv-operator/api/v1beta1/embeddingserver_types.go @@ -210,6 +210,7 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:resource:shortName=emb;embedding,categories=toolhive //+kubebuilder:printcolumn:name="Status",type="string",JSONPath=".status.phase" //+kubebuilder:printcolumn:name="Model",type="string",JSONPath=".spec.model" diff --git a/cmd/thv-operator/api/v1beta1/mcpexternalauthconfig_types.go b/cmd/thv-operator/api/v1beta1/mcpexternalauthconfig_types.go index 01558865d1..2a77ccda96 100644 --- a/cmd/thv-operator/api/v1beta1/mcpexternalauthconfig_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpexternalauthconfig_types.go @@ -768,6 +768,7 @@ type MCPExternalAuthConfigStatus struct { // +kubebuilder:object:root=true // +kubebuilder:storageversion // +kubebuilder:subresource:status +// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true // +kubebuilder:resource:shortName=extauth;mcpextauth,categories=toolhive // +kubebuilder:printcolumn:name="Type",type=string,JSONPath=`.spec.type` // +kubebuilder:printcolumn:name="Valid",type=string,JSONPath=`.status.conditions[?(@.type=='Valid')].status` diff --git a/cmd/thv-operator/api/v1beta1/mcpgroup_types.go b/cmd/thv-operator/api/v1beta1/mcpgroup_types.go index a151448943..4793986182 100644 --- a/cmd/thv-operator/api/v1beta1/mcpgroup_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpgroup_types.go @@ -88,6 +88,7 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true 
//+kubebuilder:resource:shortName=mcpg;mcpgroup,categories=toolhive //+kubebuilder:printcolumn:name="Servers",type="integer",JSONPath=".status.serverCount" //+kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase" diff --git a/cmd/thv-operator/api/v1beta1/mcpoidcconfig_types.go b/cmd/thv-operator/api/v1beta1/mcpoidcconfig_types.go index bd711bfd4f..6d05419f4c 100644 --- a/cmd/thv-operator/api/v1beta1/mcpoidcconfig_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpoidcconfig_types.go @@ -196,6 +196,7 @@ type MCPOIDCConfigStatus struct { // +kubebuilder:object:root=true // +kubebuilder:storageversion // +kubebuilder:subresource:status +// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true // +kubebuilder:resource:shortName=mcpoidc,categories=toolhive // +kubebuilder:printcolumn:name="Source",type=string,JSONPath=`.spec.type` // +kubebuilder:printcolumn:name="Valid",type=string,JSONPath=`.status.conditions[?(@.type=='Valid')].status` diff --git a/cmd/thv-operator/api/v1beta1/mcpregistry_types.go b/cmd/thv-operator/api/v1beta1/mcpregistry_types.go index a95fd5a775..560c46715d 100644 --- a/cmd/thv-operator/api/v1beta1/mcpregistry_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpregistry_types.go @@ -170,6 +170,7 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:printcolumn:name="Status",type="string",JSONPath=".status.phase" //+kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" //+kubebuilder:printcolumn:name="Replicas",type="integer",JSONPath=".status.readyReplicas" diff --git a/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go b/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go index 3b38d591a7..2105719152 100644 --- a/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go +++ 
b/cmd/thv-operator/api/v1beta1/mcpremoteproxy_types.go @@ -348,6 +348,7 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:resource:shortName=rp;mcprp,categories=toolhive //+kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase" //+kubebuilder:printcolumn:name="Remote URL",type="string",JSONPath=".spec.remoteUrl" diff --git a/cmd/thv-operator/api/v1beta1/mcpserver_types.go b/cmd/thv-operator/api/v1beta1/mcpserver_types.go index 3a21eac58e..1fc7534f82 100644 --- a/cmd/thv-operator/api/v1beta1/mcpserver_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpserver_types.go @@ -869,6 +869,7 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:resource:shortName=mcpserver;mcpservers,categories=toolhive //+kubebuilder:printcolumn:name="Status",type="string",JSONPath=".status.phase" //+kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" diff --git a/cmd/thv-operator/api/v1beta1/mcpserverentry_types.go b/cmd/thv-operator/api/v1beta1/mcpserverentry_types.go index 6c5043d6e7..96e22571b0 100644 --- a/cmd/thv-operator/api/v1beta1/mcpserverentry_types.go +++ b/cmd/thv-operator/api/v1beta1/mcpserverentry_types.go @@ -151,6 +151,7 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:resource:shortName=mcpentry,categories=toolhive //+kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase" //+kubebuilder:printcolumn:name="Transport",type="string",JSONPath=".spec.transport" diff --git 
a/cmd/thv-operator/api/v1beta1/mcptelemetryconfig_types.go b/cmd/thv-operator/api/v1beta1/mcptelemetryconfig_types.go index 7e1685227c..c35eb73de2 100644 --- a/cmd/thv-operator/api/v1beta1/mcptelemetryconfig_types.go +++ b/cmd/thv-operator/api/v1beta1/mcptelemetryconfig_types.go @@ -139,6 +139,7 @@ type MCPTelemetryConfigStatus struct { // +kubebuilder:object:root=true // +kubebuilder:storageversion // +kubebuilder:subresource:status +// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true // +kubebuilder:resource:shortName=mcpotel,categories=toolhive // +kubebuilder:printcolumn:name="Endpoint",type=string,JSONPath=`.spec.openTelemetry.endpoint` // +kubebuilder:printcolumn:name="Valid",type=string,JSONPath=`.status.conditions[?(@.type=='Valid')].status` diff --git a/cmd/thv-operator/api/v1beta1/toolconfig_types.go b/cmd/thv-operator/api/v1beta1/toolconfig_types.go index 55e206b414..582f8d2ed4 100644 --- a/cmd/thv-operator/api/v1beta1/toolconfig_types.go +++ b/cmd/thv-operator/api/v1beta1/toolconfig_types.go @@ -108,6 +108,7 @@ type MCPToolConfigStatus struct { // +kubebuilder:object:root=true // +kubebuilder:storageversion // +kubebuilder:subresource:status +// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true // +kubebuilder:resource:shortName=tc;toolconfig,categories=toolhive // +kubebuilder:printcolumn:name="Valid",type=string,JSONPath=`.status.conditions[?(@.type=='Valid')].status` // +kubebuilder:printcolumn:name="References",type=string,JSONPath=`.status.referencingWorkloads` diff --git a/cmd/thv-operator/api/v1beta1/virtualmcpcompositetooldefinition_types.go b/cmd/thv-operator/api/v1beta1/virtualmcpcompositetooldefinition_types.go index e2a4ec0da6..7f09e7ebeb 100644 --- a/cmd/thv-operator/api/v1beta1/virtualmcpcompositetooldefinition_types.go +++ b/cmd/thv-operator/api/v1beta1/virtualmcpcompositetooldefinition_types.go @@ -100,6 +100,7 @@ const ( //+kubebuilder:object:root=true 
//+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:resource:shortName=vmcpctd;compositetool,categories=toolhive //+kubebuilder:printcolumn:name="Workflow",type="string",JSONPath=".spec.name",description="Workflow name" //+kubebuilder:printcolumn:name="Steps",type="integer",JSONPath=".spec.steps[*]",description="Number of steps" diff --git a/cmd/thv-operator/api/v1beta1/virtualmcpserver_types.go b/cmd/thv-operator/api/v1beta1/virtualmcpserver_types.go index 6a9f72b3d9..1e99d9450a 100644 --- a/cmd/thv-operator/api/v1beta1/virtualmcpserver_types.go +++ b/cmd/thv-operator/api/v1beta1/virtualmcpserver_types.go @@ -392,6 +392,7 @@ const ( //+kubebuilder:object:root=true //+kubebuilder:storageversion //+kubebuilder:subresource:status +//+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true //+kubebuilder:resource:shortName=vmcp;virtualmcp,categories=toolhive //+kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase",description="The phase of the VirtualMCPServer" //+kubebuilder:printcolumn:name="URL",type="string",JSONPath=".status.url",description="Virtual MCP server URL" diff --git a/cmd/thv-operator/controllers/marker_coverage_test.go b/cmd/thv-operator/controllers/marker_coverage_test.go new file mode 100644 index 0000000000..bce7f70741 --- /dev/null +++ b/cmd/thv-operator/controllers/marker_coverage_test.go @@ -0,0 +1,151 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. 
+// SPDX-License-Identifier: Apache-2.0 + +package controllers + +import ( + "bufio" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestV1beta1TypesMarkerCoverage guards against a subtle failure mode in the +// opt-in-label design for storage-version migration: if a new CRD root type is +// added to cmd/thv-operator/api/v1beta1/ without the +// +// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true +// +// marker, the StorageVersionMigrator controller silently excludes it from +// reconciliation. The problem surfaces only when a future release tries to +// drop a deprecated version — at which point it is far too late. +// +// The test scans every root type (marker block contains +// +kubebuilder:object:root=true AND +kubebuilder:storageversion) and fails if +// any lacks either the migrate marker or an explicit +// +thv:storage-version-migrator:exclude sibling marker. List types +// (+kubebuilder:object:root=true WITHOUT +kubebuilder:storageversion) are +// intentionally skipped — CRD-level labels are keyed on the root type, not +// the list type. 
+func TestV1beta1TypesMarkerCoverage(t *testing.T) { + t.Parallel() + + typesDir := filepath.Join("..", "api", "v1beta1") + entries, err := os.ReadDir(typesDir) + require.NoError(t, err, "reading %s", typesDir) + + const ( + rootMarker = "+kubebuilder:object:root=true" + storageMarker = "+kubebuilder:storageversion" + migrateMarker = "+kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true" + excludeMarker = "+thv:storage-version-migrator:exclude" + groupversionInfoGo = "groupversion_info.go" + zzGeneratedPrefix = "zz_generated" + suffixTypesGo = "_types.go" + ) + + type rootType struct { + file string + typeName string + markerBlock []string + } + + var roots []rootType + + for _, e := range entries { + if e.IsDir() { + continue + } + name := e.Name() + if !strings.HasSuffix(name, suffixTypesGo) { + continue + } + if name == groupversionInfoGo || strings.HasPrefix(name, zzGeneratedPrefix) { + continue + } + + path := filepath.Join(typesDir, name) + f, err := os.Open(path) + require.NoError(t, err, "open %s", path) + + scanner := bufio.NewScanner(f) + // Raise the max token size — some of the *_types.go files have very + // long comment lines (printcolumn JSONPath expressions). + scanner.Buffer(make([]byte, 64*1024), 1024*1024) + + var block []string + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + + switch { + case strings.HasPrefix(line, "//"): + // Accumulate every comment/marker line. kubebuilder convention + // often separates markers from the godoc comment with a blank + // line, so we keep the block alive across blanks and only + // reset on a non-comment, non-blank, non-type line. + block = append(block, strings.TrimSpace(strings.TrimPrefix(line, "//"))) + case strings.HasPrefix(line, "type "): + // Only root-level types matter (must contain object:root=true + // AND storageversion markers). List types have only object:root=true. 
+ if containsMarker(block, rootMarker) && containsMarker(block, storageMarker) { + typeName := extractTypeName(line) + if typeName != "" { + copied := append([]string(nil), block...) + roots = append(roots, rootType{file: name, typeName: typeName, markerBlock: copied}) + } + } + block = nil + case line == "": + // Blank line — keep block alive (comment-then-blank-then-type + // is idiomatic Go + kubebuilder). + default: + // Anything else (e.g. struct body, package clause, import) — + // drop any in-flight comment block. + block = nil + } + } + require.NoError(t, scanner.Err(), "scan %s", path) + require.NoError(t, f.Close()) + } + + require.NotEmpty(t, roots, + "no v1beta1 root types found — scanner likely broken; this test is meaningless without coverage") + + for _, r := range roots { + hasMigrate := containsMarker(r.markerBlock, migrateMarker) + hasExclude := containsMarker(r.markerBlock, excludeMarker) + assert.Truef(t, hasMigrate || hasExclude, + "v1beta1 root type %s.%s is missing either\n"+ + " %s\n"+ + "(opt in to storage-version migration) or\n"+ + " %s\n"+ + "(explicit opt-out). Every root type must declare one. See\n"+ + "cmd/thv-operator/controllers/storageversionmigrator_controller.go for context.", + r.file, r.typeName, migrateMarker, excludeMarker) + } +} + +// containsMarker returns true if any line in block contains the given +// marker substring. +func containsMarker(block []string, marker string) bool { + for _, l := range block { + if strings.Contains(l, marker) { + return true + } + } + return false +} + +// extractTypeName returns the identifier in a line of the form `type Foo struct {`. +// Returns empty string if the line is not a type declaration we care about. 
+func extractTypeName(line string) string { + fields := strings.Fields(line) + if len(fields) < 2 || fields[0] != "type" { + return "" + } + return fields[1] +} diff --git a/cmd/thv-operator/controllers/storageversionmigrator_controller.go b/cmd/thv-operator/controllers/storageversionmigrator_controller.go new file mode 100644 index 0000000000..10c147c6b3 --- /dev/null +++ b/cmd/thv-operator/controllers/storageversionmigrator_controller.go @@ -0,0 +1,481 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package controllers + +import ( + "context" + "errors" + "fmt" + "strings" + "sync" + "time" + + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + apitypes "k8s.io/apimachinery/pkg/types" + kerrors "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/predicate" +) + +// Public contract for the StorageVersionMigrator controller. + +// AutoMigrateLabel identifies CRDs that opt in to storage-version migration. +// Applied via a kubebuilder marker on each root type in api/v1beta1/. +const AutoMigrateLabel = "toolhive.stacklok.dev/auto-migrate-storage-version" + +// AutoMigrateValue is the label value that enables migration for a CRD. +const AutoMigrateValue = "true" + +// ToolhiveGroup is the API group the controller is scoped to (belt-and-braces +// filter in addition to the opt-in label). 
+const ToolhiveGroup = "toolhive.stacklok.dev" + +// EventReasonMigrationSucceeded and EventReasonMigrationFailed are the event +// reasons emitted on the owning CRD when a migration completes or fails. +const ( + EventReasonMigrationSucceeded = "StorageVersionMigrationSucceeded" + EventReasonMigrationFailed = "StorageVersionMigrationFailed" +) + +const ( + defaultMigrationCacheTTL = 1 * time.Hour + defaultListPageSize = 500 + defaultCacheGCInterval = 10 * time.Minute +) + +// errMigrationRetriedDueToConflicts is returned by restoreCRs when at least one +// CR re-store hit a typed Conflict (and no other errors occurred). The caller +// must NOT trim CRD.status.storedVersions in this case: the post-conflict state +// of the affected object is unverified, so reasoning about whether the storage +// re-encode actually happened is unsafe. The next reconcile retries cleanly. +var errMigrationRetriedDueToConflicts = errors.New( + "storage version migration retried due to concurrent writes; storedVersions left unchanged") + +// The wildcard CR RBAC below is intentional. The set of opted-in CRDs isn't +// known at codegen time — it's a per-CRD runtime label decision — so the +// kubebuilder marker can't enumerate kinds. The runtime gate is the +// isManagedCRD check inside Reconcile, which requires both the toolhive +// API group AND the opt-in label. Wildcard RBAC plus isManagedCRD form the +// defence in depth: RBAC bounds the controller to a single API group, and +// the label gate further restricts it to opted-in CRDs. 
+ +//+kubebuilder:rbac:groups=apiextensions.k8s.io,resources=customresourcedefinitions,verbs=get;list;watch +//+kubebuilder:rbac:groups=apiextensions.k8s.io,resources=customresourcedefinitions/status,verbs=update;patch +//+kubebuilder:rbac:groups=toolhive.stacklok.dev,resources=*,verbs=get;list;update +//+kubebuilder:rbac:groups=toolhive.stacklok.dev,resources=*/status,verbs=update + +// StorageVersionMigratorReconciler reconciles CustomResourceDefinition objects +// in the toolhive.stacklok.dev group that carry the opt-in +// AutoMigrateLabel=AutoMigrateValue. For each such CRD it re-stores every CR +// at the current storage version by doing a Get + Update on the live object. +// The Update is a full PUT of the unmodified object; the apiserver re-encodes +// the request body at the current storage version, then compares the +// resulting bytes to what's in etcd. When the CR was originally stored at a +// different version (the actual migration scenario) the bytes carry a +// different apiVersion stamp than etcd's record, the comparison fails, and +// the write proceeds — re-encoding the object at the current storage +// version. When the CR is already at the current storage version, the bytes +// match and the apiserver harmlessly elides the write — there was nothing to +// migrate. After all CRs have been processed it patches +// CRD.status.storedVersions down to exactly [storageVersion] so a future +// release can drop deprecated versions from spec.versions without orphaning +// etcd objects. See https://github.com/kubernetes-sigs/kube-storage-version-migrator/issues/65 +// for the upstream maintainers' explanation of this mechanism. +// +// Enabled by default. Opt out operator-wide via +// operator.features.storageVersionMigrator (ENABLE_STORAGE_VERSION_MIGRATOR=false) +// for admins who prefer to run kube-storage-version-migrator externally. +// Per-kind escape hatch: remove the label from the CRD (emergency only — will +// be re-applied by GitOps / helm upgrade). 
+type StorageVersionMigratorReconciler struct { + // used for CR Update writes and the CRD /status storedVersions patch; + // reads go through APIReader to bypass the informer cache. + client.Client + APIReader client.Reader // live reads for CRDs and CR list pages (bypasses informer) + Scheme *runtime.Scheme // kubebuilder reconciler convention + Recorder record.EventRecorder // MigrationSucceeded / MigrationFailed events on the CRD + PageSize int64 // overrideable for tests; defaults to defaultListPageSize + CacheGCInterval time.Duration // overrideable for tests; defaults to defaultCacheGCInterval + cache *migrationCache +} + +// Reconcile runs for each opted-in toolhive.stacklok.dev CRD event. See the +// package-level docs on StorageVersionMigratorReconciler for the full flow. +// Returns a non-nil error to trigger exponential backoff; the CRD watch +// re-enqueues on any status change, so explicit requeue intervals are not used. +func (r *StorageVersionMigratorReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx).WithValues("crd", req.Name) + + r.ensureInitialized() + + // Live-read the CRD. Informer cache may lag label or storedVersions updates. + crd := &apiextensionsv1.CustomResourceDefinition{} + if err := r.APIReader.Get(ctx, req.NamespacedName, crd); err != nil { + if apierrors.IsNotFound(err) { + return ctrl.Result{}, nil + } + return ctrl.Result{}, fmt.Errorf("get CRD %s: %w", req.Name, err) + } + + // Re-verify the filter against live state; watch predicate could have + // fired on stale informer data. + if !isManagedCRD(crd) { + return ctrl.Result{}, nil + } + + storageVersion, ok := findStorageVersion(crd) + if !ok { + // CRDs without a storage version are malformed from our perspective; + // log and skip rather than fail (the API server would have rejected + // a CRD without a storage version, so this is unreachable in practice). 
+ logger.Info("CRD has no storage version, skipping", "spec.versions", crd.Spec.Versions) + return ctrl.Result{}, nil + } + + if !isMigrationNeeded(crd, storageVersion) { + return ctrl.Result{}, nil + } + + logger.Info("migrating storage versions", + "storageVersion", storageVersion, + "storedVersions", crd.Status.StoredVersions, + ) + + if err := r.restoreCRs(ctx, crd, storageVersion); err != nil { + r.Recorder.Eventf(crd, "Warning", EventReasonMigrationFailed, + "storage version migration failed: %v", err) + return ctrl.Result{}, fmt.Errorf("re-store CRs for %s: %w", crd.Name, err) + } + + if err := r.patchStoredVersions(ctx, crd, storageVersion); err != nil { + r.Recorder.Eventf(crd, "Warning", EventReasonMigrationFailed, + "storedVersions patch failed: %v", err) + return ctrl.Result{}, fmt.Errorf("patch storedVersions for %s: %w", crd.Name, err) + } + + r.Recorder.Eventf(crd, "Normal", EventReasonMigrationSucceeded, + "storage version migrated to %s", storageVersion) + logger.Info("storage version migration complete", "storageVersion", storageVersion) + return ctrl.Result{}, nil +} + +// SetupWithManager wires the reconciler to watch CRDs using PartialObjectMetadata +// (no full-object cache), filtered on the opt-in label and the toolhive.stacklok.dev +// group. The filter is evaluated twice — once on informer events here, and again +// inside Reconcile after the live APIReader read — because label removals can +// briefly race the informer. +// +// It also registers a Runnable that periodically sweeps expired entries from +// the migration cache so deleted CRs (whose UIDs never recur in subsequent +// list pages and therefore never trigger lazy eviction in has()) don't grow +// the map without bound on long-running operators with high CR churn. 
+func (r *StorageVersionMigratorReconciler) SetupWithManager(mgr ctrl.Manager) error { + r.ensureInitialized() + + labelSelector, err := labels.Parse(AutoMigrateLabel + "=" + AutoMigrateValue) + if err != nil { + return fmt.Errorf("parse label selector: %w", err) + } + + if err := ctrl.NewControllerManagedBy(mgr). + Named("storageversionmigrator"). + For( + &apiextensionsv1.CustomResourceDefinition{}, + builder.OnlyMetadata, + builder.WithPredicates( + predicate.NewPredicateFuncs(func(obj client.Object) bool { + return labelSelector.Matches(labels.Set(obj.GetLabels())) && + isToolhiveCRDName(obj.GetName()) + }), + predicate.ResourceVersionChangedPredicate{}, + ), + ). + Complete(r); err != nil { + return err + } + + // Periodic cache GC. Registered after Complete so the controller is fully + // wired when the runnable starts. + return mgr.Add(manager.RunnableFunc(func(ctx context.Context) error { + ticker := time.NewTicker(r.CacheGCInterval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return nil + case <-ticker.C: + r.cache.gc() + } + } + })) +} + +// ------------------------------------------------------------------ +// Private implementation below. +// ------------------------------------------------------------------ + +func (r *StorageVersionMigratorReconciler) ensureInitialized() { + if r.PageSize == 0 { + r.PageSize = defaultListPageSize + } + if r.CacheGCInterval == 0 { + r.CacheGCInterval = defaultCacheGCInterval + } + if r.cache == nil { + r.cache = newMigrationCache(defaultMigrationCacheTTL) + } +} + +// restoreCRs lists all CRs of the CRD's served kind (served version = storageVersion) +// and issues a main-resource Update on each one, forcing the apiserver to +// re-encode the object at the current storage version. +// +// Per-CR error handling: +// - IsNotFound: silently skipped (object deleted between list and update — +// it can't be at the old storage version anymore). 
+// - IsConflict: silently skipped at the per-CR level, but a function-level +// counter is incremented. After the loop, if any conflicts occurred and no +// other errors did, errMigrationRetriedDueToConflicts is returned so the +// caller leaves storedVersions untouched (the post-conflict state of the +// conflicting object is unverified). +// - All other errors are aggregated and returned. +func (r *StorageVersionMigratorReconciler) restoreCRs( + ctx context.Context, + crd *apiextensionsv1.CustomResourceDefinition, + storageVersion string, +) error { + logger := log.FromContext(ctx) + gvk := schema.GroupVersionKind{ + Group: crd.Spec.Group, + Version: storageVersion, + Kind: crd.Spec.Names.Kind, + } + + listGVK := gvk + listGVK.Kind = crd.Spec.Names.ListKind + + var errs []error + conflicts := 0 + var continueToken string + for { + list := &unstructured.UnstructuredList{} + list.SetGroupVersionKind(listGVK) + listOpts := []client.ListOption{client.Limit(r.PageSize)} + if continueToken != "" { + listOpts = append(listOpts, client.Continue(continueToken)) + } + if err := r.APIReader.List(ctx, list, listOpts...); err != nil { + return fmt.Errorf("list %s: %w", listGVK.String(), err) + } + + if err := meta.EachListItem(list, func(obj runtime.Object) error { + u, ok := obj.(*unstructured.Unstructured) + if !ok { + errs = append(errs, fmt.Errorf("unexpected list item type %T", obj)) + return nil + } + if r.cache.has(crd.Name, u.GetUID(), u.GetResourceVersion()) { + return nil + } + restored, err := r.restoreOne(ctx, gvk, u) + if err != nil { + switch { + case apierrors.IsNotFound(err): + logger.V(1).Info("skip CR — deleted", + "object", client.ObjectKeyFromObject(u), "err", err) + case apierrors.IsConflict(err): + conflicts++ + logger.V(1).Info("skip CR — concurrent write conflict", + "object", client.ObjectKeyFromObject(u), "err", err) + default: + errs = append(errs, fmt.Errorf("re-store %s/%s: %w", + u.GetNamespace(), u.GetName(), err)) + } + return nil + } + 
r.cache.add(crd.Name, restored.GetUID(), restored.GetResourceVersion()) + return nil + }); err != nil { + errs = append(errs, err) + } + + continueToken = list.GetContinue() + if continueToken == "" { + break + } + } + + if len(errs) == 0 && conflicts > 0 { + return errMigrationRetriedDueToConflicts + } + return kerrors.NewAggregate(errs) +} + +// restoreOne issues a plain Get + Update on the live CR. The apiserver +// re-encodes the request body at the current storage version and compares +// it to etcd's record; when the CR was originally stored at a different +// apiVersion the bytes differ, the write proceeds, and etcd is re-encoded +// at the current storage version. When the CR is already at the current +// storage version the bytes match and the apiserver harmlessly elides the +// write — there was nothing to migrate. The Update goes through the main +// resource, so validating/mutating admission webhooks on the kind see this +// request as part of normal admission flow; only requests that actually +// persist produce downstream state changes. Returns the live object after +// the update so the caller can record its post-update resourceVersion in +// the cache. +func (r *StorageVersionMigratorReconciler) restoreOne( + ctx context.Context, + gvk schema.GroupVersionKind, + original *unstructured.Unstructured, +) (*unstructured.Unstructured, error) { + live := &unstructured.Unstructured{} + live.SetGroupVersionKind(gvk) + if err := r.APIReader.Get(ctx, client.ObjectKeyFromObject(original), live); err != nil { + // IsNotFound is propagated to the caller, which handles it. + return nil, err + } + if err := r.Update(ctx, live); err != nil { + return nil, err + } + return live, nil +} + +// patchStoredVersions overwrites CRD.status.storedVersions to exactly +// [storageVersion], using an optimistic lock on the CRD's resourceVersion so +// a concurrent API-server write rejects the patch and triggers a requeue. 
+func (r *StorageVersionMigratorReconciler) patchStoredVersions( + ctx context.Context, + crd *apiextensionsv1.CustomResourceDefinition, + storageVersion string, +) error { + original := crd.DeepCopy() + crd.Status.StoredVersions = []string{storageVersion} + return r.Client.Status().Patch(ctx, crd, + client.MergeFromWithOptions(original, client.MergeFromWithOptimisticLock{})) +} + +// isManagedCRD returns true if a CRD is opted in to migration: the group matches +// toolhive.stacklok.dev and the opt-in label is set to the expected value. +func isManagedCRD(crd *apiextensionsv1.CustomResourceDefinition) bool { + if crd.Spec.Group != ToolhiveGroup { + return false + } + return crd.GetLabels()[AutoMigrateLabel] == AutoMigrateValue +} + +// isToolhiveCRDName checks whether a CRD name is of the form .toolhive.stacklok.dev, +// which is sufficient to filter at watch time. Reconcile re-verifies via the live CRD. +func isToolhiveCRDName(name string) bool { + return strings.HasSuffix(name, "."+ToolhiveGroup) +} + +// findStorageVersion returns the single version marked storage=true in the CRD spec. +func findStorageVersion(crd *apiextensionsv1.CustomResourceDefinition) (string, bool) { + for _, v := range crd.Spec.Versions { + if v.Storage { + return v.Name, true + } + } + return "", false +} + +// isMigrationNeeded returns true iff status.storedVersions is anything other +// than exactly [storageVersion]. The set of served versions does not affect +// this check — under spec.conversion.strategy=None with identical schemas, +// normal writers cannot reintroduce stale versions to storedVersions, so a +// defensive re-scan based on servedCount has no scenario to defend against. 
+func isMigrationNeeded( + crd *apiextensionsv1.CustomResourceDefinition, + storageVersion string, +) bool { + stored := crd.Status.StoredVersions + return len(stored) != 1 || stored[0] != storageVersion +} + +// ------------------------------------------------------------------ +// migrationCache: short-lived de-duplication of re-store writes. +// ------------------------------------------------------------------ + +// migrationCache records successfully-migrated (UID, resourceVersion) pairs +// so subsequent reconciles within the TTL window skip already-fresh objects. +// It is a performance optimization only — a cache miss simply issues a +// redundant (but harmless) Update. +// +// Eviction: lazy on lookup in has(), plus a periodic sweep via gc() driven +// from a manager.Runnable registered in SetupWithManager. The periodic sweep +// is required because lookups never recur for deleted CRs, so without it +// their entries would persist forever. +type migrationCache struct { + mu sync.Mutex + entries map[string]cacheEntry + ttl time.Duration + now func() time.Time +} + +type cacheEntry struct { + resourceVersion string + expiresAt time.Time +} + +func newMigrationCache(ttl time.Duration) *migrationCache { + return &migrationCache{ + entries: make(map[string]cacheEntry), + ttl: ttl, + now: time.Now, + } +} + +func (c *migrationCache) has(crdName string, uid apitypes.UID, resourceVersion string) bool { + key := c.key(crdName, uid) + c.mu.Lock() + defer c.mu.Unlock() + entry, ok := c.entries[key] + if !ok { + return false + } + if c.now().After(entry.expiresAt) { + delete(c.entries, key) + return false + } + return entry.resourceVersion == resourceVersion +} + +func (c *migrationCache) add(crdName string, uid apitypes.UID, resourceVersion string) { + key := c.key(crdName, uid) + c.mu.Lock() + defer c.mu.Unlock() + c.entries[key] = cacheEntry{ + resourceVersion: resourceVersion, + expiresAt: c.now().Add(c.ttl), + } +} + +// gc evicts every expired entry from the 
cache. Called from a periodic +// manager.Runnable so entries for deleted CRs (whose UIDs never recur in +// subsequent list pages) don't accumulate without bound. +func (c *migrationCache) gc() { + c.mu.Lock() + defer c.mu.Unlock() + now := c.now() + for k, e := range c.entries { + if now.After(e.expiresAt) { + delete(c.entries, k) + } + } +} + +func (*migrationCache) key(crdName string, uid apitypes.UID) string { + return crdName + "|" + string(uid) +} diff --git a/cmd/thv-operator/main.go b/cmd/thv-operator/main.go index 85073a385b..a47e7e1027 100644 --- a/cmd/thv-operator/main.go +++ b/cmd/thv-operator/main.go @@ -17,6 +17,7 @@ import ( "strings" "github.com/go-logr/logr" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" @@ -43,9 +44,10 @@ var ( // Feature flags for controller groups const ( - featureServer = "ENABLE_SERVER" - featureRegistry = "ENABLE_REGISTRY" - featureVMCP = "ENABLE_VMCP" + featureServer = "ENABLE_SERVER" + featureRegistry = "ENABLE_REGISTRY" + featureVMCP = "ENABLE_VMCP" + featureStorageVersionMigrator = "ENABLE_STORAGE_VERSION_MIGRATOR" ) // controllerDependencies maps each controller group to its required dependencies @@ -57,6 +59,7 @@ func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) utilruntime.Must(mcpv1alpha1.AddToScheme(scheme)) utilruntime.Must(mcpv1beta1.AddToScheme(scheme)) + utilruntime.Must(apiextensionsv1.AddToScheme(scheme)) //+kubebuilder:scaffold:scheme } @@ -133,12 +136,14 @@ func setupControllersAndWebhooks(mgr ctrl.Manager) error { enableServer := isFeatureEnabled(featureServer, true) enableRegistry := isFeatureEnabled(featureRegistry, true) enableVMCP := isFeatureEnabled(featureVMCP, true) + enableStorageVersionMigrator := isFeatureEnabled(featureStorageVersionMigrator, true) // Track enabled features for dependency checking enabledFeatures := 
map[string]bool{ - featureServer: enableServer, - featureRegistry: enableRegistry, - featureVMCP: enableVMCP, + featureServer: enableServer, + featureRegistry: enableRegistry, + featureVMCP: enableVMCP, + featureStorageVersionMigrator: enableStorageVersionMigrator, } // Check dependencies and log warnings for missing dependencies @@ -186,10 +191,35 @@ func setupControllersAndWebhooks(mgr ctrl.Manager) error { setupLog.Info("ENABLE_VMCP is disabled, skipping Virtual MCP controllers and webhooks") } + // Set up StorageVersionMigrator controller + if enabledFeatures[featureStorageVersionMigrator] { + if err := setupStorageVersionMigrator(mgr); err != nil { + return err + } + } else { + setupLog.Info("ENABLE_STORAGE_VERSION_MIGRATOR is disabled, skipping StorageVersionMigrator controller") + } + //+kubebuilder:scaffold:builder return nil } +// setupStorageVersionMigrator sets up the StorageVersionMigrator controller, +// which reconciles status.storedVersions on opted-in toolhive.stacklok.dev CRDs +// down to [] so future releases can drop deprecated +// versions from spec.versions without orphaning etcd objects. +func setupStorageVersionMigrator(mgr ctrl.Manager) error { + if err := (&controllers.StorageVersionMigratorReconciler{ + Client: mgr.GetClient(), + APIReader: mgr.GetAPIReader(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("storageversionmigrator-controller"), + }).SetupWithManager(mgr); err != nil { + return fmt.Errorf("unable to create controller StorageVersionMigrator: %w", err) + } + return nil +} + // setupGroupRefFieldIndexes sets up field indexing for spec.groupRef on all resource types // that can reference an MCPGroup. This enables efficient lookups by groupRef in controllers. 
func setupGroupRefFieldIndexes(mgr ctrl.Manager) error { diff --git a/cmd/thv-operator/test-integration/storageversionmigrator/controller_test.go b/cmd/thv-operator/test-integration/storageversionmigrator/controller_test.go new file mode 100644 index 0000000000..35c318a27d --- /dev/null +++ b/cmd/thv-operator/test-integration/storageversionmigrator/controller_test.go @@ -0,0 +1,712 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package storageversionmigrator + +import ( + "context" + "fmt" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/stacklok/toolhive/cmd/thv-operator/controllers" +) + +const ( + toolhiveGroup = controllers.ToolhiveGroup + migrateLabel = controllers.AutoMigrateLabel + migrateValue = controllers.AutoMigrateValue +) + +// crdSpec describes a test CRD fixture. 
+type crdSpec struct { + Name string + Group string + Kind string + ListKind string + Plural string + Singular string + Versions []versionSpec + Labelled bool + HasStatusOnStored bool +} + +type versionSpec struct { + Name string + Served bool + Storage bool +} + +func buildCRD(s crdSpec) *apiextensionsv1.CustomResourceDefinition { + versions := make([]apiextensionsv1.CustomResourceDefinitionVersion, 0, len(s.Versions)) + for _, v := range s.Versions { + cdv := apiextensionsv1.CustomResourceDefinitionVersion{ + Name: v.Name, + Served: v.Served, + Storage: v.Storage, + Schema: &apiextensionsv1.CustomResourceValidation{ + OpenAPIV3Schema: &apiextensionsv1.JSONSchemaProps{ + Type: "object", + Properties: map[string]apiextensionsv1.JSONSchemaProps{ + "spec": { + Type: "object", + XPreserveUnknownFields: ptrBool(true), + }, + "status": { + Type: "object", + XPreserveUnknownFields: ptrBool(true), + }, + }, + }, + }, + } + if v.Storage && s.HasStatusOnStored { + cdv.Subresources = &apiextensionsv1.CustomResourceSubresources{ + Status: &apiextensionsv1.CustomResourceSubresourceStatus{}, + } + } + versions = append(versions, cdv) + } + + crd := &apiextensionsv1.CustomResourceDefinition{ + ObjectMeta: metav1.ObjectMeta{Name: s.Name}, + Spec: apiextensionsv1.CustomResourceDefinitionSpec{ + Group: s.Group, + Names: apiextensionsv1.CustomResourceDefinitionNames{ + Kind: s.Kind, + ListKind: s.ListKind, + Plural: s.Plural, + Singular: s.Singular, + }, + Scope: apiextensionsv1.NamespaceScoped, + Versions: versions, + }, + } + if s.Labelled { + crd.Labels = map[string]string{migrateLabel: migrateValue} + } + return crd +} + +func ptrBool(b bool) *bool { return &b } + +// installCRD creates a CRD and waits for the apiserver to publish it so +// unstructured CR creates of that kind will succeed. 
+func installCRD(c crdSpec) { + crd := buildCRD(c) + Expect(k8sClient.Create(ctx, crd)).To(Succeed()) + + Eventually(func() bool { + live := &apiextensionsv1.CustomResourceDefinition{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: c.Name}, live); err != nil { + return false + } + for _, cond := range live.Status.Conditions { + if cond.Type == apiextensionsv1.Established && cond.Status == apiextensionsv1.ConditionTrue { + return true + } + } + return false + }, time.Second*10, time.Millisecond*200).Should(BeTrue(), "CRD %s never became Established", c.Name) +} + +func deleteCRD(name string) { + crd := &apiextensionsv1.CustomResourceDefinition{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: name}, crd); err != nil { + if apierrors.IsNotFound(err) { + return + } + Fail(fmt.Sprintf("get CRD %s before delete: %v", name, err)) + } + Expect(k8sClient.Delete(ctx, crd)).To(Succeed()) + Eventually(func() bool { + return apierrors.IsNotFound(k8sClient.Get(ctx, types.NamespacedName{Name: name}, &apiextensionsv1.CustomResourceDefinition{})) + }, time.Second*30, time.Millisecond*200).Should(BeTrue(), "CRD %s never fully deleted", name) +} + +// setStoredVersions overwrites status.storedVersions, simulating a historical +// state where objects were stored at earlier versions. +func setStoredVersions(crdName string, versions []string) { + Eventually(func() error { + crd := &apiextensionsv1.CustomResourceDefinition{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: crdName}, crd); err != nil { + return err + } + orig := crd.DeepCopy() + crd.Status.StoredVersions = versions + return k8sClient.Status().Patch(ctx, crd, client.MergeFrom(orig)) + }, time.Second*5, time.Millisecond*100).Should(Succeed()) +} + +func getStoredVersions(crdName string) []string { + crd := &apiextensionsv1.CustomResourceDefinition{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: crdName}, crd)).To(Succeed()) + return append([]string{}, crd.Status.StoredVersions...) 
+} + +// createCRs creates count CRs in the default namespace with the given kind +// and a name derived from basename. Returns the created objects so tests can +// assert on them post-reconcile. +func createCRs(gvk schema.GroupVersionKind, basename string, count int) []*unstructured.Unstructured { + out := make([]*unstructured.Unstructured, 0, count) + for i := 0; i < count; i++ { + u := &unstructured.Unstructured{} + u.SetGroupVersionKind(gvk) + u.SetNamespace("default") + u.SetName(fmt.Sprintf("%s-%d", basename, i)) + Expect(unstructured.SetNestedField(u.Object, "placeholder", "spec", "marker")).To(Succeed()) + Expect(k8sClient.Create(ctx, u)).To(Succeed()) + out = append(out, u) + } + return out +} + +// Note on "did the re-store actually fire" verification: +// +// The controller does a plain Get + Update on each CR. When the CR is already +// at the current storage version the apiserver freshly re-encodes the request +// body, sees it matches etcd byte-for-byte, and elides the write — that's +// correct behaviour, not a controller bug, and it means the per-CR RV does +// not bump for an already-clean CR. The dedicated cross-version test +// ("re-encodes CRs that are stored at a prior storage version") proves the +// migration mechanism actually works for objects stored at older versions: +// it stores a CR at v1alpha1, flips storage to v1beta1, and asserts the CR's +// RV bumps after reconcile. +// +// The pagination test additionally verifies the continue-token loop via a +// list-call counter, and the partial-failure test asserts storedVersions is +// not trimmed when any CR re-store fails. + +// newReconciler constructs a StorageVersionMigratorReconciler for a single +// test. Every test has its own instance so the migration cache doesn't leak +// between tests and state is fully explicit. 
+func newReconciler() *controllers.StorageVersionMigratorReconciler { + return &controllers.StorageVersionMigratorReconciler{ + Client: k8sClient, + APIReader: k8sClient, + Scheme: k8sClient.Scheme(), + Recorder: &noopRecorder{}, + } +} + +// reconcile invokes the reconciler once for the given CRD and returns the +// result and error directly — tests assert on both. +func reconcile(r *controllers.StorageVersionMigratorReconciler, crdName string) (ctrl.Result, error) { + return r.Reconcile(ctx, ctrl.Request{NamespacedName: types.NamespacedName{Name: crdName}}) +} + +var crdCounter int + +func uniqueSuffix() string { + crdCounter++ + return fmt.Sprintf("t%d", crdCounter) +} + +// ------------------------------------------------------------------ +// Tests +// ------------------------------------------------------------------ + +var _ = Describe("StorageVersionMigrator", func() { + Describe("Reconcile", func() { + + It("is a noop when storedVersions is already [storageVersion] and only one version is served", func() { + suf := uniqueSuffix() + spec := crdSpec{ + Name: "noops" + suf + "." + toolhiveGroup, + Group: toolhiveGroup, + Kind: "Noop" + suf, + ListKind: "Noop" + suf + "List", + Plural: "noops" + suf, + Singular: "noop" + suf, + Labelled: true, + HasStatusOnStored: true, + Versions: []versionSpec{ + {Name: "v1beta1", Served: true, Storage: true}, + }, + } + installCRD(spec) + DeferCleanup(func() { deleteCRD(spec.Name) }) + + // envtest leaves storedVersions empty until a write happens. + // Seed it explicitly so the isMigrationNeeded check sees the + // "clean" state we want to exercise. + setStoredVersions(spec.Name, []string{"v1beta1"}) + + _, err := reconcile(newReconciler(), spec.Name) + Expect(err).NotTo(HaveOccurred()) + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1beta1"})) + }) + + It("migrates storedVersions from [v1alpha1,v1beta1] to [v1beta1]", func() { + suf := uniqueSuffix() + spec := crdSpec{ + Name: "happies" + suf + "." 
+ toolhiveGroup, + Group: toolhiveGroup, + Kind: "Happy" + suf, + ListKind: "Happy" + suf + "List", + Plural: "happies" + suf, + Singular: "happy" + suf, + Labelled: true, + HasStatusOnStored: true, + Versions: []versionSpec{ + {Name: "v1alpha1", Served: true, Storage: false}, + {Name: "v1beta1", Served: true, Storage: true}, + }, + } + installCRD(spec) + DeferCleanup(func() { deleteCRD(spec.Name) }) + + // The CRs created here are already stored at v1beta1 (the + // current storage version), so the apiserver will elide each + // per-CR Update — its freshly-encoded body matches etcd + // byte-for-byte. That's correct apiserver behaviour, not a + // controller bug, so this test does NOT assert per-CR + // resourceVersion bumps. The cross-version test below proves + // the migration mechanism actually re-encodes objects that were + // stored at an older apiVersion. + createCRs( + schema.GroupVersionKind{Group: spec.Group, Version: "v1beta1", Kind: spec.Kind}, + "obj-"+suf, 3, + ) + + setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"}) + + _, err := reconcile(newReconciler(), spec.Name) + Expect(err).NotTo(HaveOccurred()) + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1beta1"})) + }) + + // Load-bearing proof of the migration mechanism: a CR stored at + // v1alpha1, after the storage version has flipped to v1beta1, must + // have its resourceVersion bumped by reconcile — that's the + // observable evidence the apiserver actually re-encoded the etcd + // document. See the upstream confirmation at + // https://github.com/kubernetes-sigs/kube-storage-version-migrator/issues/65. + It("re-encodes CRs that are stored at a prior storage version", func() { + suf := uniqueSuffix() + crdName := "crossvers" + suf + "." 
+ toolhiveGroup + kind := "CrossVer" + suf + plural := "crossvers" + suf + + versionSchema := func() *apiextensionsv1.CustomResourceValidation { + return &apiextensionsv1.CustomResourceValidation{ + OpenAPIV3Schema: &apiextensionsv1.JSONSchemaProps{ + Type: "object", + Properties: map[string]apiextensionsv1.JSONSchemaProps{ + "spec": {Type: "object", XPreserveUnknownFields: ptrBool(true)}, + "status": {Type: "object", XPreserveUnknownFields: ptrBool(true)}, + }, + }, + } + } + + // Step 1: install CRD with v1alpha1 as the storage version so + // CRs created next are written to etcd as v1alpha1 bytes. + crd := &apiextensionsv1.CustomResourceDefinition{ + ObjectMeta: metav1.ObjectMeta{ + Name: crdName, + Labels: map[string]string{migrateLabel: migrateValue}, + }, + Spec: apiextensionsv1.CustomResourceDefinitionSpec{ + Group: toolhiveGroup, + Names: apiextensionsv1.CustomResourceDefinitionNames{ + Kind: kind, + ListKind: kind + "List", + Plural: plural, + Singular: "crossver" + suf, + }, + Scope: apiextensionsv1.NamespaceScoped, + Versions: []apiextensionsv1.CustomResourceDefinitionVersion{ + {Name: "v1alpha1", Served: true, Storage: true, Schema: versionSchema()}, + {Name: "v1beta1", Served: true, Storage: false, Schema: versionSchema()}, + }, + }, + } + Expect(k8sClient.Create(ctx, crd)).To(Succeed()) + DeferCleanup(func() { deleteCRD(crdName) }) + + Eventually(func() bool { + live := &apiextensionsv1.CustomResourceDefinition{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: crdName}, live); err != nil { + return false + } + for _, c := range live.Status.Conditions { + if c.Type == apiextensionsv1.Established && c.Status == apiextensionsv1.ConditionTrue { + return true + } + } + return false + }, time.Second*10, time.Millisecond*200).Should(BeTrue()) + + // Step 2: create one CR — etcd writes apiVersion: v1alpha1 bytes. 
+ cr := createCRs( + schema.GroupVersionKind{Group: toolhiveGroup, Version: "v1alpha1", Kind: kind}, + "obj-"+suf, 1, + )[0] + + // Step 3: flip storage to v1beta1. + Eventually(func() error { + live := &apiextensionsv1.CustomResourceDefinition{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: crdName}, live); err != nil { + return err + } + orig := live.DeepCopy() + for i := range live.Spec.Versions { + live.Spec.Versions[i].Storage = (live.Spec.Versions[i].Name == "v1beta1") + } + return k8sClient.Patch(ctx, live, client.MergeFrom(orig)) + }, time.Second*10, time.Millisecond*200).Should(Succeed()) + + // Confirm the storage flip settled before proceeding. + Eventually(func() bool { + live := &apiextensionsv1.CustomResourceDefinition{} + if err := k8sClient.Get(ctx, types.NamespacedName{Name: crdName}, live); err != nil { + return false + } + for _, v := range live.Spec.Versions { + if v.Name == "v1beta1" && v.Storage { + return true + } + } + return false + }, time.Second*10, time.Millisecond*200).Should(BeTrue()) + + // Step 4: storedVersions reflects the historical v1alpha1 entry. + setStoredVersions(crdName, []string{"v1alpha1", "v1beta1"}) + + // Step 5: snapshot RV before reconcile. + preLive := &unstructured.Unstructured{} + preLive.SetGroupVersionKind(schema.GroupVersionKind{ + Group: toolhiveGroup, Version: "v1beta1", Kind: kind, + }) + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(cr), preLive)).To(Succeed()) + preRV := preLive.GetResourceVersion() + + // Step 6: reconcile. + _, err := reconcile(newReconciler(), crdName) + Expect(err).NotTo(HaveOccurred()) + + // Step 7: storedVersions trimmed. + Expect(getStoredVersions(crdName)).To(Equal([]string{"v1beta1"})) + + // Step 8: empirical proof — RV bumped because the cross-version + // Update actually wrote etcd. 
+ postLive := &unstructured.Unstructured{} + postLive.SetGroupVersionKind(preLive.GroupVersionKind()) + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(cr), postLive)).To(Succeed()) + Expect(postLive.GetResourceVersion()).NotTo(Equal(preRV), + "CR %s/%s resourceVersion did not bump (pre=%s post=%s) — cross-version re-store did not write to etcd", + cr.GetNamespace(), cr.GetName(), preRV, postLive.GetResourceVersion()) + }) + + It("skips CRDs in foreign API groups", func() { + suf := uniqueSuffix() + spec := crdSpec{ + Name: "outsiders" + suf + ".example.com", + Group: "example.com", + Kind: "Outsider" + suf, + ListKind: "Outsider" + suf + "List", + Plural: "outsiders" + suf, + Singular: "outsider" + suf, + Labelled: true, + HasStatusOnStored: true, + Versions: []versionSpec{ + {Name: "v1alpha1", Served: true, Storage: false}, + {Name: "v1beta1", Served: true, Storage: true}, + }, + } + installCRD(spec) + DeferCleanup(func() { deleteCRD(spec.Name) }) + + setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"}) + + _, err := reconcile(newReconciler(), spec.Name) + Expect(err).NotTo(HaveOccurred()) + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1alpha1", "v1beta1"}), + "storedVersions must be untouched for foreign-group CRDs") + }) + + It("skips toolhive CRDs missing the opt-in label", func() { + suf := uniqueSuffix() + spec := crdSpec{ + Name: "unlabelled" + suf + "." 
+ toolhiveGroup, + Group: toolhiveGroup, + Kind: "Unlabelled" + suf, + ListKind: "Unlabelled" + suf + "List", + Plural: "unlabelled" + suf, + Singular: "unlabelled" + suf, + Labelled: false, + HasStatusOnStored: true, + Versions: []versionSpec{ + {Name: "v1alpha1", Served: true, Storage: false}, + {Name: "v1beta1", Served: true, Storage: true}, + }, + } + installCRD(spec) + DeferCleanup(func() { deleteCRD(spec.Name) }) + + setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"}) + + _, err := reconcile(newReconciler(), spec.Name) + Expect(err).NotTo(HaveOccurred()) + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1alpha1", "v1beta1"}), + "storedVersions must be untouched for unlabelled CRDs") + }) + + It("handles pagination across multiple list pages", func() { + suf := uniqueSuffix() + spec := crdSpec{ + Name: "paginated" + suf + "." + toolhiveGroup, + Group: toolhiveGroup, + Kind: "Paginated" + suf, + ListKind: "Paginated" + suf + "List", + Plural: "paginated" + suf, + Singular: "paginated" + suf, + Labelled: true, + HasStatusOnStored: true, + Versions: []versionSpec{ + {Name: "v1alpha1", Served: true, Storage: false}, + {Name: "v1beta1", Served: true, Storage: true}, + }, + } + installCRD(spec) + DeferCleanup(func() { deleteCRD(spec.Name) }) + + // Seven CRs with PageSize=3 forces three pages (3+3+1) and + // exercises the continue-token loop far more cheaply than 501 + // writes against envtest. + createCRs( + schema.GroupVersionKind{Group: spec.Group, Version: "v1beta1", Kind: spec.Kind}, + "obj-"+suf, 7, + ) + setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"}) + + // Wrap APIReader to count List calls for this kind. This is the + // only direct proof that the continue-token loop actually ran — + // metadata-only SSAs don't leave a managedFields fingerprint. 
+ counting := &countingAPIReader{Reader: k8sClient, kind: spec.Kind} + r := &controllers.StorageVersionMigratorReconciler{ + Client: k8sClient, + APIReader: counting, + Scheme: k8sClient.Scheme(), + Recorder: &noopRecorder{}, + PageSize: 3, + } + _, err := reconcile(r, spec.Name) + Expect(err).NotTo(HaveOccurred()) + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1beta1"})) + + // 7 CRs with PageSize=3 ⇒ 3 list calls (pages of 3+3+1). + Expect(counting.listCalls).To(BeNumerically(">=", 3), + "pagination should have triggered at least 3 list calls for 7 CRs at pageSize=3; got %d", + counting.listCalls) + }) + + It("does not touch storedVersions when a CR re-store fails", func() { + suf := uniqueSuffix() + spec := crdSpec{ + Name: "failures" + suf + "." + toolhiveGroup, + Group: toolhiveGroup, + Kind: "Failure" + suf, + ListKind: "Failure" + suf + "List", + Plural: "failures" + suf, + Singular: "failure" + suf, + Labelled: true, + HasStatusOnStored: true, + Versions: []versionSpec{ + {Name: "v1alpha1", Served: true, Storage: false}, + {Name: "v1beta1", Served: true, Storage: true}, + }, + } + installCRD(spec) + DeferCleanup(func() { deleteCRD(spec.Name) }) + + crs := createCRs( + schema.GroupVersionKind{Group: spec.Group, Version: "v1beta1", Kind: spec.Kind}, + "obj-"+suf, 3, + ) + setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"}) + + failureTarget := client.ObjectKeyFromObject(crs[0]) + failing := &failingUpdateClient{ + Client: k8sClient, + errFn: func(key client.ObjectKey) error { + if key == failureTarget { + return fmt.Errorf("injected update failure for %s", key) + } + return nil + }, + } + r := &controllers.StorageVersionMigratorReconciler{ + Client: failing, + APIReader: k8sClient, + Scheme: k8sClient.Scheme(), + Recorder: &noopRecorder{}, + } + _, err := reconcile(r, spec.Name) + Expect(err).To(HaveOccurred(), "reconcile should surface the injected failure") + Expect(err.Error()).To(ContainSubstring(failureTarget.Name)) + + // Critical 
contract: storedVersions must NOT be trimmed when any + // CR re-store failed. Otherwise the next release's v1alpha1 + // removal would orphan the un-migrated object in etcd. + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1alpha1", "v1beta1"})) + }) + + It("leaves storedVersions untouched when a CR re-store hits a Conflict, then trims on retry", func() { + suf := uniqueSuffix() + spec := crdSpec{ + Name: "conflicts" + suf + "." + toolhiveGroup, + Group: toolhiveGroup, + Kind: "Conflict" + suf, + ListKind: "Conflict" + suf + "List", + Plural: "conflicts" + suf, + Singular: "conflict" + suf, + Labelled: true, + HasStatusOnStored: true, + Versions: []versionSpec{ + {Name: "v1alpha1", Served: true, Storage: false}, + {Name: "v1beta1", Served: true, Storage: true}, + }, + } + installCRD(spec) + DeferCleanup(func() { deleteCRD(spec.Name) }) + + crs := createCRs( + schema.GroupVersionKind{Group: spec.Group, Version: "v1beta1", Kind: spec.Kind}, + "obj-"+suf, 2, + ) + setStoredVersions(spec.Name, []string{"v1alpha1", "v1beta1"}) + + conflictTarget := client.ObjectKeyFromObject(crs[0]) + injectConflict := true + gr := schema.GroupResource{Group: spec.Group, Resource: spec.Plural} + conflicting := &failingUpdateClient{ + Client: k8sClient, + errFn: func(key client.ObjectKey) error { + if injectConflict && key == conflictTarget { + return apierrors.NewConflict(gr, key.Name, + fmt.Errorf("injected conflict")) + } + return nil + }, + } + r := &controllers.StorageVersionMigratorReconciler{ + Client: conflicting, + APIReader: k8sClient, + Scheme: k8sClient.Scheme(), + Recorder: &noopRecorder{}, + } + + // First pass: conflict swallowed at the per-CR level, but the + // function-level conflict counter trips errMigrationRetriedDueToConflicts + // so storedVersions is left untouched. 
+ _, err := reconcile(r, spec.Name) + Expect(err).To(HaveOccurred(), + "reconcile must return an error when a Conflict was swallowed") + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1alpha1", "v1beta1"}), + "storedVersions must not be trimmed on a pass with any swallowed Conflict") + + // Drop the injection and retry. The cache may have absorbed the + // non-conflicting CR's RV from the first pass — that's fine, the + // conflicting one was never recorded in the cache so it'll be + // re-attempted, succeed, and let the storedVersions patch fire. + injectConflict = false + _, err = reconcile(r, spec.Name) + Expect(err).NotTo(HaveOccurred()) + Expect(getStoredVersions(spec.Name)).To(Equal([]string{"v1beta1"})) + }) + }) +}) + +// ------------------------------------------------------------------ +// Test doubles +// ------------------------------------------------------------------ + +// failingUpdateClient wraps a real client.Client and intercepts Update (and +// Status().Update) for specific object keys. The controller's restoreOne goes +// through Update — so this wrapper is how we inject failures and conflicts. +// +// errFn returns the error to inject for a given key, or nil to let the call +// pass through to the wrapped client. Returning a non-nil error short-circuits +// the call so the underlying object is not modified. +type failingUpdateClient struct { + client.Client + errFn func(key client.ObjectKey) error +} + +func (f *failingUpdateClient) Update(ctx context.Context, obj client.Object, opts ...client.UpdateOption) error { + if err := f.errFn(client.ObjectKeyFromObject(obj)); err != nil { + return err + } + return f.Client.Update(ctx, obj, opts...) 
+} + +func (f *failingUpdateClient) Status() client.SubResourceWriter { + return &failingUpdateStatus{ + inner: f.Client.Status(), + errFn: f.errFn, + } +} + +type failingUpdateStatus struct { + inner client.SubResourceWriter + errFn func(key client.ObjectKey) error +} + +func (s *failingUpdateStatus) Create(ctx context.Context, obj client.Object, sub client.Object, opts ...client.SubResourceCreateOption) error { + return s.inner.Create(ctx, obj, sub, opts...) +} + +func (s *failingUpdateStatus) Update(ctx context.Context, obj client.Object, opts ...client.SubResourceUpdateOption) error { + if err := s.errFn(client.ObjectKeyFromObject(obj)); err != nil { + return err + } + return s.inner.Update(ctx, obj, opts...) +} + +func (s *failingUpdateStatus) Patch(ctx context.Context, obj client.Object, patch client.Patch, opts ...client.SubResourcePatchOption) error { + return s.inner.Patch(ctx, obj, patch, opts...) +} + +func (s *failingUpdateStatus) Apply(ctx context.Context, obj runtime.ApplyConfiguration, opts ...client.SubResourceApplyOption) error { + return s.inner.Apply(ctx, obj, opts...) +} + +// noopRecorder is a minimal EventRecorder for direct-Reconcile tests. +type noopRecorder struct{} + +func (*noopRecorder) Event(_ runtime.Object, _, _, _ string) {} +func (*noopRecorder) Eventf(_ runtime.Object, _, _, _ string, _ ...any) {} +func (*noopRecorder) AnnotatedEventf(_ runtime.Object, _ map[string]string, _, _, _ string, _ ...any) { +} + +// countingAPIReader wraps a client.Reader and records how many List calls +// targeted a given kind. Used by the pagination test to verify the +// continue-token loop ran as expected. +type countingAPIReader struct { + client.Reader + kind string + listCalls int +} + +func (c *countingAPIReader) List(ctx context.Context, list client.ObjectList, opts ...client.ListOption) error { + if u, ok := list.(*unstructured.UnstructuredList); ok { + // ListKind is "List"; match on the configured kind prefix. 
+ if u.GetKind() == c.kind+"List" { + c.listCalls++ + } + } + return c.Reader.List(ctx, list, opts...) +} diff --git a/cmd/thv-operator/test-integration/storageversionmigrator/suite_test.go b/cmd/thv-operator/test-integration/storageversionmigrator/suite_test.go new file mode 100644 index 0000000000..0e94586912 --- /dev/null +++ b/cmd/thv-operator/test-integration/storageversionmigrator/suite_test.go @@ -0,0 +1,83 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +// Package storageversionmigrator contains envtest-backed integration tests +// for the StorageVersionMigrator controller. +// +// The suite does NOT pre-install any CRDs — each test constructs and installs +// the exact CRD it needs, which keeps scenarios independent and lets us +// exercise edge cases (foreign groups, missing status subresource, etc.) that +// wouldn't be possible with the real toolhive CRD manifests. +// +// The suite also does NOT start a controller manager. Each test constructs its +// own reconciler and calls Reconcile directly. This is more deterministic than +// manager-driven tests (no Eventually() races against the background +// controller) and lets individual tests inject custom clients to exercise +// failure paths. +package storageversionmigrator + +import ( + "context" + "testing" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + "go.uber.org/zap/zapcore" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/envtest" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" +) + +var ( + cfg *rest.Config + k8sClient client.Client + testEnv *envtest.Environment + ctx context.Context + cancel context.CancelFunc +) + +func TestStorageVersionMigrator(t *testing.T) { + t.Parallel() + RegisterFailHandler(Fail) + + suiteConfig, reporterConfig := GinkgoConfiguration() + reporterConfig.Verbose = false + reporterConfig.VeryVerbose = false + reporterConfig.FullTrace = false + + RunSpecs(t, "StorageVersionMigrator Controller Integration Test Suite", suiteConfig, reporterConfig) +} + +var _ = BeforeSuite(func() { + logLevel := zapcore.ErrorLevel + logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true), zap.Level(logLevel))) + + ctx, cancel = context.WithCancel(context.TODO()) + + By("bootstrapping envtest") + testEnv = &envtest.Environment{ + ErrorIfCRDPathMissing: false, // tests install CRDs on demand + } + + var err error + cfg, err = testEnv.Start() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg).NotTo(BeNil()) + + utilruntime.Must(apiextensionsv1.AddToScheme(scheme.Scheme)) + + k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) + Expect(err).NotTo(HaveOccurred()) + Expect(k8sClient).NotTo(BeNil()) +}) + +var _ = AfterSuite(func() { + By("tearing down envtest") + cancel() + Expect(testEnv.Stop()).NotTo(HaveOccurred()) +}) diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_embeddingservers.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_embeddingservers.yaml index ab99253a70..cf1790b3a5 100644 --- 
a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_embeddingservers.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_embeddingservers.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: embeddingservers.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml index 7f9a3d2a7f..2eaf68493d 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpexternalauthconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpgroups.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpgroups.yaml index 4270eb0334..1be61dcefb 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpgroups.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpgroups.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpgroups.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpoidcconfigs.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpoidcconfigs.yaml index 8890392814..41aa7225a9 100644 --- 
a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpoidcconfigs.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpoidcconfigs.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpoidcconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpregistries.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpregistries.yaml index fea184018f..4c4cac98a6 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpregistries.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpregistries.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpregistries.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml index 915417ddd9..20ccacbb34 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpremoteproxies.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpremoteproxies.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpserverentries.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpserverentries.yaml index 78c2523eca..c7a9896551 100644 --- 
a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpserverentries.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpserverentries.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpserverentries.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpservers.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpservers.yaml index 44eb6f7a02..a96e309653 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpservers.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpservers.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpservers.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptelemetryconfigs.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptelemetryconfigs.yaml index 9a61add0cb..2262801a88 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptelemetryconfigs.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptelemetryconfigs.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcptelemetryconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptoolconfigs.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptoolconfigs.yaml index 719026f13d..a5256f7925 100644 --- 
a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptoolconfigs.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcptoolconfigs.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcptoolconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml index a9c17deff5..bd13d9ea55 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: virtualmcpcompositetooldefinitions.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml index 411d84cdc3..e35cb99e0f 100644 --- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml +++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml @@ -4,6 +4,8 @@ kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: virtualmcpservers.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_embeddingservers.yaml 
b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_embeddingservers.yaml index 96ed64805a..360f2ac179 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_embeddingservers.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_embeddingservers.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: embeddingservers.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml index ea6684ae6c..33b6b2eb60 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpexternalauthconfigs.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpexternalauthconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpgroups.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpgroups.yaml index 5a072011b2..929ca3ef82 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpgroups.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpgroups.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpgroups.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpoidcconfigs.yaml 
b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpoidcconfigs.yaml index 280b14bea2..14b4fa41f9 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpoidcconfigs.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpoidcconfigs.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpoidcconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpregistries.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpregistries.yaml index 6e55bc9de4..9fbf917813 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpregistries.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpregistries.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpregistries.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml index 79154701fb..cb6acd389a 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpremoteproxies.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpremoteproxies.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpserverentries.yaml 
b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpserverentries.yaml index 2cff1d9a86..961d0a5644 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpserverentries.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpserverentries.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpserverentries.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpservers.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpservers.yaml index a248508d41..6ee8d6a079 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpservers.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpservers.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcpservers.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptelemetryconfigs.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptelemetryconfigs.yaml index ca666e2693..d1ad1e4495 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptelemetryconfigs.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptelemetryconfigs.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcptelemetryconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptoolconfigs.yaml 
b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptoolconfigs.yaml index 7df79aba95..cbe2794c4d 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptoolconfigs.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcptoolconfigs.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: mcptoolconfigs.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml index 347455410d..50694d18fd 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpcompositetooldefinitions.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: virtualmcpcompositetooldefinitions.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml index 9929904d14..407b8ea1e0 100644 --- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml +++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml @@ -7,6 +7,8 @@ metadata: helm.sh/resource-policy: keep {{- end }} controller-gen.kubebuilder.io/version: v0.17.3 + labels: + toolhive.stacklok.dev/auto-migrate-storage-version: "true" name: virtualmcpservers.toolhive.stacklok.dev spec: group: toolhive.stacklok.dev diff --git 
a/deploy/charts/operator/README.md b/deploy/charts/operator/README.md index 04fa310fe5..0757884e37 100644 --- a/deploy/charts/operator/README.md +++ b/deploy/charts/operator/README.md @@ -46,7 +46,7 @@ The command removes all the Kubernetes components associated with the chart and |-----|------|---------|-------------| | fullnameOverride | string | `"toolhive-operator"` | Provide a fully-qualified name override for resources | | nameOverride | string | `""` | Override the name of the chart | -| operator | object | `{"affinity":{},"autoscaling":{"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80},"containerSecurityContext":{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"readOnlyRootFilesystem":true,"runAsNonRoot":true,"runAsUser":1000,"seccompProfile":{"type":"RuntimeDefault"}},"env":[],"features":{"experimental":false,"registry":true,"server":true,"virtualMCP":true},"gc":{"gogc":75,"gomemlimit":"110MiB"},"image":"ghcr.io/stacklok/toolhive/operator:v0.23.1","imagePullPolicy":"IfNotPresent","imagePullSecrets":[],"leaderElectionRole":{"binding":{"name":"toolhive-operator-leader-election-rolebinding"},"name":"toolhive-operator-leader-election-role","rules":[{"apiGroups":[""],"resources":["configmaps"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":["coordination.k8s.io"],"resources":["leases"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":[""],"resources":["events"],"verbs":["create","patch"]}]},"livenessProbe":{"httpGet":{"path":"/healthz","port":"health"},"initialDelaySeconds":15,"periodSeconds":20},"nodeSelector":{},"podAnnotations":{},"podLabels":{},"podSecurityContext":{"runAsNonRoot":true},"ports":[{"containerPort":8080,"name":"metrics","protocol":"TCP"},{"containerPort":8081,"name":"health","protocol":"TCP"}],"proxyHost":"0.0.0.0","rbac":{"allowedNamespaces":[],"scope":"cluster"},"readinessProbe":{"httpGet":{"path":"/readyz","port":"health"},"i
nitialDelaySeconds":5,"periodSeconds":10},"replicaCount":1,"resources":{"limits":{"cpu":"500m","memory":"128Mi"},"requests":{"cpu":"10m","memory":"64Mi"}},"serviceAccount":{"annotations":{},"automountServiceAccountToken":true,"create":true,"labels":{},"name":"toolhive-operator"},"tolerations":[],"toolhiveRunnerImage":"ghcr.io/stacklok/toolhive/proxyrunner:v0.23.1","vmcpImage":"ghcr.io/stacklok/toolhive/vmcp:v0.23.1","volumeMounts":[],"volumes":[]}` | All values for the operator deployment and associated resources | +| operator | object | `{"affinity":{},"autoscaling":{"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80},"containerSecurityContext":{"allowPrivilegeEscalation":false,"capabilities":{"drop":["ALL"]},"readOnlyRootFilesystem":true,"runAsNonRoot":true,"runAsUser":1000,"seccompProfile":{"type":"RuntimeDefault"}},"env":[],"features":{"experimental":false,"registry":true,"server":true,"storageVersionMigrator":true,"virtualMCP":true},"gc":{"gogc":75,"gomemlimit":"110MiB"},"image":"ghcr.io/stacklok/toolhive/operator:v0.23.1","imagePullPolicy":"IfNotPresent","imagePullSecrets":[],"leaderElectionRole":{"binding":{"name":"toolhive-operator-leader-election-rolebinding"},"name":"toolhive-operator-leader-election-role","rules":[{"apiGroups":[""],"resources":["configmaps"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":["coordination.k8s.io"],"resources":["leases"],"verbs":["get","list","watch","create","update","patch","delete"]},{"apiGroups":[""],"resources":["events"],"verbs":["create","patch"]}]},"livenessProbe":{"httpGet":{"path":"/healthz","port":"health"},"initialDelaySeconds":15,"periodSeconds":20},"nodeSelector":{},"podAnnotations":{},"podLabels":{},"podSecurityContext":{"runAsNonRoot":true},"ports":[{"containerPort":8080,"name":"metrics","protocol":"TCP"},{"containerPort":8081,"name":"health","protocol":"TCP"}],"proxyHost":"0.0.0.0","rbac":{"allowedNamespaces":[],"scope":"cluster"},"readinessP
robe":{"httpGet":{"path":"/readyz","port":"health"},"initialDelaySeconds":5,"periodSeconds":10},"replicaCount":1,"resources":{"limits":{"cpu":"500m","memory":"128Mi"},"requests":{"cpu":"10m","memory":"64Mi"}},"serviceAccount":{"annotations":{},"automountServiceAccountToken":true,"create":true,"labels":{},"name":"toolhive-operator"},"tolerations":[],"toolhiveRunnerImage":"ghcr.io/stacklok/toolhive/proxyrunner:v0.23.1","vmcpImage":"ghcr.io/stacklok/toolhive/vmcp:v0.23.1","volumeMounts":[],"volumes":[]}` | All values for the operator deployment and associated resources | | operator.affinity | object | `{}` | Affinity settings for the operator pod | | operator.autoscaling | object | `{"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}` | Configuration for horizontal pod autoscaling | | operator.autoscaling.enabled | bool | `false` | Enable autoscaling for the operator | @@ -58,6 +58,7 @@ The command removes all the Kubernetes components associated with the chart and | operator.features.experimental | bool | `false` | Enable experimental features | | operator.features.registry | bool | `true` | Enable registry controller (MCPRegistry). This automatically sets ENABLE_REGISTRY environment variable. | | operator.features.server | bool | `true` | Enable server-related controllers (MCPServer, MCPExternalAuthConfig, MCPRemoteProxy, and ToolConfig). This automatically sets ENABLE_SERVER environment variable. | +| operator.features.storageVersionMigrator | bool | `true` | Enable the StorageVersionMigrator controller, which auto-cleans status.storedVersions on opted-in toolhive.stacklok.dev CRDs so a future release can drop deprecated versions (e.g. v1alpha1) without orphaning etcd objects. Leave this on unless you are running kube-storage-version-migrator externally. This automatically sets ENABLE_STORAGE_VERSION_MIGRATOR environment variable. 
| | operator.features.virtualMCP | bool | `true` | Enable Virtual MCP aggregation features (VirtualMCPServer, MCPGroup controllers and webhooks). Set to false to disable Virtual MCP controllers when Virtual MCP CRDs are not installed. This automatically sets ENABLE_VMCP environment variable. Requires server to be enabled (server: true). | | operator.gc | object | `{"gogc":75,"gomemlimit":"110MiB"}` | Go memory limits and garbage collection percentage for the operator container | | operator.gc.gogc | int | `75` | Go garbage collection percentage for the operator container | diff --git a/deploy/charts/operator/templates/clusterrole/role.yaml b/deploy/charts/operator/templates/clusterrole/role.yaml index 120b549df6..13d0e291eb 100644 --- a/deploy/charts/operator/templates/clusterrole/role.yaml +++ b/deploy/charts/operator/templates/clusterrole/role.yaml @@ -48,6 +48,21 @@ rules: - pods/log verbs: - get +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions/status + verbs: + - patch + - update - apiGroups: - apps resources: @@ -98,26 +113,15 @@ rules: - apiGroups: - toolhive.stacklok.dev resources: - - embeddingservers - - mcpexternalauthconfigs - - mcpgroups - - mcpoidcconfigs - - mcpregistries - - mcpremoteproxies - - mcpservers - - mcptoolconfigs - - virtualmcpservers + - '*' verbs: - - create - - delete - get - list - - patch - update - - watch - apiGroups: - toolhive.stacklok.dev resources: + - '*/status' - embeddingservers/finalizers - mcpexternalauthconfigs/finalizers - mcpgroups/finalizers @@ -128,6 +132,26 @@ rules: - mcptoolconfigs/finalizers verbs: - update +- apiGroups: + - toolhive.stacklok.dev + resources: + - embeddingservers + - mcpexternalauthconfigs + - mcpgroups + - mcpoidcconfigs + - mcpregistries + - mcpremoteproxies + - mcpservers + - mcptoolconfigs + - virtualmcpservers + verbs: + - create + - delete + - 
get + - list + - patch + - update + - watch - apiGroups: - toolhive.stacklok.dev resources: diff --git a/deploy/charts/operator/templates/deployment.yaml b/deploy/charts/operator/templates/deployment.yaml index 12170c58b8..60fced6f91 100644 --- a/deploy/charts/operator/templates/deployment.yaml +++ b/deploy/charts/operator/templates/deployment.yaml @@ -64,6 +64,8 @@ spec: value: {{ .Values.operator.features.registry | quote }} - name: ENABLE_VMCP value: {{ .Values.operator.features.virtualMCP | quote }} + - name: ENABLE_STORAGE_VERSION_MIGRATOR + value: {{ .Values.operator.features.storageVersionMigrator | quote }} {{- if eq .Values.operator.rbac.scope "namespace" }} - name: WATCH_NAMESPACE value: "{{ .Values.operator.rbac.allowedNamespaces | join "," }}" diff --git a/deploy/charts/operator/values.yaml b/deploy/charts/operator/values.yaml index d8eaa5a720..f199324598 100644 --- a/deploy/charts/operator/values.yaml +++ b/deploy/charts/operator/values.yaml @@ -20,6 +20,13 @@ operator: # This automatically sets ENABLE_VMCP environment variable. # Requires server to be enabled (server: true). virtualMCP: true + # -- Enable the StorageVersionMigrator controller, which auto-cleans + # status.storedVersions on opted-in toolhive.stacklok.dev CRDs so a + # future release can drop deprecated versions (e.g. v1alpha1) without + # orphaning etcd objects. Leave this on unless you are running + # kube-storage-version-migrator externally. + # This automatically sets ENABLE_STORAGE_VERSION_MIGRATOR environment variable. 
+ storageVersionMigrator: true # -- Number of replicas for the operator deployment replicaCount: 1 diff --git a/docs/operator/storage-version-migration.md b/docs/operator/storage-version-migration.md new file mode 100644 index 0000000000..828ddc1831 --- /dev/null +++ b/docs/operator/storage-version-migration.md @@ -0,0 +1,116 @@ +# Storage Version Migration + +The ToolHive operator ships a `StorageVersionMigrator` controller that keeps every ToolHive CRD's `status.storedVersions` list clean, so a future operator release can drop deprecated API versions (e.g. `v1alpha1`) without orphaning objects in etcd. + +## Why this exists + +When a CRD graduates from, say, `v1alpha1` to `v1beta1` with both versions served and `v1beta1` as the storage version, existing objects continue to work — they are transparently converted on read/write. But the API server records every version that has ever been used for storage in `CustomResourceDefinition.status.storedVersions`. Until that list is trimmed, the Kubernetes API server refuses to let you remove a version from `spec.versions`, because doing so would orphan any etcd-stored objects encoded at that version. + +The cleanup is not automatic. Someone has to re-store every existing object at the current storage version, then explicitly patch `status.storedVersions` to drop the old entry. The `StorageVersionMigrator` controller does this for you, on every opted-in ToolHive CRD, continuously. See [upstream Kubernetes documentation](https://kubernetes.io/docs/tasks/manage-kubernetes-objects/storage-version-migration/) for the mechanism. + +## What the controller does + +For each opted-in CRD: + +1. Reads `spec.versions` to find the entry with `storage: true`. +2. If `status.storedVersions` already contains only the current storage version and only one version is served, nothing to do. +3. Otherwise, lists every Custom Resource of that kind and, for each one, does a plain `Get` followed by an `Update` of the unmodified live object.
The apiserver re-encodes the request body at the current storage version and compares it to what is in etcd. When the CR was originally stored at a different `apiVersion` (the actual migration scenario) the bytes carry a different stamp than etcd's record, the comparison fails, and the write proceeds — re-encoding the object at the current storage version. When the CR is already at the current storage version, the bytes match and the apiserver harmlessly elides the write — there was nothing to migrate. This matches the upstream `kube-storage-version-migrator`'s mechanism, confirmed at [kubernetes-sigs/kube-storage-version-migrator#65](https://github.com/kubernetes-sigs/kube-storage-version-migrator/issues/65). +4. Once every object has been processed, patches `CRD.status.storedVersions` down to just the current storage version using an optimistic-lock merge — so concurrent API-server writes cause a clean retry rather than a silent overwrite. + +### Webhook interaction + +The migrator's writes go through the **main resource** Update path, so any validating or mutating admission webhooks registered on the kind see each request as part of normal admission flow. However, only requests that the apiserver actually persists produce downstream state changes a webhook would meaningfully react to — webhook load is bounded by "objects actually being migrated" (CRs whose etcd bytes carry an older `apiVersion` stamp), not "every reconcile pass × every CR".
A CI test (`TestV1beta1TypesMarkerCoverage`) fails the build if a root type is added without either this marker or an explicit `// +thv:storage-version-migrator:exclude` sibling marker — so the migrator cannot silently forget a new CRD. + +Adding a new CRD that should be migrated: + +```go +// +kubebuilder:metadata:labels=toolhive.stacklok.dev/auto-migrate-storage-version=true +type NewShinyThing struct { ... } +``` + +Adding a new CRD that deliberately should NOT be migrated (e.g. an experimental kind that is still stabilising its schema): + +```go +// +thv:storage-version-migrator:exclude +type ExperimentalThing struct { ... } +``` + +## Disabling the controller + +Set the Helm feature flag: + +```yaml +operator: + features: + storageVersionMigrator: false # default: true +``` + +This sets `ENABLE_STORAGE_VERSION_MIGRATOR=false` on the operator Deployment, and the reconciler is not registered with the manager. + +Disable only if you are running an external migrator such as [kube-storage-version-migrator](https://github.com/kubernetes-sigs/kube-storage-version-migrator). Disabling without a replacement is a footgun: the next ToolHive release that removes a deprecated API version will refuse to apply its CRD update until `storedVersions` is cleaned, and you will have to clean it yourself. + +## Per-CRD emergency escape hatch + +Removing the label on a live cluster excludes that single CRD from migration immediately: + +```bash +kubectl label crd/mcpservers.toolhive.stacklok.dev \ + toolhive.stacklok.dev/auto-migrate-storage-version- +``` + +Intended for incident response only. If you deploy the operator via GitOps (Argo CD, Flux) or `helm upgrade`, the chart will re-apply the chart-set label within seconds. Use the `storageVersionMigrator` feature flag for long-term opt-out. 
+ +## Interaction with version removal releases + +The `StorageVersionMigrator` must have had time to run against your cluster *before* an operator release that drops a deprecated CRD version ships. The typical sequence is: + +1. **Release N**: both versions served, newer version is storage, `StorageVersionMigrator` enabled. The controller quietly re-stores all objects and trims `storedVersions` on every cluster during this deprecation window. +2. **Release N+1+**: the deprecated version is removed from `spec.versions`. Because every cluster's `storedVersions` was already cleaned in the previous release, the CRD update applies cleanly. + +If your cluster upgraded directly from a pre-migrator release to the version-removal release without ever running release N, you must clean `storedVersions` manually (or deploy `kube-storage-version-migrator` once) before the upgrade can succeed. + +## Verification + +For any ToolHive CRD in a cluster where the controller has run: + +```bash +kubectl get crd mcpservers.toolhive.stacklok.dev \ + -o jsonpath='{.status.storedVersions}' +# ["v1beta1"] +``` + +If the list contains more than one entry, the controller has not yet finished migrating — check operator logs for reconcile errors and the `StorageVersionMigrationFailed` event on the CRD. 
+ +## RBAC + +The controller requires (generated from kubebuilder markers, applied by the operator Helm chart): + +- `customresourcedefinitions.apiextensions.k8s.io`: `get`, `list`, `watch` +- `customresourcedefinitions/status.apiextensions.k8s.io`: `update`, `patch` +- `*.toolhive.stacklok.dev`: `get`, `list`, `update` +- `*/status.toolhive.stacklok.dev`: `update` + +## Related + +- Issue: [stacklok/toolhive#4969](https://github.com/stacklok/toolhive/issues/4969) +- Kubernetes CRD versioning: [official docs](https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definition-versioning/) +- Reference implementation: [kubernetes-sigs/kube-storage-version-migrator](https://github.com/kubernetes-sigs/kube-storage-version-migrator) (the upstream SIG API Machinery tool — our controller uses the same Get+Update mechanism, scoped to ToolHive CRDs and triggered automatically rather than via per-migration `StorageVersionMigration` resources)