From cacf24f4ada1ab08f7ed3efae24d43c2c810e93f Mon Sep 17 00:00:00 2001
From: Sanskarzz <sanskar.gur@gmail.com>
Date: Mon, 27 Apr 2026 23:52:02 +0530
Subject: [PATCH 1/4] Configure rate limits on VirtualMCPServer

Signed-off-by: Sanskarzz <sanskar.gur@gmail.com>
---
 .../api/v1beta1/mcpserver_types.go            |  54 ++-
 .../api/v1beta1/mcpserver_types_test.go       |  45 +++
 .../api/v1beta1/virtualmcpserver_types.go     |   9 +
 .../api/v1beta1/zz_generated.deepcopy.go      |  17 +-
 .../virtualmcpserver_vmcpconfig_test.go       |  29 ++
 cmd/thv-operator/pkg/vmcpconfig/converter.go  |   1 +
 .../pkg/vmcpconfig/converter_test.go          |  36 ++
 ...irtualmcpserver_sessionstorage_cel_test.go |  55 +++
 .../toolhive.stacklok.dev_mcpservers.yaml     | 128 +++++-
 ...olhive.stacklok.dev_virtualmcpservers.yaml | 364 ++++++++++++++++++
 .../toolhive.stacklok.dev_mcpservers.yaml     | 128 +++++-
 ...olhive.stacklok.dev_virtualmcpservers.yaml | 364 ++++++++++++++++++
 docs/operator/crd-api.md                      |  48 ++-
 docs/server/docs.go                           |   6 +
 docs/server/swagger.json                      |   6 +
 docs/server/swagger.yaml                      |   4 +
 pkg/ratelimit/config/config.go                | 147 +++++++
 pkg/ratelimit/config/doc.go                   |   8 +
 pkg/ratelimit/limiter.go                      |  42 +-
 pkg/ratelimit/limiter_test.go                 | 172 ++++++++-
 pkg/ratelimit/middleware.go                   |  34 +-
 pkg/ratelimit/middleware_test.go              |  51 ++-
 pkg/runner/middleware.go                      |   2 +-
 pkg/vmcp/cli/serve.go                         |   2 +
 pkg/vmcp/config/config.go                     |   9 +
 pkg/vmcp/config/zz_generated.deepcopy.go      |   4 +
 pkg/vmcp/server/ratelimit.go                  |  72 ++++
 pkg/vmcp/server/ratelimit_test.go             |  76 ++++
 pkg/vmcp/server/server.go                     | 184 +++++----
 .../virtualmcp/virtualmcp_ratelimit_test.go   | 248 ++++++++++++
 30 files changed, 2185 insertions(+), 160 deletions(-)
 create mode 100644 pkg/ratelimit/config/config.go
 create mode 100644 pkg/ratelimit/config/doc.go
 create mode 100644 pkg/vmcp/server/ratelimit.go
 create mode 100644 pkg/vmcp/server/ratelimit_test.go
 create mode 100644 test/e2e/thv-operator/virtualmcp/virtualmcp_ratelimit_test.go

diff --git a/cmd/thv-operator/api/v1beta1/mcpserver_types.go b/cmd/thv-operator/api/v1beta1/mcpserver_types.go
index 3a21eac58e..6738e6e134 100644
--- a/cmd/thv-operator/api/v1beta1/mcpserver_types.go
+++ b/cmd/thv-operator/api/v1beta1/mcpserver_types.go
@@ -7,6 +7,8 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
+
+	rlconfig "github.com/stacklok/toolhive/pkg/ratelimit/config"
 )
 
 // Condition types for MCPServer
@@ -505,11 +507,15 @@ type SessionStorageConfig struct {
 // RateLimitConfig defines rate limiting configuration for an MCP server.
 // At least one of shared, perUser, or tools must be configured.
 //
-// +kubebuilder:validation:XValidation:rule="has(self.shared) || has(self.perUser) || (has(self.tools) && size(self.tools) > 0)",message="at least one of shared, perUser, or tools must be configured"
+// +kubebuilder:validation:XValidation:rule="has(self.global) || has(self.shared) || has(self.perUser) || (has(self.tools) && size(self.tools) > 0)",message="at least one of global, shared, perUser, or tools must be configured"
 //
 //nolint:lll // CEL validation rules exceed line length limit
 type RateLimitConfig struct {
-	// Shared is a token bucket shared across all users for the entire server.
+	// Global is a token bucket shared across all users for the entire server.
+	// +optional
+	Global *RateLimitBucket `json:"global,omitempty"`
+
+	// Shared is a deprecated alias for Global. Use global instead.
 	// +optional
 	Shared *RateLimitBucket `json:"shared,omitempty"`
 
@@ -547,9 +553,9 @@ type RateLimitBucket struct {
 }
 
 // ToolRateLimitConfig defines rate limits for a specific tool.
-// At least one of shared or perUser must be configured.
+// At least one of global, shared, or perUser must be configured.
 //
-// +kubebuilder:validation:XValidation:rule="has(self.shared) || has(self.perUser)",message="at least one of shared or perUser must be configured"
+// +kubebuilder:validation:XValidation:rule="has(self.global) || has(self.shared) || has(self.perUser)",message="at least one of global, shared, or perUser must be configured"
 //
 //nolint:lll // kubebuilder marker exceeds line length
 type ToolRateLimitConfig struct {
@@ -558,7 +564,11 @@ type ToolRateLimitConfig struct {
 	// +kubebuilder:validation:MinLength=1
 	Name string `json:"name"`
 
-	// Shared token bucket for this specific tool.
+	// Global token bucket for this specific tool.
+	// +optional
+	Global *RateLimitBucket `json:"global,omitempty"`
+
+	// Shared is a deprecated alias for Global. Use global instead.
 	// +optional
 	Shared *RateLimitBucket `json:"shared,omitempty"`
 
@@ -567,6 +577,40 @@ type ToolRateLimitConfig struct {
 	PerUser *RateLimitBucket `json:"perUser,omitempty"`
 }
 
+// ToInternal converts the Kubernetes API rate limit config to the runtime-neutral representation.
+func (in *RateLimitConfig) ToInternal() *rlconfig.Config {
+	if in == nil {
+		return nil
+	}
+	out := &rlconfig.Config{
+		Global:  rateLimitBucketToInternal(in.Global),
+		Shared:  rateLimitBucketToInternal(in.Shared),
+		PerUser: rateLimitBucketToInternal(in.PerUser),
+	}
+	if len(in.Tools) > 0 {
+		out.Tools = make([]rlconfig.ToolConfig, 0, len(in.Tools))
+		for _, tool := range in.Tools {
+			out.Tools = append(out.Tools, rlconfig.ToolConfig{
+				Name:    tool.Name,
+				Global:  rateLimitBucketToInternal(tool.Global),
+				Shared:  rateLimitBucketToInternal(tool.Shared),
+				PerUser: rateLimitBucketToInternal(tool.PerUser),
+			})
+		}
+	}
+	return out
+}
+
+func rateLimitBucketToInternal(in *RateLimitBucket) *rlconfig.Bucket {
+	if in == nil {
+		return nil
+	}
+	return &rlconfig.Bucket{
+		MaxTokens:    in.MaxTokens,
+		RefillPeriod: in.RefillPeriod,
+	}
+}
+
 // Permission profile types
 const (
 	// PermissionProfileTypeBuiltin is the type for built-in permission profiles
diff --git a/cmd/thv-operator/api/v1beta1/mcpserver_types_test.go b/cmd/thv-operator/api/v1beta1/mcpserver_types_test.go
index 0e69f33836..25506fb4dd 100644
--- a/cmd/thv-operator/api/v1beta1/mcpserver_types_test.go
+++ b/cmd/thv-operator/api/v1beta1/mcpserver_types_test.go
@@ -75,6 +75,13 @@ func TestRateLimitConfigJSONRoundtrip(t *testing.T) {
 		input    RateLimitConfig
 		wantJSON string
 	}{
+		{
+			name: "global only",
+			input: RateLimitConfig{
+				Global: &RateLimitBucket{MaxTokens: 100, RefillPeriod: metav1.Duration{Duration: time.Minute}},
+			},
+			wantJSON: `{"global":{"maxTokens":100,"refillPeriod":"1m0s"}}`,
+		},
 		{
 			name: "shared only",
 			input: RateLimitConfig{
@@ -116,6 +123,44 @@ func TestRateLimitConfigJSONRoundtrip(t *testing.T) {
 	}
 }
 
+func TestVirtualMCPServerSpecRateLimitingJSONRoundtrip(t *testing.T) {
+	t.Parallel()
+
+	spec := VirtualMCPServerSpec{
+		IncomingAuth: &IncomingAuthConfig{Type: "oidc"},
+		GroupRef:     &MCPGroupRef{Name: "group-a"},
+		SessionStorage: &SessionStorageConfig{
+			Provider: "redis",
+			Address:  "redis.default.svc.cluster.local:6379",
+		},
+		RateLimiting: &RateLimitConfig{
+			Global: &RateLimitBucket{MaxTokens: 10, RefillPeriod: metav1.Duration{Duration: time.Minute}},
+			PerUser: &RateLimitBucket{
+				MaxTokens:    2,
+				RefillPeriod: metav1.Duration{Duration: time.Minute},
+			},
+			Tools: []ToolRateLimitConfig{
+				{
+					Name: "backend_a_echo",
+					Global: &RateLimitBucket{
+						MaxTokens:    5,
+						RefillPeriod: metav1.Duration{Duration: 30 * time.Second},
+					},
+				},
+			},
+		},
+	}
+
+	b, err := json.Marshal(spec)
+	require.NoError(t, err)
+	out := string(b)
+	assert.Contains(t, out, `"rateLimiting"`)
+	assert.Contains(t, out, `"global"`)
+	assert.Contains(t, out, `"perUser"`)
+	assert.Contains(t, out, `"backend_a_echo"`)
+	assert.NotContains(t, out, `"config":{"rateLimiting"`)
+}
+
 func TestMCPServerSpecScalingFieldsJSONRoundtrip(t *testing.T) {
 	t.Parallel()
 
diff --git a/cmd/thv-operator/api/v1beta1/virtualmcpserver_types.go b/cmd/thv-operator/api/v1beta1/virtualmcpserver_types.go
index c63139b133..a23f2feaaa 100644
--- a/cmd/thv-operator/api/v1beta1/virtualmcpserver_types.go
+++ b/cmd/thv-operator/api/v1beta1/virtualmcpserver_types.go
@@ -16,6 +16,10 @@ import (
 
 // VirtualMCPServerSpec defines the desired state of VirtualMCPServer
 //
+// +kubebuilder:validation:XValidation:rule="!has(self.rateLimiting) || (has(self.sessionStorage) && self.sessionStorage.provider == 'redis')",message="rateLimiting requires sessionStorage with provider 'redis'"
+// +kubebuilder:validation:XValidation:rule="!(has(self.rateLimiting) && has(self.rateLimiting.perUser)) || (has(self.incomingAuth) && self.incomingAuth.type == 'oidc')",message="rateLimiting.perUser requires incomingAuth.type oidc"
+// +kubebuilder:validation:XValidation:rule="!has(self.rateLimiting) || !has(self.rateLimiting.tools) || self.rateLimiting.tools.all(t, !has(t.perUser)) || (has(self.incomingAuth) && self.incomingAuth.type == 'oidc')",message="per-tool perUser rate limiting requires incomingAuth.type oidc"
+//
 //nolint:lll // CEL validation rules exceed line length limit
 type VirtualMCPServerSpec struct {
 	// IncomingAuth configures authentication for clients connecting to the Virtual MCP server.
@@ -143,6 +147,11 @@ type VirtualMCPServerSpec struct {
 	// +listType=atomic
 	// +optional
 	ImagePullSecrets []corev1.LocalObjectReference `json:"imagePullSecrets,omitempty"`
+
+	// RateLimiting defines rate limiting configuration for the Virtual MCP server.
+	// Requires Redis session storage to be configured for distributed rate limiting.
+	// +optional
+	RateLimiting *RateLimitConfig `json:"rateLimiting,omitempty"`
 }
 
 // EmbeddingServerRef references an existing EmbeddingServer resource by name.
diff --git a/cmd/thv-operator/api/v1beta1/zz_generated.deepcopy.go b/cmd/thv-operator/api/v1beta1/zz_generated.deepcopy.go
index 04da44756d..fd3706419d 100644
--- a/cmd/thv-operator/api/v1beta1/zz_generated.deepcopy.go
+++ b/cmd/thv-operator/api/v1beta1/zz_generated.deepcopy.go
@@ -23,7 +23,7 @@ package v1beta1
 import (
 	corev1 "k8s.io/api/core/v1"
 	apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
-	"k8s.io/apimachinery/pkg/apis/meta/v1"
+	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 )
 
@@ -2210,6 +2210,11 @@ func (in *RateLimitBucket) DeepCopy() *RateLimitBucket {
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *RateLimitConfig) DeepCopyInto(out *RateLimitConfig) {
 	*out = *in
+	if in.Global != nil {
+		in, out := &in.Global, &out.Global
+		*out = new(RateLimitBucket)
+		**out = **in
+	}
 	if in.Shared != nil {
 		in, out := &in.Shared, &out.Shared
 		*out = new(RateLimitBucket)
@@ -2664,6 +2669,11 @@ func (in *ToolOverride) DeepCopy() *ToolOverride {
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *ToolRateLimitConfig) DeepCopyInto(out *ToolRateLimitConfig) {
 	*out = *in
+	if in.Global != nil {
+		in, out := &in.Global, &out.Global
+		*out = new(RateLimitBucket)
+		**out = **in
+	}
 	if in.Shared != nil {
 		in, out := &in.Shared, &out.Shared
 		*out = new(RateLimitBucket)
@@ -3008,6 +3018,11 @@ func (in *VirtualMCPServerSpec) DeepCopyInto(out *VirtualMCPServerSpec) {
 		*out = make([]corev1.LocalObjectReference, len(*in))
 		copy(*out, *in)
 	}
+	if in.RateLimiting != nil {
+		in, out := &in.RateLimiting, &out.RateLimiting
+		*out = new(RateLimitConfig)
+		(*in).DeepCopyInto(*out)
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VirtualMCPServerSpec.
diff --git a/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig_test.go b/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig_test.go
index 5d0fe5efde..76da31960f 100644
--- a/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig_test.go
+++ b/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig_test.go
@@ -459,6 +459,25 @@ func TestEnsureVmcpConfigConfigMap(t *testing.T) {
 		},
 		Spec: mcpv1beta1.VirtualMCPServerSpec{
 			GroupRef: &mcpv1beta1.MCPGroupRef{Name: "test-group"},
+			SessionStorage: &mcpv1beta1.SessionStorageConfig{
+				Provider: mcpv1beta1.SessionStorageProviderRedis,
+				Address:  "redis.default.svc.cluster.local:6379",
+			},
+			RateLimiting: &mcpv1beta1.RateLimitConfig{
+				PerUser: &mcpv1beta1.RateLimitBucket{
+					MaxTokens:    2,
+					RefillPeriod: metav1.Duration{Duration: time.Minute},
+				},
+				Tools: []mcpv1beta1.ToolRateLimitConfig{
+					{
+						Name: "backend_a_echo",
+						Global: &mcpv1beta1.RateLimitBucket{
+							MaxTokens:    5,
+							RefillPeriod: metav1.Duration{Duration: time.Minute},
+						},
+					},
+				},
+			},
 		},
 	}
 
@@ -507,6 +526,16 @@ func TestEnsureVmcpConfigConfigMap(t *testing.T) {
 	assert.Equal(t, "test-vmcp-vmcp-config", cm.Name)
 	assert.Contains(t, cm.Data, "config.yaml")
 	assert.NotEmpty(t, cm.Annotations["toolhive.stacklok.dev/content-checksum"])
+
+	var cfg vmcpconfig.Config
+	require.NoError(t, yaml.Unmarshal([]byte(cm.Data["config.yaml"]), &cfg))
+	require.NotNil(t, cfg.RateLimiting, "runtime config must include spec.rateLimiting")
+	require.NotNil(t, cfg.RateLimiting.PerUser)
+	assert.Equal(t, int32(2), cfg.RateLimiting.PerUser.MaxTokens)
+	require.Len(t, cfg.RateLimiting.Tools, 1)
+	assert.Equal(t, "backend_a_echo", cfg.RateLimiting.Tools[0].Name)
+	require.NotNil(t, cfg.RateLimiting.Tools[0].Global)
+	assert.Equal(t, int32(5), cfg.RateLimiting.Tools[0].Global.MaxTokens)
 }
 
 // TestSetAuthConfigConditions tests that auth config conditions reflect the current state
diff --git a/cmd/thv-operator/pkg/vmcpconfig/converter.go b/cmd/thv-operator/pkg/vmcpconfig/converter.go
index cd31af6d69..6c9a4975e9 100644
--- a/cmd/thv-operator/pkg/vmcpconfig/converter.go
+++ b/cmd/thv-operator/pkg/vmcpconfig/converter.go
@@ -140,6 +140,7 @@ func (c *Converter) Convert(
 	}
 
 	config.SessionStorage = convertSessionStorage(vmcp)
+	config.RateLimiting = vmcp.Spec.RateLimiting.ToInternal()
 
 	// Apply operational defaults (fills missing values)
 	config.EnsureOperationalDefaults()
diff --git a/cmd/thv-operator/pkg/vmcpconfig/converter_test.go b/cmd/thv-operator/pkg/vmcpconfig/converter_test.go
index cee72256af..0fdb67443b 100644
--- a/cmd/thv-operator/pkg/vmcpconfig/converter_test.go
+++ b/cmd/thv-operator/pkg/vmcpconfig/converter_test.go
@@ -97,6 +97,42 @@ func newTestConverterWithObjects(t *testing.T, resolver oidc.Resolver, objects .
 	return converter
 }
 
+func TestConverter_RateLimitingFromTopLevelSpec(t *testing.T) {
+	t.Parallel()
+
+	vmcp := &mcpv1beta1.VirtualMCPServer{
+		ObjectMeta: metav1.ObjectMeta{Name: "test-vmcp", Namespace: "default"},
+		Spec: mcpv1beta1.VirtualMCPServerSpec{
+			GroupRef:     &mcpv1beta1.MCPGroupRef{Name: "test-group"},
+			IncomingAuth: &mcpv1beta1.IncomingAuthConfig{Type: "anonymous"},
+			RateLimiting: &mcpv1beta1.RateLimitConfig{
+				Global: &mcpv1beta1.RateLimitBucket{
+					MaxTokens:    10,
+					RefillPeriod: metav1.Duration{Duration: time.Minute},
+				},
+				Tools: []mcpv1beta1.ToolRateLimitConfig{
+					{
+						Name: "backend_a_echo",
+						Global: &mcpv1beta1.RateLimitBucket{
+							MaxTokens:    1,
+							RefillPeriod: metav1.Duration{Duration: time.Minute},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	converter := newTestConverter(t, newNoOpMockResolver(t))
+	cfg, _, err := converter.Convert(context.Background(), vmcp, nil)
+	require.NoError(t, err)
+	require.NotNil(t, cfg.RateLimiting)
+	require.NotNil(t, cfg.RateLimiting.Global)
+	assert.Equal(t, int32(10), cfg.RateLimiting.Global.MaxTokens)
+	require.Len(t, cfg.RateLimiting.Tools, 1)
+	assert.Equal(t, "backend_a_echo", cfg.RateLimiting.Tools[0].Name)
+}
+
 func TestConverter_OIDCResolution(t *testing.T) {
 	t.Parallel()
 
diff --git a/cmd/thv-operator/test-integration/virtualmcp/virtualmcpserver_sessionstorage_cel_test.go b/cmd/thv-operator/test-integration/virtualmcp/virtualmcpserver_sessionstorage_cel_test.go
index 45b6043196..df614f32a6 100644
--- a/cmd/thv-operator/test-integration/virtualmcp/virtualmcpserver_sessionstorage_cel_test.go
+++ b/cmd/thv-operator/test-integration/virtualmcp/virtualmcpserver_sessionstorage_cel_test.go
@@ -5,6 +5,8 @@
 package controllers
 
 import (
+	"time"
+
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -106,4 +108,57 @@ var _ = Describe("CEL Validation for SessionStorageConfig on VirtualMCPServer",
 				Expect(err).To(HaveOccurred())
 			})
 		})
+
+		Context("rateLimiting", func() {
+			It("should reject rate limiting without redis session storage", func() {
+				vmcp := newVirtualMCPServerWithSessionStorage("vmcp-rl-no-redis", nil)
+				vmcp.Spec.RateLimiting = &mcpv1beta1.RateLimitConfig{
+					Global: &mcpv1beta1.RateLimitBucket{
+						MaxTokens:    1,
+						RefillPeriod: metav1.Duration{Duration: time.Minute},
+					},
+				}
+
+				err := k8sClient.Create(ctx, vmcp)
+				Expect(err).To(HaveOccurred())
+				Expect(err.Error()).To(ContainSubstring("rateLimiting requires sessionStorage with provider 'redis'"))
+			})
+
+			It("should reject perUser rate limiting with anonymous auth", func() {
+				vmcp := newVirtualMCPServerWithSessionStorage("vmcp-rl-peruser-anon", &mcpv1beta1.SessionStorageConfig{
+					Provider: "redis",
+					Address:  "redis:6379",
+				})
+				vmcp.Spec.RateLimiting = &mcpv1beta1.RateLimitConfig{
+					PerUser: &mcpv1beta1.RateLimitBucket{
+						MaxTokens:    1,
+						RefillPeriod: metav1.Duration{Duration: time.Minute},
+					},
+				}
+
+				err := k8sClient.Create(ctx, vmcp)
+				Expect(err).To(HaveOccurred())
+				Expect(err.Error()).To(ContainSubstring("rateLimiting.perUser requires incomingAuth.type oidc"))
+			})
+
+			It("should accept perUser rate limiting with oidc auth and redis session storage", func() {
+				vmcp := newVirtualMCPServerWithSessionStorage("vmcp-rl-peruser-oidc", &mcpv1beta1.SessionStorageConfig{
+					Provider: "redis",
+					Address:  "redis:6379",
+				})
+				vmcp.Spec.IncomingAuth = &mcpv1beta1.IncomingAuthConfig{
+					Type:          "oidc",
+					OIDCConfigRef: &mcpv1beta1.MCPOIDCConfigReference{Name: "oidc"},
+				}
+				vmcp.Spec.RateLimiting = &mcpv1beta1.RateLimitConfig{
+					PerUser: &mcpv1beta1.RateLimitBucket{
+						MaxTokens:    1,
+						RefillPeriod: metav1.Duration{Duration: time.Minute},
+					},
+				}
+
+				err := k8sClient.Create(ctx, vmcp)
+				Expect(err).NotTo(HaveOccurred())
+			})
+		})
 	})
diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpservers.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpservers.yaml
index 44eb6f7a02..33aa0febec 100644
--- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpservers.yaml
+++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpservers.yaml
@@ -320,6 +320,28 @@ spec:
                   RateLimiting defines rate limiting configuration for the MCP server.
                   Requires Redis session storage to be configured for distributed rate limiting.
                 properties:
+                  global:
+                    description: Global is a token bucket shared across all users
+                      for the entire server.
+                    properties:
+                      maxTokens:
+                        description: |-
+                          MaxTokens is the maximum number of tokens (bucket capacity).
+                          This is also the burst size: the maximum number of requests that can be served
+                          instantaneously before the bucket is depleted.
+                        format: int32
+                        minimum: 1
+                        type: integer
+                      refillPeriod:
+                        description: |-
+                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                          The effective refill rate is maxTokens / refillPeriod tokens per second.
+                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                        type: string
+                    required:
+                    - maxTokens
+                    - refillPeriod
+                    type: object
                   perUser:
                     description: |-
                       PerUser is a token bucket applied independently to each authenticated user
@@ -346,8 +368,8 @@ spec:
                     - refillPeriod
                     type: object
                   shared:
-                    description: Shared is a token bucket shared across all users
-                      for the entire server.
+                    description: Shared is a deprecated alias for Global. Use global
+                      instead.
                     properties:
                       maxTokens:
                         description: |-
@@ -375,8 +397,29 @@ spec:
                     items:
                       description: |-
                         ToolRateLimitConfig defines rate limits for a specific tool.
-                        At least one of shared or perUser must be configured.
+                        At least one of global, shared, or perUser must be configured.
                       properties:
+                        global:
+                          description: Global token bucket for this specific tool.
+                          properties:
+                            maxTokens:
+                              description: |-
+                                MaxTokens is the maximum number of tokens (bucket capacity).
+                                This is also the burst size: the maximum number of requests that can be served
+                                instantaneously before the bucket is depleted.
+                              format: int32
+                              minimum: 1
+                              type: integer
+                            refillPeriod:
+                              description: |-
+                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                                The effective refill rate is maxTokens / refillPeriod tokens per second.
+                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                              type: string
+                          required:
+                          - maxTokens
+                          - refillPeriod
+                          type: object
                         name:
                           description: Name is the MCP tool name this limit applies
                             to.
@@ -405,7 +448,8 @@ spec:
                           - refillPeriod
                           type: object
                         shared:
-                          description: Shared token bucket for this specific tool.
+                          description: Shared is a deprecated alias for Global. Use
+                            global instead.
                           properties:
                             maxTokens:
                               description: |-
@@ -429,17 +473,19 @@ spec:
                       - name
                       type: object
                       x-kubernetes-validations:
-                      - message: at least one of shared or perUser must be configured
-                        rule: has(self.shared) || has(self.perUser)
+                      - message: at least one of global, shared, or perUser must be
+                          configured
+                        rule: has(self.global) || has(self.shared) || has(self.perUser)
                     type: array
                     x-kubernetes-list-map-keys:
                     - name
                     x-kubernetes-list-type: map
                 type: object
                 x-kubernetes-validations:
-                - message: at least one of shared, perUser, or tools must be configured
-                  rule: has(self.shared) || has(self.perUser) || (has(self.tools)
-                    && size(self.tools) > 0)
+                - message: at least one of global, shared, perUser, or tools must
+                    be configured
+                  rule: has(self.global) || has(self.shared) || has(self.perUser)
+                    || (has(self.tools) && size(self.tools) > 0)
               replicas:
                 description: |-
                   Replicas is the desired number of proxy runner (thv run) pod replicas.
@@ -1169,6 +1215,28 @@ spec:
                   RateLimiting defines rate limiting configuration for the MCP server.
                   Requires Redis session storage to be configured for distributed rate limiting.
                 properties:
+                  global:
+                    description: Global is a token bucket shared across all users
+                      for the entire server.
+                    properties:
+                      maxTokens:
+                        description: |-
+                          MaxTokens is the maximum number of tokens (bucket capacity).
+                          This is also the burst size: the maximum number of requests that can be served
+                          instantaneously before the bucket is depleted.
+                        format: int32
+                        minimum: 1
+                        type: integer
+                      refillPeriod:
+                        description: |-
+                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                          The effective refill rate is maxTokens / refillPeriod tokens per second.
+                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                        type: string
+                    required:
+                    - maxTokens
+                    - refillPeriod
+                    type: object
                   perUser:
                     description: |-
                       PerUser is a token bucket applied independently to each authenticated user
@@ -1195,8 +1263,8 @@ spec:
                     - refillPeriod
                     type: object
                   shared:
-                    description: Shared is a token bucket shared across all users
-                      for the entire server.
+                    description: Shared is a deprecated alias for Global. Use global
+                      instead.
                     properties:
                       maxTokens:
                         description: |-
@@ -1224,8 +1292,29 @@ spec:
                     items:
                       description: |-
                         ToolRateLimitConfig defines rate limits for a specific tool.
-                        At least one of shared or perUser must be configured.
+                        At least one of global, shared, or perUser must be configured.
                       properties:
+                        global:
+                          description: Global token bucket for this specific tool.
+                          properties:
+                            maxTokens:
+                              description: |-
+                                MaxTokens is the maximum number of tokens (bucket capacity).
+                                This is also the burst size: the maximum number of requests that can be served
+                                instantaneously before the bucket is depleted.
+                              format: int32
+                              minimum: 1
+                              type: integer
+                            refillPeriod:
+                              description: |-
+                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                                The effective refill rate is maxTokens / refillPeriod tokens per second.
+                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                              type: string
+                          required:
+                          - maxTokens
+                          - refillPeriod
+                          type: object
                         name:
                           description: Name is the MCP tool name this limit applies
                             to.
@@ -1254,7 +1343,8 @@ spec:
                           - refillPeriod
                           type: object
                         shared:
-                          description: Shared token bucket for this specific tool.
+                          description: Shared is a deprecated alias for Global. Use
+                            global instead.
                           properties:
                             maxTokens:
                               description: |-
@@ -1278,17 +1368,19 @@ spec:
                       - name
                       type: object
                       x-kubernetes-validations:
-                      - message: at least one of shared or perUser must be configured
-                        rule: has(self.shared) || has(self.perUser)
+                      - message: at least one of global, shared, or perUser must be
+                          configured
+                        rule: has(self.global) || has(self.shared) || has(self.perUser)
                     type: array
                     x-kubernetes-list-map-keys:
                     - name
                     x-kubernetes-list-type: map
                 type: object
                 x-kubernetes-validations:
-                - message: at least one of shared, perUser, or tools must be configured
-                  rule: has(self.shared) || has(self.perUser) || (has(self.tools)
-                    && size(self.tools) > 0)
+                - message: at least one of global, shared, perUser, or tools must
+                    be configured
+                  rule: has(self.global) || has(self.shared) || has(self.perUser)
+                    || (has(self.tools) && size(self.tools) > 0)
               replicas:
                 description: |-
                   Replicas is the desired number of proxy runner (thv run) pod replicas.
diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml
index a51fe4b5bd..bbc8df3509 100644
--- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml
+++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml
@@ -2242,6 +2242,177 @@ spec:
                   This field accepts a PodTemplateSpec object as JSON/YAML.
                 type: object
                 x-kubernetes-preserve-unknown-fields: true
+              rateLimiting:
+                description: |-
+                  RateLimiting defines rate limiting configuration for the Virtual MCP server.
+                  Requires Redis session storage to be configured for distributed rate limiting.
+                properties:
+                  global:
+                    description: Global is a token bucket shared across all users
+                      for the entire server.
+                    properties:
+                      maxTokens:
+                        description: |-
+                          MaxTokens is the maximum number of tokens (bucket capacity).
+                          This is also the burst size: the maximum number of requests that can be served
+                          instantaneously before the bucket is depleted.
+                        format: int32
+                        minimum: 1
+                        type: integer
+                      refillPeriod:
+                        description: |-
+                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                          The effective refill rate is maxTokens / refillPeriod tokens per second.
+                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                        type: string
+                    required:
+                    - maxTokens
+                    - refillPeriod
+                    type: object
+                  perUser:
+                    description: |-
+                      PerUser is a token bucket applied independently to each authenticated user
+                      at the server level. Requires authentication to be enabled.
+                      Each unique userID creates Redis keys that expire after 2x refillPeriod.
+                      Memory formula: unique_users_per_TTL_window * (1 + num_tools_with_per_user_limits) keys.
+                    properties:
+                      maxTokens:
+                        description: |-
+                          MaxTokens is the maximum number of tokens (bucket capacity).
+                          This is also the burst size: the maximum number of requests that can be served
+                          instantaneously before the bucket is depleted.
+                        format: int32
+                        minimum: 1
+                        type: integer
+                      refillPeriod:
+                        description: |-
+                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                          The effective refill rate is maxTokens / refillPeriod tokens per second.
+                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                        type: string
+                    required:
+                    - maxTokens
+                    - refillPeriod
+                    type: object
+                  shared:
+                    description: Shared is a deprecated alias for Global. Use global
+                      instead.
+                    properties:
+                      maxTokens:
+                        description: |-
+                          MaxTokens is the maximum number of tokens (bucket capacity).
+                          This is also the burst size: the maximum number of requests that can be served
+                          instantaneously before the bucket is depleted.
+                        format: int32
+                        minimum: 1
+                        type: integer
+                      refillPeriod:
+                        description: |-
+                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                          The effective refill rate is maxTokens / refillPeriod tokens per second.
+                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                        type: string
+                    required:
+                    - maxTokens
+                    - refillPeriod
+                    type: object
+                  tools:
+                    description: |-
+                      Tools defines per-tool rate limit overrides.
+                      Each entry applies additional rate limits to calls targeting a specific tool name.
+                      A request must pass both the server-level limit and the per-tool limit.
+                    items:
+                      description: |-
+                        ToolRateLimitConfig defines rate limits for a specific tool.
+                        At least one of global, shared, or perUser must be configured.
+                      properties:
+                        global:
+                          description: Global token bucket for this specific tool.
+                          properties:
+                            maxTokens:
+                              description: |-
+                                MaxTokens is the maximum number of tokens (bucket capacity).
+                                This is also the burst size: the maximum number of requests that can be served
+                                instantaneously before the bucket is depleted.
+                              format: int32
+                              minimum: 1
+                              type: integer
+                            refillPeriod:
+                              description: |-
+                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                                The effective refill rate is maxTokens / refillPeriod tokens per second.
+                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                              type: string
+                          required:
+                          - maxTokens
+                          - refillPeriod
+                          type: object
+                        name:
+                          description: Name is the MCP tool name this limit applies
+                            to.
+                          minLength: 1
+                          type: string
+                        perUser:
+                          description: PerUser token bucket configuration for this
+                            tool.
+                          properties:
+                            maxTokens:
+                              description: |-
+                                MaxTokens is the maximum number of tokens (bucket capacity).
+                                This is also the burst size: the maximum number of requests that can be served
+                                instantaneously before the bucket is depleted.
+                              format: int32
+                              minimum: 1
+                              type: integer
+                            refillPeriod:
+                              description: |-
+                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                                The effective refill rate is maxTokens / refillPeriod tokens per second.
+                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                              type: string
+                          required:
+                          - maxTokens
+                          - refillPeriod
+                          type: object
+                        shared:
+                          description: Shared is a deprecated alias for Global. Use
+                            global instead.
+                          properties:
+                            maxTokens:
+                              description: |-
+                                MaxTokens is the maximum number of tokens (bucket capacity).
+                                This is also the burst size: the maximum number of requests that can be served
+                                instantaneously before the bucket is depleted.
+                              format: int32
+                              minimum: 1
+                              type: integer
+                            refillPeriod:
+                              description: |-
+                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                                The effective refill rate is maxTokens / refillPeriod tokens per second.
+                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                              type: string
+                          required:
+                          - maxTokens
+                          - refillPeriod
+                          type: object
+                      required:
+                      - name
+                      type: object
+                      x-kubernetes-validations:
+                      - message: at least one of global, shared, or perUser must be
+                          configured
+                        rule: has(self.global) || has(self.shared) || has(self.perUser)
+                    type: array
+                    x-kubernetes-list-map-keys:
+                    - name
+                    x-kubernetes-list-type: map
+                type: object
+                x-kubernetes-validations:
+                - message: at least one of global, shared, perUser, or tools must
+                    be configured
+                  rule: has(self.global) || has(self.shared) || has(self.perUser)
+                    || (has(self.tools) && size(self.tools) > 0)
               replicas:
                 description: |-
                   Replicas is the desired number of vMCP pod replicas.
@@ -2347,6 +2518,17 @@ spec:
             - groupRef
             - incomingAuth
             type: object
+            x-kubernetes-validations:
+            - message: rateLimiting requires sessionStorage with provider 'redis'
+              rule: '!has(self.rateLimiting) || (has(self.sessionStorage) && self.sessionStorage.provider
+                == ''redis'')'
+            - message: rateLimiting.perUser requires incomingAuth.type oidc
+              rule: '!(has(self.rateLimiting) && has(self.rateLimiting.perUser)) ||
+                (has(self.incomingAuth) && self.incomingAuth.type == ''oidc'')'
+            - message: per-tool perUser rate limiting requires incomingAuth.type oidc
+              rule: '!has(self.rateLimiting) || !has(self.rateLimiting.tools) || self.rateLimiting.tools.all(t,
+                !has(t.perUser)) || (has(self.incomingAuth) && self.incomingAuth.type
+                == ''oidc'')'
           status:
             description: VirtualMCPServerStatus defines the observed state of VirtualMCPServer
             properties:
@@ -4738,6 +4920,177 @@ spec:
                   This field accepts a PodTemplateSpec object as JSON/YAML.
                 type: object
                 x-kubernetes-preserve-unknown-fields: true
+              rateLimiting:
+                description: |-
+                  RateLimiting defines rate limiting configuration for the Virtual MCP server.
+                  Requires Redis session storage to be configured for distributed rate limiting.
+                properties:
+                  global:
+                    description: Global is a token bucket shared across all users
+                      for the entire server.
+                    properties:
+                      maxTokens:
+                        description: |-
+                          MaxTokens is the maximum number of tokens (bucket capacity).
+                          This is also the burst size: the maximum number of requests that can be served
+                          instantaneously before the bucket is depleted.
+                        format: int32
+                        minimum: 1
+                        type: integer
+                      refillPeriod:
+                        description: |-
+                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                          The effective refill rate is maxTokens / refillPeriod tokens per second.
+                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                        type: string
+                    required:
+                    - maxTokens
+                    - refillPeriod
+                    type: object
+                  perUser:
+                    description: |-
+                      PerUser is a token bucket applied independently to each authenticated user
+                      at the server level. Requires authentication to be enabled.
+                      Each unique userID creates Redis keys that expire after 2x refillPeriod.
+                      Memory formula: unique_users_per_TTL_window * (1 + num_tools_with_per_user_limits) keys.
+                    properties:
+                      maxTokens:
+                        description: |-
+                          MaxTokens is the maximum number of tokens (bucket capacity).
+                          This is also the burst size: the maximum number of requests that can be served
+                          instantaneously before the bucket is depleted.
+                        format: int32
+                        minimum: 1
+                        type: integer
+                      refillPeriod:
+                        description: |-
+                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                          The effective refill rate is maxTokens / refillPeriod tokens per second.
+                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                        type: string
+                    required:
+                    - maxTokens
+                    - refillPeriod
+                    type: object
+                  shared:
+                    description: Shared is a deprecated alias for Global. Use global
+                      instead.
+                    properties:
+                      maxTokens:
+                        description: |-
+                          MaxTokens is the maximum number of tokens (bucket capacity).
+                          This is also the burst size: the maximum number of requests that can be served
+                          instantaneously before the bucket is depleted.
+                        format: int32
+                        minimum: 1
+                        type: integer
+                      refillPeriod:
+                        description: |-
+                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                          The effective refill rate is maxTokens / refillPeriod tokens per second.
+                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                        type: string
+                    required:
+                    - maxTokens
+                    - refillPeriod
+                    type: object
+                  tools:
+                    description: |-
+                      Tools defines per-tool rate limit overrides.
+                      Each entry applies additional rate limits to calls targeting a specific tool name.
+                      A request must pass both the server-level limit and the per-tool limit.
+                    items:
+                      description: |-
+                        ToolRateLimitConfig defines rate limits for a specific tool.
+                        At least one of global, shared, or perUser must be configured.
+                      properties:
+                        global:
+                          description: Global token bucket for this specific tool.
+                          properties:
+                            maxTokens:
+                              description: |-
+                                MaxTokens is the maximum number of tokens (bucket capacity).
+                                This is also the burst size: the maximum number of requests that can be served
+                                instantaneously before the bucket is depleted.
+                              format: int32
+                              minimum: 1
+                              type: integer
+                            refillPeriod:
+                              description: |-
+                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                                The effective refill rate is maxTokens / refillPeriod tokens per second.
+                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                              type: string
+                          required:
+                          - maxTokens
+                          - refillPeriod
+                          type: object
+                        name:
+                          description: Name is the MCP tool name this limit applies
+                            to.
+                          minLength: 1
+                          type: string
+                        perUser:
+                          description: PerUser token bucket configuration for this
+                            tool.
+                          properties:
+                            maxTokens:
+                              description: |-
+                                MaxTokens is the maximum number of tokens (bucket capacity).
+                                This is also the burst size: the maximum number of requests that can be served
+                                instantaneously before the bucket is depleted.
+                              format: int32
+                              minimum: 1
+                              type: integer
+                            refillPeriod:
+                              description: |-
+                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                                The effective refill rate is maxTokens / refillPeriod tokens per second.
+                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                              type: string
+                          required:
+                          - maxTokens
+                          - refillPeriod
+                          type: object
+                        shared:
+                          description: Shared is a deprecated alias for Global. Use
+                            global instead.
+                          properties:
+                            maxTokens:
+                              description: |-
+                                MaxTokens is the maximum number of tokens (bucket capacity).
+                                This is also the burst size: the maximum number of requests that can be served
+                                instantaneously before the bucket is depleted.
+                              format: int32
+                              minimum: 1
+                              type: integer
+                            refillPeriod:
+                              description: |-
+                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                                The effective refill rate is maxTokens / refillPeriod tokens per second.
+                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                              type: string
+                          required:
+                          - maxTokens
+                          - refillPeriod
+                          type: object
+                      required:
+                      - name
+                      type: object
+                      x-kubernetes-validations:
+                      - message: at least one of global, shared, or perUser must be
+                          configured
+                        rule: has(self.global) || has(self.shared) || has(self.perUser)
+                    type: array
+                    x-kubernetes-list-map-keys:
+                    - name
+                    x-kubernetes-list-type: map
+                type: object
+                x-kubernetes-validations:
+                - message: at least one of global, shared, perUser, or tools must
+                    be configured
+                  rule: has(self.global) || has(self.shared) || has(self.perUser)
+                    || (has(self.tools) && size(self.tools) > 0)
               replicas:
                 description: |-
                   Replicas is the desired number of vMCP pod replicas.
@@ -4843,6 +5196,17 @@ spec:
             - groupRef
             - incomingAuth
             type: object
+            x-kubernetes-validations:
+            - message: rateLimiting requires sessionStorage with provider 'redis'
+              rule: '!has(self.rateLimiting) || (has(self.sessionStorage) && self.sessionStorage.provider
+                == ''redis'')'
+            - message: rateLimiting.perUser requires incomingAuth.type oidc
+              rule: '!(has(self.rateLimiting) && has(self.rateLimiting.perUser)) ||
+                (has(self.incomingAuth) && self.incomingAuth.type == ''oidc'')'
+            - message: per-tool perUser rate limiting requires incomingAuth.type oidc
+              rule: '!has(self.rateLimiting) || !has(self.rateLimiting.tools) || self.rateLimiting.tools.all(t,
+                !has(t.perUser)) || (has(self.incomingAuth) && self.incomingAuth.type
+                == ''oidc'')'
           status:
             description: VirtualMCPServerStatus defines the observed state of VirtualMCPServer
             properties:
diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpservers.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpservers.yaml
index a248508d41..8f07967a66 100644
--- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpservers.yaml
+++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpservers.yaml
@@ -323,6 +323,28 @@ spec:
                   RateLimiting defines rate limiting configuration for the MCP server.
                   Requires Redis session storage to be configured for distributed rate limiting.
                 properties:
+                  global:
+                    description: Global is a token bucket shared across all users
+                      for the entire server.
+                    properties:
+                      maxTokens:
+                        description: |-
+                          MaxTokens is the maximum number of tokens (bucket capacity).
+                          This is also the burst size: the maximum number of requests that can be served
+                          instantaneously before the bucket is depleted.
+                        format: int32
+                        minimum: 1
+                        type: integer
+                      refillPeriod:
+                        description: |-
+                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                          The effective refill rate is maxTokens / refillPeriod tokens per second.
+                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                        type: string
+                    required:
+                    - maxTokens
+                    - refillPeriod
+                    type: object
                   perUser:
                     description: |-
                       PerUser is a token bucket applied independently to each authenticated user
@@ -349,8 +371,8 @@ spec:
                     - refillPeriod
                     type: object
                   shared:
-                    description: Shared is a token bucket shared across all users
-                      for the entire server.
+                    description: Shared is a deprecated alias for Global. Use global
+                      instead.
                     properties:
                       maxTokens:
                         description: |-
@@ -378,8 +400,29 @@ spec:
                     items:
                       description: |-
                         ToolRateLimitConfig defines rate limits for a specific tool.
-                        At least one of shared or perUser must be configured.
+                        At least one of global, shared, or perUser must be configured.
                       properties:
+                        global:
+                          description: Global token bucket for this specific tool.
+                          properties:
+                            maxTokens:
+                              description: |-
+                                MaxTokens is the maximum number of tokens (bucket capacity).
+                                This is also the burst size: the maximum number of requests that can be served
+                                instantaneously before the bucket is depleted.
+                              format: int32
+                              minimum: 1
+                              type: integer
+                            refillPeriod:
+                              description: |-
+                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                                The effective refill rate is maxTokens / refillPeriod tokens per second.
+                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                              type: string
+                          required:
+                          - maxTokens
+                          - refillPeriod
+                          type: object
                         name:
                           description: Name is the MCP tool name this limit applies
                             to.
@@ -408,7 +451,8 @@ spec:
                           - refillPeriod
                           type: object
                         shared:
-                          description: Shared token bucket for this specific tool.
+                          description: Shared is a deprecated alias for Global. Use
+                            global instead.
                           properties:
                             maxTokens:
                               description: |-
@@ -432,17 +476,19 @@ spec:
                       - name
                       type: object
                       x-kubernetes-validations:
-                      - message: at least one of shared or perUser must be configured
-                        rule: has(self.shared) || has(self.perUser)
+                      - message: at least one of global, shared, or perUser must be
+                          configured
+                        rule: has(self.global) || has(self.shared) || has(self.perUser)
                     type: array
                     x-kubernetes-list-map-keys:
                     - name
                     x-kubernetes-list-type: map
                 type: object
                 x-kubernetes-validations:
-                - message: at least one of shared, perUser, or tools must be configured
-                  rule: has(self.shared) || has(self.perUser) || (has(self.tools)
-                    && size(self.tools) > 0)
+                - message: at least one of global, shared, perUser, or tools must
+                    be configured
+                  rule: has(self.global) || has(self.shared) || has(self.perUser)
+                    || (has(self.tools) && size(self.tools) > 0)
               replicas:
                 description: |-
                   Replicas is the desired number of proxy runner (thv run) pod replicas.
@@ -1172,6 +1218,28 @@ spec:
                   RateLimiting defines rate limiting configuration for the MCP server.
                   Requires Redis session storage to be configured for distributed rate limiting.
                 properties:
+                  global:
+                    description: Global is a token bucket shared across all users
+                      for the entire server.
+                    properties:
+                      maxTokens:
+                        description: |-
+                          MaxTokens is the maximum number of tokens (bucket capacity).
+                          This is also the burst size: the maximum number of requests that can be served
+                          instantaneously before the bucket is depleted.
+                        format: int32
+                        minimum: 1
+                        type: integer
+                      refillPeriod:
+                        description: |-
+                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                          The effective refill rate is maxTokens / refillPeriod tokens per second.
+                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                        type: string
+                    required:
+                    - maxTokens
+                    - refillPeriod
+                    type: object
                   perUser:
                     description: |-
                       PerUser is a token bucket applied independently to each authenticated user
@@ -1198,8 +1266,8 @@ spec:
                     - refillPeriod
                     type: object
                   shared:
-                    description: Shared is a token bucket shared across all users
-                      for the entire server.
+                    description: Shared is a deprecated alias for Global. Use global
+                      instead.
                     properties:
                       maxTokens:
                         description: |-
@@ -1227,8 +1295,29 @@ spec:
                     items:
                       description: |-
                         ToolRateLimitConfig defines rate limits for a specific tool.
-                        At least one of shared or perUser must be configured.
+                        At least one of global, shared, or perUser must be configured.
                       properties:
+                        global:
+                          description: Global token bucket for this specific tool.
+                          properties:
+                            maxTokens:
+                              description: |-
+                                MaxTokens is the maximum number of tokens (bucket capacity).
+                                This is also the burst size: the maximum number of requests that can be served
+                                instantaneously before the bucket is depleted.
+                              format: int32
+                              minimum: 1
+                              type: integer
+                            refillPeriod:
+                              description: |-
+                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                                The effective refill rate is maxTokens / refillPeriod tokens per second.
+                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                              type: string
+                          required:
+                          - maxTokens
+                          - refillPeriod
+                          type: object
                         name:
                           description: Name is the MCP tool name this limit applies
                             to.
@@ -1257,7 +1346,8 @@ spec:
                           - refillPeriod
                           type: object
                         shared:
-                          description: Shared token bucket for this specific tool.
+                          description: Shared is a deprecated alias for Global. Use
+                            global instead.
                           properties:
                             maxTokens:
                               description: |-
@@ -1281,17 +1371,19 @@ spec:
                       - name
                       type: object
                       x-kubernetes-validations:
-                      - message: at least one of shared or perUser must be configured
-                        rule: has(self.shared) || has(self.perUser)
+                      - message: at least one of global, shared, or perUser must be
+                          configured
+                        rule: has(self.global) || has(self.shared) || has(self.perUser)
                     type: array
                     x-kubernetes-list-map-keys:
                     - name
                     x-kubernetes-list-type: map
                 type: object
                 x-kubernetes-validations:
-                - message: at least one of shared, perUser, or tools must be configured
-                  rule: has(self.shared) || has(self.perUser) || (has(self.tools)
-                    && size(self.tools) > 0)
+                - message: at least one of global, shared, perUser, or tools must
+                    be configured
+                  rule: has(self.global) || has(self.shared) || has(self.perUser)
+                    || (has(self.tools) && size(self.tools) > 0)
               replicas:
                 description: |-
                   Replicas is the desired number of proxy runner (thv run) pod replicas.
diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml
index 6078670479..cc804a054c 100644
--- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml
+++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml
@@ -2245,6 +2245,177 @@ spec:
                   This field accepts a PodTemplateSpec object as JSON/YAML.
                 type: object
                 x-kubernetes-preserve-unknown-fields: true
+              rateLimiting:
+                description: |-
+                  RateLimiting defines rate limiting configuration for the Virtual MCP server.
+                  Requires Redis session storage to be configured for distributed rate limiting.
+                properties:
+                  global:
+                    description: Global is a token bucket shared across all users
+                      for the entire server.
+                    properties:
+                      maxTokens:
+                        description: |-
+                          MaxTokens is the maximum number of tokens (bucket capacity).
+                          This is also the burst size: the maximum number of requests that can be served
+                          instantaneously before the bucket is depleted.
+                        format: int32
+                        minimum: 1
+                        type: integer
+                      refillPeriod:
+                        description: |-
+                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                          The effective refill rate is maxTokens / refillPeriod tokens per second.
+                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                        type: string
+                    required:
+                    - maxTokens
+                    - refillPeriod
+                    type: object
+                  perUser:
+                    description: |-
+                      PerUser is a token bucket applied independently to each authenticated user
+                      at the server level. Requires authentication to be enabled.
+                      Each unique userID creates Redis keys that expire after 2x refillPeriod.
+                      Memory formula: unique_users_per_TTL_window * (1 + num_tools_with_per_user_limits) keys.
+                    properties:
+                      maxTokens:
+                        description: |-
+                          MaxTokens is the maximum number of tokens (bucket capacity).
+                          This is also the burst size: the maximum number of requests that can be served
+                          instantaneously before the bucket is depleted.
+                        format: int32
+                        minimum: 1
+                        type: integer
+                      refillPeriod:
+                        description: |-
+                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                          The effective refill rate is maxTokens / refillPeriod tokens per second.
+                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                        type: string
+                    required:
+                    - maxTokens
+                    - refillPeriod
+                    type: object
+                  shared:
+                    description: Shared is a deprecated alias for Global. Use global
+                      instead.
+                    properties:
+                      maxTokens:
+                        description: |-
+                          MaxTokens is the maximum number of tokens (bucket capacity).
+                          This is also the burst size: the maximum number of requests that can be served
+                          instantaneously before the bucket is depleted.
+                        format: int32
+                        minimum: 1
+                        type: integer
+                      refillPeriod:
+                        description: |-
+                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                          The effective refill rate is maxTokens / refillPeriod tokens per second.
+                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                        type: string
+                    required:
+                    - maxTokens
+                    - refillPeriod
+                    type: object
+                  tools:
+                    description: |-
+                      Tools defines per-tool rate limit overrides.
+                      Each entry applies additional rate limits to calls targeting a specific tool name.
+                      A request must pass both the server-level limit and the per-tool limit.
+                    items:
+                      description: |-
+                        ToolRateLimitConfig defines rate limits for a specific tool.
+                        At least one of global, shared, or perUser must be configured.
+                      properties:
+                        global:
+                          description: Global token bucket for this specific tool.
+                          properties:
+                            maxTokens:
+                              description: |-
+                                MaxTokens is the maximum number of tokens (bucket capacity).
+                                This is also the burst size: the maximum number of requests that can be served
+                                instantaneously before the bucket is depleted.
+                              format: int32
+                              minimum: 1
+                              type: integer
+                            refillPeriod:
+                              description: |-
+                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                                The effective refill rate is maxTokens / refillPeriod tokens per second.
+                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                              type: string
+                          required:
+                          - maxTokens
+                          - refillPeriod
+                          type: object
+                        name:
+                          description: Name is the MCP tool name this limit applies
+                            to.
+                          minLength: 1
+                          type: string
+                        perUser:
+                          description: PerUser token bucket configuration for this
+                            tool.
+                          properties:
+                            maxTokens:
+                              description: |-
+                                MaxTokens is the maximum number of tokens (bucket capacity).
+                                This is also the burst size: the maximum number of requests that can be served
+                                instantaneously before the bucket is depleted.
+                              format: int32
+                              minimum: 1
+                              type: integer
+                            refillPeriod:
+                              description: |-
+                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                                The effective refill rate is maxTokens / refillPeriod tokens per second.
+                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                              type: string
+                          required:
+                          - maxTokens
+                          - refillPeriod
+                          type: object
+                        shared:
+                          description: Shared is a deprecated alias for Global. Use
+                            global instead.
+                          properties:
+                            maxTokens:
+                              description: |-
+                                MaxTokens is the maximum number of tokens (bucket capacity).
+                                This is also the burst size: the maximum number of requests that can be served
+                                instantaneously before the bucket is depleted.
+                              format: int32
+                              minimum: 1
+                              type: integer
+                            refillPeriod:
+                              description: |-
+                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                                The effective refill rate is maxTokens / refillPeriod tokens per second.
+                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                              type: string
+                          required:
+                          - maxTokens
+                          - refillPeriod
+                          type: object
+                      required:
+                      - name
+                      type: object
+                      x-kubernetes-validations:
+                      - message: at least one of global, shared, or perUser must be
+                          configured
+                        rule: has(self.global) || has(self.shared) || has(self.perUser)
+                    type: array
+                    x-kubernetes-list-map-keys:
+                    - name
+                    x-kubernetes-list-type: map
+                type: object
+                x-kubernetes-validations:
+                - message: at least one of global, shared, perUser, or tools must
+                    be configured
+                  rule: has(self.global) || has(self.shared) || has(self.perUser)
+                    || (has(self.tools) && size(self.tools) > 0)
               replicas:
                 description: |-
                   Replicas is the desired number of vMCP pod replicas.
@@ -2350,6 +2521,17 @@ spec:
             - groupRef
             - incomingAuth
             type: object
+            x-kubernetes-validations:
+            - message: rateLimiting requires sessionStorage with provider 'redis'
+              rule: '!has(self.rateLimiting) || (has(self.sessionStorage) && self.sessionStorage.provider
+                == ''redis'')'
+            - message: rateLimiting.perUser requires incomingAuth.type oidc
+              rule: '!(has(self.rateLimiting) && has(self.rateLimiting.perUser)) ||
+                (has(self.incomingAuth) && self.incomingAuth.type == ''oidc'')'
+            - message: per-tool perUser rate limiting requires incomingAuth.type oidc
+              rule: '!has(self.rateLimiting) || !has(self.rateLimiting.tools) || self.rateLimiting.tools.all(t,
+                !has(t.perUser)) || (has(self.incomingAuth) && self.incomingAuth.type
+                == ''oidc'')'
           status:
             description: VirtualMCPServerStatus defines the observed state of VirtualMCPServer
             properties:
@@ -4741,6 +4923,177 @@ spec:
                   This field accepts a PodTemplateSpec object as JSON/YAML.
                 type: object
                 x-kubernetes-preserve-unknown-fields: true
+              rateLimiting:
+                description: |-
+                  RateLimiting defines rate limiting configuration for the Virtual MCP server.
+                  Requires Redis session storage to be configured for distributed rate limiting.
+                properties:
+                  global:
+                    description: Global is a token bucket shared across all users
+                      for the entire server.
+                    properties:
+                      maxTokens:
+                        description: |-
+                          MaxTokens is the maximum number of tokens (bucket capacity).
+                          This is also the burst size: the maximum number of requests that can be served
+                          instantaneously before the bucket is depleted.
+                        format: int32
+                        minimum: 1
+                        type: integer
+                      refillPeriod:
+                        description: |-
+                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                          The effective refill rate is maxTokens / refillPeriod tokens per second.
+                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                        type: string
+                    required:
+                    - maxTokens
+                    - refillPeriod
+                    type: object
+                  perUser:
+                    description: |-
+                      PerUser is a token bucket applied independently to each authenticated user
+                      at the server level. Requires authentication to be enabled.
+                      Each unique userID creates Redis keys that expire after 2x refillPeriod.
+                      Memory formula: unique_users_per_TTL_window * (1 + num_tools_with_per_user_limits) keys.
+                    properties:
+                      maxTokens:
+                        description: |-
+                          MaxTokens is the maximum number of tokens (bucket capacity).
+                          This is also the burst size: the maximum number of requests that can be served
+                          instantaneously before the bucket is depleted.
+                        format: int32
+                        minimum: 1
+                        type: integer
+                      refillPeriod:
+                        description: |-
+                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                          The effective refill rate is maxTokens / refillPeriod tokens per second.
+                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                        type: string
+                    required:
+                    - maxTokens
+                    - refillPeriod
+                    type: object
+                  shared:
+                    description: Shared is a deprecated alias for Global. Use global
+                      instead.
+                    properties:
+                      maxTokens:
+                        description: |-
+                          MaxTokens is the maximum number of tokens (bucket capacity).
+                          This is also the burst size: the maximum number of requests that can be served
+                          instantaneously before the bucket is depleted.
+                        format: int32
+                        minimum: 1
+                        type: integer
+                      refillPeriod:
+                        description: |-
+                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                          The effective refill rate is maxTokens / refillPeriod tokens per second.
+                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                        type: string
+                    required:
+                    - maxTokens
+                    - refillPeriod
+                    type: object
+                  tools:
+                    description: |-
+                      Tools defines per-tool rate limit overrides.
+                      Each entry applies additional rate limits to calls targeting a specific tool name.
+                      A request must pass both the server-level limit and the per-tool limit.
+                    items:
+                      description: |-
+                        ToolRateLimitConfig defines rate limits for a specific tool.
+                        At least one of global, shared, or perUser must be configured.
+                      properties:
+                        global:
+                          description: Global token bucket for this specific tool.
+                          properties:
+                            maxTokens:
+                              description: |-
+                                MaxTokens is the maximum number of tokens (bucket capacity).
+                                This is also the burst size: the maximum number of requests that can be served
+                                instantaneously before the bucket is depleted.
+                              format: int32
+                              minimum: 1
+                              type: integer
+                            refillPeriod:
+                              description: |-
+                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                                The effective refill rate is maxTokens / refillPeriod tokens per second.
+                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                              type: string
+                          required:
+                          - maxTokens
+                          - refillPeriod
+                          type: object
+                        name:
+                          description: Name is the MCP tool name this limit applies
+                            to.
+                          minLength: 1
+                          type: string
+                        perUser:
+                          description: PerUser token bucket configuration for this
+                            tool.
+                          properties:
+                            maxTokens:
+                              description: |-
+                                MaxTokens is the maximum number of tokens (bucket capacity).
+                                This is also the burst size: the maximum number of requests that can be served
+                                instantaneously before the bucket is depleted.
+                              format: int32
+                              minimum: 1
+                              type: integer
+                            refillPeriod:
+                              description: |-
+                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                                The effective refill rate is maxTokens / refillPeriod tokens per second.
+                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                              type: string
+                          required:
+                          - maxTokens
+                          - refillPeriod
+                          type: object
+                        shared:
+                          description: Shared is a deprecated alias for Global. Use
+                            global instead.
+                          properties:
+                            maxTokens:
+                              description: |-
+                                MaxTokens is the maximum number of tokens (bucket capacity).
+                                This is also the burst size: the maximum number of requests that can be served
+                                instantaneously before the bucket is depleted.
+                              format: int32
+                              minimum: 1
+                              type: integer
+                            refillPeriod:
+                              description: |-
+                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
+                                The effective refill rate is maxTokens / refillPeriod tokens per second.
+                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
+                              type: string
+                          required:
+                          - maxTokens
+                          - refillPeriod
+                          type: object
+                      required:
+                      - name
+                      type: object
+                      x-kubernetes-validations:
+                      - message: at least one of global, shared, or perUser must be
+                          configured
+                        rule: has(self.global) || has(self.shared) || has(self.perUser)
+                    type: array
+                    x-kubernetes-list-map-keys:
+                    - name
+                    x-kubernetes-list-type: map
+                type: object
+                x-kubernetes-validations:
+                - message: at least one of global, shared, perUser, or tools must
+                    be configured
+                  rule: has(self.global) || has(self.shared) || has(self.perUser)
+                    || (has(self.tools) && size(self.tools) > 0)
               replicas:
                 description: |-
                   Replicas is the desired number of vMCP pod replicas.
@@ -4846,6 +5199,17 @@ spec:
             - groupRef
             - incomingAuth
             type: object
+            x-kubernetes-validations:
+            - message: rateLimiting requires sessionStorage with provider 'redis'
+              rule: '!has(self.rateLimiting) || (has(self.sessionStorage) && self.sessionStorage.provider
+                == ''redis'')'
+            - message: rateLimiting.perUser requires incomingAuth.type oidc
+              rule: '!(has(self.rateLimiting) && has(self.rateLimiting.perUser)) ||
+                (has(self.incomingAuth) && self.incomingAuth.type == ''oidc'')'
+            - message: per-tool perUser rate limiting requires incomingAuth.type oidc
+              rule: '!has(self.rateLimiting) || !has(self.rateLimiting.tools) || self.rateLimiting.tools.all(t,
+                !has(t.perUser)) || (has(self.incomingAuth) && self.incomingAuth.type
+                == ''oidc'')'
           status:
             description: VirtualMCPServerStatus defines the observed state of VirtualMCPServer
             properties:
diff --git a/docs/operator/crd-api.md b/docs/operator/crd-api.md
index b6e0c50cda..d7313e9135 100644
--- a/docs/operator/crd-api.md
+++ b/docs/operator/crd-api.md
@@ -233,6 +233,25 @@ _Appears in:_
 | `primaryUpstreamProvider` _string_ | PrimaryUpstreamProvider names the upstream IDP provider whose access<br />token should be used as the source of JWT claims for Cedar evaluation.<br />When empty, claims from the ToolHive-issued token are used.<br />Must match an upstream provider name configured in the embedded auth server<br />(e.g. "default", "github"). Only relevant when the embedded auth server is active. |  | Optional: \{\} <br /> |
 
 
+#### ratelimit.config.Bucket
+
+
+
+Bucket defines a token bucket configuration with a maximum capacity and a
+refill period.
+
+
+
+_Appears in:_
+- [ratelimit.config.Config](#ratelimitconfigconfig)
+- [ratelimit.config.ToolConfig](#ratelimitconfigtoolconfig)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `maxTokens` _integer_ | MaxTokens is the maximum number of tokens. |  |  |
+| `refillPeriod` _[Duration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#duration-v1-meta)_ | RefillPeriod is the duration to fully refill the bucket. |  |  |
+
+
 #### vmcp.config.CircuitBreakerConfig
 
 
@@ -640,6 +659,25 @@ _Appears in:_
 
 
 
+#### ratelimit.config.ToolConfig
+
+
+
+ToolConfig defines rate limits for a specific tool.
+
+
+
+_Appears in:_
+- [ratelimit.config.Config](#ratelimitconfigconfig)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `name` _string_ | Name is the MCP tool name this limit applies to. |  |  |
+| `global` _[ratelimit.config.Bucket](#ratelimitconfigbucket)_ | Global is a token bucket shared across all users for this tool. |  |  |
+| `shared` _[ratelimit.config.Bucket](#ratelimitconfigbucket)_ | Shared is a deprecated alias for Global. |  |  |
+| `perUser` _[ratelimit.config.Bucket](#ratelimitconfigbucket)_ | PerUser is a token bucket applied independently to each authenticated user<br />for this tool. |  |  |
+
+
 #### vmcp.config.ToolConfigRef
 
 
@@ -2712,10 +2750,12 @@ At least one of shared, perUser, or tools must be configured.
 
 _Appears in:_
 - [api.v1beta1.MCPServerSpec](#apiv1beta1mcpserverspec)
+- [api.v1beta1.VirtualMCPServerSpec](#apiv1beta1virtualmcpserverspec)
 
 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
-| `shared` _[api.v1beta1.RateLimitBucket](#apiv1beta1ratelimitbucket)_ | Shared is a token bucket shared across all users for the entire server. |  | Optional: \{\} <br /> |
+| `global` _[api.v1beta1.RateLimitBucket](#apiv1beta1ratelimitbucket)_ | Global is a token bucket shared across all users for the entire server. |  | Optional: \{\} <br /> |
+| `shared` _[api.v1beta1.RateLimitBucket](#apiv1beta1ratelimitbucket)_ | Shared is a deprecated alias for Global. Use global instead. |  | Optional: \{\} <br /> |
 | `perUser` _[api.v1beta1.RateLimitBucket](#apiv1beta1ratelimitbucket)_ | PerUser is a token bucket applied independently to each authenticated user<br />at the server level. Requires authentication to be enabled.<br />Each unique userID creates Redis keys that expire after 2x refillPeriod.<br />Memory formula: unique_users_per_TTL_window * (1 + num_tools_with_per_user_limits) keys. |  | Optional: \{\} <br /> |
 | `tools` _[api.v1beta1.ToolRateLimitConfig](#apiv1beta1toolratelimitconfig) array_ | Tools defines per-tool rate limit overrides.<br />Each entry applies additional rate limits to calls targeting a specific tool name.<br />A request must pass both the server-level limit and the per-tool limit. |  | Optional: \{\} <br /> |
 
@@ -3130,7 +3170,7 @@ _Appears in:_
 
 
 ToolRateLimitConfig defines rate limits for a specific tool.
-At least one of shared or perUser must be configured.
+At least one of global, shared, or perUser must be configured.
 
 
 
@@ -3140,7 +3180,8 @@ _Appears in:_
 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
 | `name` _string_ | Name is the MCP tool name this limit applies to. |  | MinLength: 1 <br />Required: \{\} <br /> |
-| `shared` _[api.v1beta1.RateLimitBucket](#apiv1beta1ratelimitbucket)_ | Shared token bucket for this specific tool. |  | Optional: \{\} <br /> |
+| `global` _[api.v1beta1.RateLimitBucket](#apiv1beta1ratelimitbucket)_ | Global token bucket for this specific tool. |  | Optional: \{\} <br /> |
+| `shared` _[api.v1beta1.RateLimitBucket](#apiv1beta1ratelimitbucket)_ | Shared is a deprecated alias for Global. Use global instead. |  | Optional: \{\} <br /> |
 | `perUser` _[api.v1beta1.RateLimitBucket](#apiv1beta1ratelimitbucket)_ | PerUser token bucket configuration for this tool. |  | Optional: \{\} <br /> |
 
 
@@ -3439,6 +3480,7 @@ _Appears in:_
 | `replicas` _integer_ | Replicas is the desired number of vMCP pod replicas.<br />VirtualMCPServer creates a single Deployment for the vMCP aggregator process,<br />so there is only one replicas field (unlike MCPServer which has separate<br />Replicas and BackendReplicas for its two Deployments).<br />When nil, the operator does not set Deployment.Spec.Replicas, leaving replica<br />management to an HPA or other external controller. |  | Minimum: 0 <br />Optional: \{\} <br /> |
 | `sessionStorage` _[api.v1beta1.SessionStorageConfig](#apiv1beta1sessionstorageconfig)_ | SessionStorage configures session storage for stateful horizontal scaling.<br />When nil, no session storage is configured. |  | Optional: \{\} <br /> |
 | `imagePullSecrets` _[LocalObjectReference](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#localobjectreference-v1-core) array_ | ImagePullSecrets allows specifying image pull secrets for the vMCP workload.<br />These are applied to both the vMCP Deployment's PodSpec.ImagePullSecrets<br />and to the operator-managed ServiceAccount the vMCP server runs as, so private<br />images are pullable through either path.<br />Merge semantics with PodTemplateSpec:<br />The deployed PodSpec.ImagePullSecrets is the Kubernetes-native strategic-merge<br />union of this field and spec.podTemplateSpec.spec.imagePullSecrets, merged by<br />the patchStrategy:"merge" / patchMergeKey:"name" tags on corev1.PodSpec.<br />  - This field is rendered first as the controller-generated default.<br />  - spec.podTemplateSpec.spec.imagePullSecrets is then strategic-merge-patched<br />    on top, keyed by Name. Distinct names from the two sources are unioned in<br />    the resulting list; entries with the same Name are deduplicated and the<br />    PodTemplateSpec entry wins on overlap (user override).<br />  - Order in the resulting list is not guaranteed and should not be relied on:<br />    strategic merge by name is order-insensitive.<br />  - The operator-managed ServiceAccount's imagePullSecrets list is populated<br />    ONLY from this field. spec.podTemplateSpec.spec.imagePullSecrets does not<br />    reach the ServiceAccount because PodTemplateSpec has no notion of a<br />    ServiceAccount. To make a secret usable via the ServiceAccount path<br />    (e.g. for sidecars or init containers that pull images independently),<br />    list it here rather than under spec.podTemplateSpec.<br />Note on cross-CRD consistency:<br />MCPRegistry currently uses an atomic-replace strategy for its imagePullSecrets<br />(the user-provided value replaces the controller-generated list rather than<br />being merged on top). VirtualMCPServer follows the Kubernetes-native<br />strategic-merge-by-name behavior described above. Aligning the two is tracked<br />as a separate follow-up; until then, manifests that set imagePullSecrets on<br />both CRDs will see different override behavior between them. |  | Optional: \{\} <br /> |
+| `rateLimiting` _[api.v1beta1.RateLimitConfig](#apiv1beta1ratelimitconfig)_ | RateLimiting defines rate limiting configuration for the Virtual MCP server.<br />Requires Redis session storage to be configured for distributed rate limiting. |  | Optional: \{\} <br /> |
 
 
 #### api.v1beta1.VirtualMCPServerStatus
diff --git a/docs/server/docs.go b/docs/server/docs.go
index a108f6087f..bad149a9d7 100644
--- a/docs/server/docs.go
+++ b/docs/server/docs.go
@@ -60,6 +60,9 @@ const docTemplate = `{
             "github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.RateLimitConfig": {
                 "description": "RateLimitConfig contains the CRD rate limiting configuration.\nWhen set, rate limiting middleware is added to the proxy middleware chain.",
                 "properties": {
+                    "global": {
+                        "$ref": "#/components/schemas/github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.RateLimitBucket"
+                    },
                     "perUser": {
                         "$ref": "#/components/schemas/github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.RateLimitBucket"
                     },
@@ -79,6 +82,9 @@ const docTemplate = `{
             },
             "github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.ToolRateLimitConfig": {
                 "properties": {
+                    "global": {
+                        "$ref": "#/components/schemas/github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.RateLimitBucket"
+                    },
                     "name": {
                         "description": "Name is the MCP tool name this limit applies to.\n+kubebuilder:validation:Required\n+kubebuilder:validation:MinLength=1",
                         "type": "string"
diff --git a/docs/server/swagger.json b/docs/server/swagger.json
index 8a5d8df9b5..6269efbf3d 100644
--- a/docs/server/swagger.json
+++ b/docs/server/swagger.json
@@ -53,6 +53,9 @@
             "github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.RateLimitConfig": {
                 "description": "RateLimitConfig contains the CRD rate limiting configuration.\nWhen set, rate limiting middleware is added to the proxy middleware chain.",
                 "properties": {
+                    "global": {
+                        "$ref": "#/components/schemas/github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.RateLimitBucket"
+                    },
                     "perUser": {
                         "$ref": "#/components/schemas/github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.RateLimitBucket"
                     },
@@ -72,6 +75,9 @@
             },
             "github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.ToolRateLimitConfig": {
                 "properties": {
+                    "global": {
+                        "$ref": "#/components/schemas/github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.RateLimitBucket"
+                    },
                     "name": {
                         "description": "Name is the MCP tool name this limit applies to.\n+kubebuilder:validation:Required\n+kubebuilder:validation:MinLength=1",
                         "type": "string"
diff --git a/docs/server/swagger.yaml b/docs/server/swagger.yaml
index bdc1cac8a0..2799158f5c 100644
--- a/docs/server/swagger.yaml
+++ b/docs/server/swagger.yaml
@@ -52,6 +52,8 @@ components:
         RateLimitConfig contains the CRD rate limiting configuration.
         When set, rate limiting middleware is added to the proxy middleware chain.
       properties:
+        global:
+          $ref: '#/components/schemas/github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.RateLimitBucket'
         perUser:
           $ref: '#/components/schemas/github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.RateLimitBucket'
         shared:
@@ -71,6 +73,8 @@ components:
       type: object
     github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.ToolRateLimitConfig:
       properties:
+        global:
+          $ref: '#/components/schemas/github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.RateLimitBucket'
         name:
           description: |-
             Name is the MCP tool name this limit applies to.
diff --git a/pkg/ratelimit/config/config.go b/pkg/ratelimit/config/config.go
new file mode 100644
index 0000000000..38aaa01780
--- /dev/null
+++ b/pkg/ratelimit/config/config.go
@@ -0,0 +1,147 @@
+// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+// Package config defines runtime-neutral rate limiting configuration.
+package config
+
+import metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+// Config defines rate limiting configuration for an MCP server.
+// +gendoc
+type Config struct {
+	// Global is a token bucket shared across all users for the entire server.
+	Global *Bucket `json:"global,omitempty" yaml:"global,omitempty"`
+
+	// Shared is a deprecated alias for Global. It is kept for compatibility with
+	// older CRDs and config files that used "shared" before the RFC settled on
+	// "global".
+	Shared *Bucket `json:"shared,omitempty" yaml:"shared,omitempty"`
+
+	// PerUser is a token bucket applied independently to each authenticated user
+	// at the server level.
+	PerUser *Bucket `json:"perUser,omitempty" yaml:"perUser,omitempty"`
+
+	// Tools defines per-tool rate limits.
+	Tools []ToolConfig `json:"tools,omitempty" yaml:"tools,omitempty"`
+}
+
+// EffectiveGlobal returns the configured server-level global bucket, including
+// the deprecated shared alias.
+func (c *Config) EffectiveGlobal() *Bucket {
+	if c == nil {
+		return nil
+	}
+	if c.Global != nil {
+		return c.Global
+	}
+	return c.Shared
+}
+
+// Bucket defines a token bucket configuration with a maximum capacity and a
+// refill period.
+// +gendoc
+type Bucket struct {
+	// MaxTokens is the maximum number of tokens.
+	MaxTokens int32 `json:"maxTokens" yaml:"maxTokens"`
+
+	// RefillPeriod is the duration to fully refill the bucket.
+	RefillPeriod metav1.Duration `json:"refillPeriod" yaml:"refillPeriod"`
+}
+
+// ToolConfig defines rate limits for a specific tool.
+// +gendoc
+type ToolConfig struct {
+	// Name is the MCP tool name this limit applies to.
+	Name string `json:"name" yaml:"name"`
+
+	// Global is a token bucket shared across all users for this tool.
+	Global *Bucket `json:"global,omitempty" yaml:"global,omitempty"`
+
+	// Shared is a deprecated alias for Global.
+	Shared *Bucket `json:"shared,omitempty" yaml:"shared,omitempty"`
+
+	// PerUser is a token bucket applied independently to each authenticated user
+	// for this tool.
+	PerUser *Bucket `json:"perUser,omitempty" yaml:"perUser,omitempty"`
+}
+
+// EffectiveGlobal returns the configured tool-level global bucket, including
+// the deprecated shared alias.
+func (c *ToolConfig) EffectiveGlobal() *Bucket {
+	if c == nil {
+		return nil
+	}
+	if c.Global != nil {
+		return c.Global
+	}
+	return c.Shared
+}
+
+// DeepCopyInto copies the receiver into out.
+func (in *Config) DeepCopyInto(out *Config) {
+	*out = *in
+	if in.Global != nil {
+		out.Global = in.Global.DeepCopy()
+	}
+	if in.Shared != nil {
+		out.Shared = in.Shared.DeepCopy()
+	}
+	if in.PerUser != nil {
+		out.PerUser = in.PerUser.DeepCopy()
+	}
+	if in.Tools != nil {
+		out.Tools = make([]ToolConfig, len(in.Tools))
+		for i := range in.Tools {
+			in.Tools[i].DeepCopyInto(&out.Tools[i])
+		}
+	}
+}
+
+// DeepCopy returns a copy of the receiver.
+func (in *Config) DeepCopy() *Config {
+	if in == nil {
+		return nil
+	}
+	out := new(Config)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto copies the receiver into out.
+func (in *Bucket) DeepCopyInto(out *Bucket) {
+	*out = *in
+}
+
+// DeepCopy returns a copy of the receiver.
+func (in *Bucket) DeepCopy() *Bucket {
+	if in == nil {
+		return nil
+	}
+	out := new(Bucket)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto copies the receiver into out.
+func (in *ToolConfig) DeepCopyInto(out *ToolConfig) {
+	*out = *in
+	if in.Global != nil {
+		out.Global = in.Global.DeepCopy()
+	}
+	if in.Shared != nil {
+		out.Shared = in.Shared.DeepCopy()
+	}
+	if in.PerUser != nil {
+		out.PerUser = in.PerUser.DeepCopy()
+	}
+}
+
+// DeepCopy returns a copy of the receiver.
+func (in *ToolConfig) DeepCopy() *ToolConfig {
+	if in == nil {
+		return nil
+	}
+	out := new(ToolConfig)
+	in.DeepCopyInto(out)
+	return out
+}
diff --git a/pkg/ratelimit/config/doc.go b/pkg/ratelimit/config/doc.go
new file mode 100644
index 0000000000..ef361de965
--- /dev/null
+++ b/pkg/ratelimit/config/doc.go
@@ -0,0 +1,8 @@
+// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+// Package config defines runtime-neutral rate limiting configuration.
+//
+// +groupName=toolhive.stacklok.dev
+// +versionName=config
+package config
diff --git a/pkg/ratelimit/limiter.go b/pkg/ratelimit/limiter.go
index 3490c1cbaa..4a5e654912 100644
--- a/pkg/ratelimit/limiter.go
+++ b/pkg/ratelimit/limiter.go
@@ -14,7 +14,7 @@ import (
 
 	"github.com/redis/go-redis/v9"
 
-	v1beta1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1beta1"
+	rlconfig "github.com/stacklok/toolhive/pkg/ratelimit/config"
 	"github.com/stacklok/toolhive/pkg/ratelimit/internal/bucket"
 )
 
@@ -39,17 +39,17 @@ type Decision struct {
 // NewLimiter constructs a Limiter from CRD configuration.
 // Returns a no-op limiter (always allows) when crd is nil.
 // namespace and name identify the MCP server for Redis key derivation.
-func NewLimiter(client redis.Cmdable, namespace, name string, crd *v1beta1.RateLimitConfig) (Limiter, error) {
+func NewLimiter(client redis.Cmdable, namespace, name string, crd *rlconfig.Config) (Limiter, error) {
 	if crd == nil {
 		return noopLimiter{}, nil
 	}
 
 	l := &limiter{client: client}
 
-	if crd.Shared != nil {
-		b, err := newBucket(namespace, name, "shared", crd.Shared)
+	if global, suffix := serverGlobalBucket(crd); global != nil {
+		b, err := newBucket(namespace, name, suffix, global)
 		if err != nil {
-			return nil, fmt.Errorf("shared bucket: %w", err)
+			return nil, fmt.Errorf("%s bucket: %w", suffix, err)
 		}
 		l.serverBucket = b
 	}
@@ -63,10 +63,10 @@ func NewLimiter(client redis.Cmdable, namespace, name string, crd *v1beta1.RateL
 	}
 
 	for _, t := range crd.Tools {
-		if t.Shared != nil {
-			b, err := newBucket(namespace, name, "shared:tool:"+t.Name, t.Shared)
+		if global, suffix := toolGlobalBucket(&t); global != nil {
+			b, err := newBucket(namespace, name, suffix+":tool:"+t.Name, global)
 			if err != nil {
-				return nil, fmt.Errorf("tool %q shared bucket: %w", t.Name, err)
+				return nil, fmt.Errorf("tool %q %s bucket: %w", t.Name, suffix, err)
 			}
 			if l.toolBuckets == nil {
 				l.toolBuckets = make(map[string]*bucket.TokenBucket)
@@ -88,6 +88,26 @@ func NewLimiter(client redis.Cmdable, namespace, name string, crd *v1beta1.RateL
 	return l, nil
 }
 
+func serverGlobalBucket(crd *rlconfig.Config) (*rlconfig.Bucket, string) {
+	if crd.Global != nil {
+		return crd.Global, "global"
+	}
+	if crd.Shared != nil {
+		return crd.Shared, "shared"
+	}
+	return nil, ""
+}
+
+func toolGlobalBucket(tool *rlconfig.ToolConfig) (*rlconfig.Bucket, string) {
+	if tool.Global != nil {
+		return tool.Global, "global"
+	}
+	if tool.Shared != nil {
+		return tool.Shared, "shared"
+	}
+	return nil, ""
+}
+
 // bucketSpec holds deferred bucket parameters for per-user buckets that are
 // created on the fly in Allow() because the userID is not known at construction time.
 type bucketSpec struct {
@@ -177,7 +197,7 @@ func (noopLimiter) Allow(context.Context, string, string) (*Decision, error) {
 }
 
 // validateBucketCRD checks that a CRD bucket spec has valid parameters.
-func validateBucketCRD(b *v1beta1.RateLimitBucket) (int32, time.Duration, error) {
+func validateBucketCRD(b *rlconfig.Bucket) (int32, time.Duration, error) {
 	if b.MaxTokens < 1 {
 		return 0, 0, fmt.Errorf("maxTokens must be >= 1, got %d", b.MaxTokens)
 	}
@@ -189,7 +209,7 @@ func validateBucketCRD(b *v1beta1.RateLimitBucket) (int32, time.Duration, error)
 }
 
 // newBucket validates a CRD bucket spec and creates a TokenBucket.
-func newBucket(namespace, serverName, suffix string, b *v1beta1.RateLimitBucket) (*bucket.TokenBucket, error) {
+func newBucket(namespace, serverName, suffix string, b *rlconfig.Bucket) (*bucket.TokenBucket, error) {
 	maxTokens, refillPeriod, err := validateBucketCRD(b)
 	if err != nil {
 		return nil, err
@@ -199,7 +219,7 @@ func newBucket(namespace, serverName, suffix string, b *v1beta1.RateLimitBucket)
 
 // newBucketSpec validates a CRD bucket spec and creates a deferred bucketSpec
 // for per-user buckets that are materialized at Allow() time.
-func newBucketSpec(namespace, serverName string, b *v1beta1.RateLimitBucket) (bucketSpec, error) {
+func newBucketSpec(namespace, serverName string, b *rlconfig.Bucket) (bucketSpec, error) {
 	maxTokens, refillPeriod, err := validateBucketCRD(b)
 	if err != nil {
 		return bucketSpec{}, err
diff --git a/pkg/ratelimit/limiter_test.go b/pkg/ratelimit/limiter_test.go
index a007191d3c..e5e921aea8 100644
--- a/pkg/ratelimit/limiter_test.go
+++ b/pkg/ratelimit/limiter_test.go
@@ -49,7 +49,7 @@ func TestNewLimiter_ZeroMaxTokens(t *testing.T) {
 		},
 	}
 
-	_, err := NewLimiter(client, "ns", "srv", crd)
+	_, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
 	assert.Error(t, err)
 	assert.Contains(t, err.Error(), "maxTokens must be >= 1")
 }
@@ -65,7 +65,7 @@ func TestNewLimiter_ZeroDuration(t *testing.T) {
 		},
 	}
 
-	_, err := NewLimiter(client, "ns", "srv", crd)
+	_, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
 	assert.Error(t, err)
 	assert.Contains(t, err.Error(), "refillPeriod must be positive")
 }
@@ -78,7 +78,7 @@ func TestLimiter_ServerGlobalExhausted(t *testing.T) {
 	crd := &v1beta1.RateLimitConfig{
 		Shared: &v1beta1.RateLimitBucket{MaxTokens: 2, RefillPeriod: metav1.Duration{Duration: time.Minute}},
 	}
-	l, err := NewLimiter(client, "ns", "srv", crd)
+	l, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
 	require.NoError(t, err)
 
 	for range 2 {
@@ -93,6 +93,70 @@ func TestLimiter_ServerGlobalExhausted(t *testing.T) {
 	assert.Greater(t, d.RetryAfter, time.Duration(0))
 }
 
+func TestLimiter_SharedAliasUsesLegacyRedisKeys(t *testing.T) {
+	t.Parallel()
+	client, _ := newTestClient(t)
+	ctx := t.Context()
+
+	crd := &v1beta1.RateLimitConfig{
+		Shared: &v1beta1.RateLimitBucket{MaxTokens: 10, RefillPeriod: metav1.Duration{Duration: time.Minute}},
+		Tools: []v1beta1.ToolRateLimitConfig{
+			{
+				Name:   "search",
+				Shared: &v1beta1.RateLimitBucket{MaxTokens: 10, RefillPeriod: metav1.Duration{Duration: time.Minute}},
+			},
+		},
+	}
+	l, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
+	require.NoError(t, err)
+
+	d, err := l.Allow(ctx, "search", "")
+	require.NoError(t, err)
+	require.True(t, d.Allowed)
+
+	legacyServerKey := "thv:rl:{ns:srv}:shared"
+	legacyToolKey := "thv:rl:{ns:srv}:shared:tool:search"
+	newServerKey := "thv:rl:{ns:srv}:global"
+	newToolKey := "thv:rl:{ns:srv}:global:tool:search"
+
+	exists, err := client.Exists(ctx, legacyServerKey, legacyToolKey).Result()
+	require.NoError(t, err)
+	assert.Equal(t, int64(2), exists)
+
+	exists, err = client.Exists(ctx, newServerKey, newToolKey).Result()
+	require.NoError(t, err)
+	assert.Equal(t, int64(0), exists)
+}
+
+func TestLimiter_GlobalUsesGlobalRedisKeys(t *testing.T) {
+	t.Parallel()
+	client, _ := newTestClient(t)
+	ctx := t.Context()
+
+	crd := &v1beta1.RateLimitConfig{
+		Global: &v1beta1.RateLimitBucket{MaxTokens: 10, RefillPeriod: metav1.Duration{Duration: time.Minute}},
+		Tools: []v1beta1.ToolRateLimitConfig{
+			{
+				Name:   "search",
+				Global: &v1beta1.RateLimitBucket{MaxTokens: 10, RefillPeriod: metav1.Duration{Duration: time.Minute}},
+			},
+		},
+	}
+	l, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
+	require.NoError(t, err)
+
+	d, err := l.Allow(ctx, "search", "")
+	require.NoError(t, err)
+	require.True(t, d.Allowed)
+
+	exists, err := client.Exists(ctx,
+		"thv:rl:{ns:srv}:global",
+		"thv:rl:{ns:srv}:global:tool:search",
+	).Result()
+	require.NoError(t, err)
+	assert.Equal(t, int64(2), exists)
+}
+
 func TestLimiter_PerToolIsolation(t *testing.T) {
 	t.Parallel()
 	client, _ := newTestClient(t)
@@ -106,7 +170,7 @@ func TestLimiter_PerToolIsolation(t *testing.T) {
 			},
 		},
 	}
-	l, err := NewLimiter(client, "ns", "srv", crd)
+	l, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
 	require.NoError(t, err)
 
 	d, err := l.Allow(ctx, "search", "")
@@ -137,7 +201,7 @@ func TestLimiter_ServerAndPerToolBothRequired(t *testing.T) {
 			},
 		},
 	}
-	l, err := NewLimiter(client, "ns", "srv", crd)
+	l, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
 	require.NoError(t, err)
 
 	for range 2 {
@@ -164,7 +228,7 @@ func TestLimiter_RedisUnavailableReturnsError(t *testing.T) {
 	crd := &v1beta1.RateLimitConfig{
 		Shared: &v1beta1.RateLimitBucket{MaxTokens: 10, RefillPeriod: metav1.Duration{Duration: time.Minute}},
 	}
-	l, err := NewLimiter(client, "ns", "srv", crd)
+	l, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
 	require.NoError(t, err)
 
 	mr.Close()
@@ -181,7 +245,7 @@ func TestLimiter_PerUserServerLevel(t *testing.T) {
 	crd := &v1beta1.RateLimitConfig{
 		PerUser: &v1beta1.RateLimitBucket{MaxTokens: 2, RefillPeriod: metav1.Duration{Duration: time.Minute}},
 	}
-	l, err := NewLimiter(client, "ns", "srv", crd)
+	l, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
 	require.NoError(t, err)
 
 	// User A exhausts their 2 tokens.
@@ -201,6 +265,28 @@ func TestLimiter_PerUserServerLevel(t *testing.T) {
 	assert.True(t, d.Allowed)
 }
 
+func TestLimiter_PerUserServerLevelSharedAcrossBackendTools(t *testing.T) {
+	t.Parallel()
+	client, _ := newTestClient(t)
+	ctx := t.Context()
+
+	crd := &v1beta1.RateLimitConfig{
+		PerUser: &v1beta1.RateLimitBucket{MaxTokens: 2, RefillPeriod: metav1.Duration{Duration: time.Minute}},
+	}
+	l, err := NewLimiter(client, "ns", "vmcp", crd.ToInternal())
+	require.NoError(t, err)
+
+	for _, toolName := range []string{"backend_a_echo", "backend_b_search"} {
+		d, allowErr := l.Allow(ctx, toolName, "user-a")
+		require.NoError(t, allowErr)
+		require.True(t, d.Allowed)
+	}
+
+	d, err := l.Allow(ctx, "backend_c_status", "user-a")
+	require.NoError(t, err)
+	assert.False(t, d.Allowed, "server-level per-user limit must be shared across all backend tools")
+}
+
 func TestLimiter_PerToolPerUserIsolation(t *testing.T) {
 	t.Parallel()
 	client, _ := newTestClient(t)
@@ -214,7 +300,7 @@ func TestLimiter_PerToolPerUserIsolation(t *testing.T) {
 			},
 		},
 	}
-	l, err := NewLimiter(client, "ns", "srv", crd)
+	l, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
 	require.NoError(t, err)
 
 	// User A uses their 1 token for "search".
@@ -252,7 +338,7 @@ func TestLimiter_ServerAndToolPerUserBothRequired(t *testing.T) {
 			},
 		},
 	}
-	l, err := NewLimiter(client, "ns", "srv", crd)
+	l, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
 	require.NoError(t, err)
 
 	// User A makes 2 "search" calls — both pass.
@@ -273,6 +359,66 @@ func TestLimiter_ServerAndToolPerUserBothRequired(t *testing.T) {
 	assert.True(t, d.Allowed)
 }
 
+func TestLimiter_PerToolUsesPostAggregationName(t *testing.T) {
+	t.Parallel()
+	client, _ := newTestClient(t)
+	ctx := t.Context()
+
+	crd := &v1beta1.RateLimitConfig{
+		Tools: []v1beta1.ToolRateLimitConfig{
+			{
+				Name:   "backend_a_expensive_tool",
+				Global: &v1beta1.RateLimitBucket{MaxTokens: 1, RefillPeriod: metav1.Duration{Duration: time.Minute}},
+			},
+		},
+	}
+	l, err := NewLimiter(client, "ns", "vmcp", crd.ToInternal())
+	require.NoError(t, err)
+
+	d, err := l.Allow(ctx, "backend_a_expensive_tool", "user-a")
+	require.NoError(t, err)
+	require.True(t, d.Allowed)
+
+	d, err = l.Allow(ctx, "backend_a_expensive_tool", "user-b")
+	require.NoError(t, err)
+	assert.False(t, d.Allowed, "global per-tool limit should apply to the post-aggregation tool name")
+
+	d, err = l.Allow(ctx, "expensive_tool", "user-a")
+	require.NoError(t, err)
+	assert.True(t, d.Allowed, "unprefixed backend name should not match the vMCP post-aggregation limit")
+}
+
+func TestLimiter_GlobalAndPerUserBothEnforced(t *testing.T) {
+	t.Parallel()
+	client, _ := newTestClient(t)
+	ctx := t.Context()
+
+	crd := &v1beta1.RateLimitConfig{
+		Global:  &v1beta1.RateLimitBucket{MaxTokens: 3, RefillPeriod: metav1.Duration{Duration: time.Minute}},
+		PerUser: &v1beta1.RateLimitBucket{MaxTokens: 2, RefillPeriod: metav1.Duration{Duration: time.Minute}},
+	}
+	l, err := NewLimiter(client, "ns", "vmcp", crd.ToInternal())
+	require.NoError(t, err)
+
+	for range 2 {
+		d, allowErr := l.Allow(ctx, "backend_a_echo", "user-a")
+		require.NoError(t, allowErr)
+		require.True(t, d.Allowed)
+	}
+
+	d, err := l.Allow(ctx, "backend_a_echo", "user-a")
+	require.NoError(t, err)
+	assert.False(t, d.Allowed, "per-user limit should reject user-a's third call")
+
+	d, err = l.Allow(ctx, "backend_b_echo", "user-b")
+	require.NoError(t, err)
+	require.True(t, d.Allowed, "global bucket should still have one token because rejected calls do not drain")
+
+	d, err = l.Allow(ctx, "backend_b_echo", "user-c")
+	require.NoError(t, err)
+	assert.False(t, d.Allowed, "global limit should reject after total allowed calls are exhausted")
+}
+
 func TestLimiter_PerUserRejectionDoesNotDrainShared(t *testing.T) {
 	t.Parallel()
 	client, _ := newTestClient(t)
@@ -284,7 +430,7 @@ func TestLimiter_PerUserRejectionDoesNotDrainShared(t *testing.T) {
 		Shared:  &v1beta1.RateLimitBucket{MaxTokens: 3, RefillPeriod: metav1.Duration{Duration: time.Minute}},
 		PerUser: &v1beta1.RateLimitBucket{MaxTokens: 1, RefillPeriod: metav1.Duration{Duration: time.Minute}},
 	}
-	l, err := NewLimiter(client, "ns", "srv", crd)
+	l, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
 	require.NoError(t, err)
 
 	// User A: first call passes (shared=2, user-a=0).
@@ -319,7 +465,7 @@ func TestLimiter_RedisUnavailablePerUser(t *testing.T) {
 	crd := &v1beta1.RateLimitConfig{
 		PerUser: &v1beta1.RateLimitBucket{MaxTokens: 10, RefillPeriod: metav1.Duration{Duration: time.Minute}},
 	}
-	l, err := NewLimiter(client, "ns", "srv", crd)
+	l, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
 	require.NoError(t, err)
 
 	mr.Close()
@@ -335,7 +481,7 @@ func TestNewLimiter_PerUserZeroMaxTokens(t *testing.T) {
 	crd := &v1beta1.RateLimitConfig{
 		PerUser: &v1beta1.RateLimitBucket{MaxTokens: 0, RefillPeriod: metav1.Duration{Duration: time.Minute}},
 	}
-	_, err := NewLimiter(client, "ns", "srv", crd)
+	_, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
 	assert.Error(t, err)
 	assert.Contains(t, err.Error(), "perUser bucket: maxTokens must be >= 1")
 }
@@ -352,7 +498,7 @@ func TestNewLimiter_ToolPerUserZeroDuration(t *testing.T) {
 			},
 		},
 	}
-	_, err := NewLimiter(client, "ns", "srv", crd)
+	_, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
 	assert.Error(t, err)
 	assert.Contains(t, err.Error(), `tool "search" perUser bucket: refillPeriod must be positive`)
 }
diff --git a/pkg/ratelimit/middleware.go b/pkg/ratelimit/middleware.go
index ee108fdf87..a472923b59 100644
--- a/pkg/ratelimit/middleware.go
+++ b/pkg/ratelimit/middleware.go
@@ -15,10 +15,11 @@ import (
 
 	"github.com/redis/go-redis/v9"
 
-	v1beta1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1beta1"
 	"github.com/stacklok/toolhive/pkg/auth"
 	"github.com/stacklok/toolhive/pkg/mcp"
+	rlconfig "github.com/stacklok/toolhive/pkg/ratelimit/config"
 	"github.com/stacklok/toolhive/pkg/transport/types"
+	"github.com/stacklok/toolhive/pkg/vmcp/session/optimizerdec"
 )
 
 const (
@@ -39,11 +40,11 @@ const (
 
 // MiddlewareParams holds the parameters for the rate limit middleware factory.
 type MiddlewareParams struct {
-	Namespace  string                   `json:"namespace"`
-	ServerName string                   `json:"server_name"`
-	Config     *v1beta1.RateLimitConfig `json:"config"`
-	RedisAddr  string                   `json:"redis_addr,omitempty"`
-	RedisDB    int32                    `json:"redis_db,omitempty"`
+	Namespace  string           `json:"namespace"`
+	ServerName string           `json:"server_name"`
+	Config     *rlconfig.Config `json:"config"`
+	RedisAddr  string           `json:"redis_addr,omitempty"`
+	RedisDB    int32            `json:"redis_db,omitempty"`
 }
 
 // rateLimitMiddleware wraps rate limiting functionality for the factory pattern.
@@ -99,16 +100,16 @@ func CreateMiddleware(config *types.MiddlewareConfig, runner types.MiddlewareRun
 	}
 
 	mw := &rateLimitMiddleware{
-		handler: rateLimitHandler(limiter),
+		handler: HTTPMiddleware(limiter),
 		client:  client,
 	}
 	runner.AddMiddleware(MiddlewareType, mw)
 	return nil
 }
 
-// rateLimitHandler returns a middleware function that enforces rate limits
+// HTTPMiddleware returns a middleware function that enforces rate limits
 // on tools/call requests.
-func rateLimitHandler(limiter Limiter) types.MiddlewareFunction {
+func HTTPMiddleware(limiter Limiter) types.MiddlewareFunction {
 	return func(next http.Handler) http.Handler {
 		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 			// Rate limits only apply to parsed tools/call requests.
@@ -127,7 +128,8 @@ func rateLimitHandler(limiter Limiter) types.MiddlewareFunction {
 			if identity, ok := auth.IdentityFromContext(r.Context()); ok {
 				userID = identity.Subject
 			}
-			decision, err := limiter.Allow(r.Context(), parsed.ResourceID, userID)
+			toolName := rateLimitToolName(parsed)
+			decision, err := limiter.Allow(r.Context(), toolName, userID)
 			if err != nil {
 				slog.Warn("rate limit check failed, allowing request", "error", err)
 				next.ServeHTTP(w, r)
@@ -142,6 +144,18 @@ func rateLimitHandler(limiter Limiter) types.MiddlewareFunction {
 	}
 }
 
+func rateLimitToolName(parsed *mcp.ParsedMCPRequest) string {
+	if parsed == nil {
+		return ""
+	}
+	if parsed.ResourceID == optimizerdec.CallToolName && parsed.Arguments != nil {
+		if toolName, ok := parsed.Arguments[optimizerdec.CallToolArgToolName].(string); ok && toolName != "" {
+			return toolName
+		}
+	}
+	return parsed.ResourceID
+}
+
 // writeRateLimited writes an HTTP 429 response with a JSON-RPC error body.
 func writeRateLimited(w http.ResponseWriter, requestID any, retryAfter time.Duration) {
 	retrySeconds := int(math.Ceil(retryAfter.Seconds()))
diff --git a/pkg/ratelimit/middleware_test.go b/pkg/ratelimit/middleware_test.go
index ed76e72e0c..c526665783 100644
--- a/pkg/ratelimit/middleware_test.go
+++ b/pkg/ratelimit/middleware_test.go
@@ -61,11 +61,23 @@ func withParsedMCPRequest(r *http.Request, method, resourceID string, id any) *h
 	return r.WithContext(ctx)
 }
 
+func withParsedToolCall(r *http.Request, resourceID string, arguments map[string]interface{}, id any) *http.Request {
+	parsed := &mcp.ParsedMCPRequest{
+		Method:     "tools/call",
+		ResourceID: resourceID,
+		Arguments:  arguments,
+		ID:         id,
+		IsRequest:  true,
+	}
+	ctx := context.WithValue(r.Context(), mcp.MCPRequestContextKey, parsed)
+	return r.WithContext(ctx)
+}
+
 func TestRateLimitHandler_ToolCallAllowed(t *testing.T) {
 	t.Parallel()
 
 	limiter := &dummyLimiter{decision: &Decision{Allowed: true}}
-	handler := rateLimitHandler(limiter)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+	handler := HTTPMiddleware(limiter)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
 		w.WriteHeader(http.StatusOK)
 	}))
 
@@ -82,7 +94,7 @@ func TestRateLimitHandler_ToolCallRejected(t *testing.T) {
 	t.Parallel()
 
 	limiter := &dummyLimiter{decision: &Decision{Allowed: false, RetryAfter: 5 * time.Second}}
-	handler := rateLimitHandler(limiter)(http.HandlerFunc(func(http.ResponseWriter, *http.Request) {
+	handler := HTTPMiddleware(limiter)(http.HandlerFunc(func(http.ResponseWriter, *http.Request) {
 		t.Fatal("next handler should not be called when rate limited")
 	}))
 
@@ -114,7 +126,7 @@ func TestRateLimitHandler_RedisErrorFailOpen(t *testing.T) {
 
 	limiter := &dummyLimiter{err: errors.New("redis connection refused")}
 	nextCalled := false
-	handler := rateLimitHandler(limiter)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+	handler := HTTPMiddleware(limiter)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
 		nextCalled = true
 		w.WriteHeader(http.StatusOK)
 	}))
@@ -134,7 +146,7 @@ func TestRateLimitHandler_NoParsedMCPRequest_PassesThrough(t *testing.T) {
 
 	limiter := &dummyLimiter{decision: &Decision{Allowed: false}}
 	nextCalled := false
-	handler := rateLimitHandler(limiter)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+	handler := HTTPMiddleware(limiter)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
 		nextCalled = true
 		w.WriteHeader(http.StatusOK)
 	}))
@@ -154,7 +166,7 @@ func TestRateLimitHandler_NonToolCallPassesThrough(t *testing.T) {
 
 	limiter := &dummyLimiter{decision: &Decision{Allowed: false, RetryAfter: time.Second}}
 	nextCalled := false
-	handler := rateLimitHandler(limiter)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+	handler := HTTPMiddleware(limiter)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
 		nextCalled = true
 		w.WriteHeader(http.StatusOK)
 	}))
@@ -173,7 +185,7 @@ func TestRateLimitHandler_PassesUserID(t *testing.T) {
 	t.Parallel()
 
 	recorder := &recordingLimiter{}
-	handler := rateLimitHandler(recorder)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+	handler := HTTPMiddleware(recorder)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
 		w.WriteHeader(http.StatusOK)
 	}))
 
@@ -189,11 +201,36 @@ func TestRateLimitHandler_PassesUserID(t *testing.T) {
 	assert.Equal(t, "alice@example.com", recorder.userID)
 }
 
+func TestRateLimitHandler_OptimizerCallToolUsesInnerToolName(t *testing.T) {
+	t.Parallel()
+
+	recorder := &recordingLimiter{}
+	handler := HTTPMiddleware(recorder)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusOK)
+	}))
+
+	req := httptest.NewRequest(http.MethodPost, "/mcp", nil)
+	req = withParsedToolCall(req, "call_tool", map[string]interface{}{
+		"tool_name": "backend_a_expensive_tool",
+		"parameters": map[string]interface{}{
+			"query": "test",
+		},
+	}, 1)
+	req = withIdentity(req, "alice@example.com")
+	w := httptest.NewRecorder()
+
+	handler.ServeHTTP(w, req)
+
+	assert.Equal(t, http.StatusOK, w.Code)
+	assert.Equal(t, "backend_a_expensive_tool", recorder.toolName)
+	assert.Equal(t, "alice@example.com", recorder.userID)
+}
+
 func TestRateLimitHandler_NoIdentityPassesEmptyUserID(t *testing.T) {
 	t.Parallel()
 
 	recorder := &recordingLimiter{}
-	handler := rateLimitHandler(recorder)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+	handler := HTTPMiddleware(recorder)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
 		w.WriteHeader(http.StatusOK)
 	}))
 
diff --git a/pkg/runner/middleware.go b/pkg/runner/middleware.go
index be9dd33506..0d711e4439 100644
--- a/pkg/runner/middleware.go
+++ b/pkg/runner/middleware.go
@@ -434,7 +434,7 @@ func addRateLimitMiddleware(middlewares []types.MiddlewareConfig, config *RunCon
 	params := ratelimit.MiddlewareParams{
 		Namespace:  config.RateLimitNamespace,
 		ServerName: config.Name,
-		Config:     config.RateLimitConfig,
+		Config:     config.RateLimitConfig.ToInternal(),
 		RedisAddr:  redisAddr,
 		RedisDB:    redisDB,
 	}
diff --git a/pkg/vmcp/cli/serve.go b/pkg/vmcp/cli/serve.go
index ffb501115c..72b72148e6 100644
--- a/pkg/vmcp/cli/serve.go
+++ b/pkg/vmcp/cli/serve.go
@@ -386,6 +386,8 @@ func Serve(ctx context.Context, cfg ServeConfig) error {
 		OptimizerConfig:         optCfg,
 		SessionFactory:          sessionFactory,
 		SessionStorage:          vmcpCfg.SessionStorage,
+		RateLimiting:            vmcpCfg.RateLimiting,
+		RateLimitNamespace:      os.Getenv("VMCP_NAMESPACE"),
 	}
 
 	// Assign Watcher only when backendWatcher is non-nil. A typed nil
diff --git a/pkg/vmcp/config/config.go b/pkg/vmcp/config/config.go
index 7a2c699290..9deabf6cd0 100644
--- a/pkg/vmcp/config/config.go
+++ b/pkg/vmcp/config/config.go
@@ -15,6 +15,7 @@ import (
 
 	"github.com/stacklok/toolhive/pkg/audit"
 	thvjson "github.com/stacklok/toolhive/pkg/json"
+	rlconfig "github.com/stacklok/toolhive/pkg/ratelimit/config"
 	"github.com/stacklok/toolhive/pkg/telemetry"
 	"github.com/stacklok/toolhive/pkg/vmcp"
 	authtypes "github.com/stacklok/toolhive/pkg/vmcp/auth/types"
@@ -173,6 +174,14 @@ type Config struct {
 	// the THV_SESSION_REDIS_PASSWORD environment variable.
 	// +optional
 	SessionStorage *SessionStorageConfig `json:"sessionStorage,omitempty" yaml:"sessionStorage,omitempty"`
+
+	// RateLimiting configures request rate limits for tools/call.
+	// Kubernetes users should set VirtualMCPServer.spec.rateLimiting; the operator
+	// copies that top-level field here for the vMCP runtime.
+	// This remains YAML-only so it can be written to the runtime config ConfigMap
+	// without exposing spec.config.rateLimiting in the Kubernetes API.
+	// +optional
+	RateLimiting *rlconfig.Config `json:"-" yaml:"rateLimiting,omitempty"`
 }
 
 // IncomingAuthConfig configures client authentication to the virtual MCP server.
diff --git a/pkg/vmcp/config/zz_generated.deepcopy.go b/pkg/vmcp/config/zz_generated.deepcopy.go
index 80861bff11..018d6cabdd 100644
--- a/pkg/vmcp/config/zz_generated.deepcopy.go
+++ b/pkg/vmcp/config/zz_generated.deepcopy.go
@@ -204,6 +204,10 @@ func (in *Config) DeepCopyInto(out *Config) {
 		*out = new(SessionStorageConfig)
 		**out = **in
 	}
+	if in.RateLimiting != nil {
+		in, out := &in.RateLimiting, &out.RateLimiting
+		*out = (*in).DeepCopy()
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Config.
diff --git a/pkg/vmcp/server/ratelimit.go b/pkg/vmcp/server/ratelimit.go
new file mode 100644
index 0000000000..95d0c2f16e
--- /dev/null
+++ b/pkg/vmcp/server/ratelimit.go
@@ -0,0 +1,72 @@
+// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+package server
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"os"
+	"strings"
+	"time"
+
+	"github.com/redis/go-redis/v9"
+
+	"github.com/stacklok/toolhive/pkg/ratelimit"
+	transporttypes "github.com/stacklok/toolhive/pkg/transport/types"
+	vmcpconfig "github.com/stacklok/toolhive/pkg/vmcp/config"
+)
+
+type httpMiddleware = transporttypes.MiddlewareFunction
+
+type shutdownFunc func(context.Context) error
+
+func (s *Server) buildRateLimitMiddleware(ctx context.Context) (httpMiddleware, shutdownFunc, error) {
+	if s.config.RateLimiting == nil {
+		return nil, nil, nil
+	}
+	if s.config.SessionStorage == nil ||
+		!strings.EqualFold(s.config.SessionStorage.Provider, "redis") ||
+		s.config.SessionStorage.Address == "" {
+		return nil, nil, fmt.Errorf("rate limiting requires sessionStorage with provider redis")
+	}
+
+	client := redis.NewClient(&redis.Options{
+		Addr:     s.config.SessionStorage.Address,
+		DB:       int(s.config.SessionStorage.DB),
+		Password: os.Getenv(vmcpconfig.RedisPasswordEnvVar),
+	})
+
+	pingCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
+	defer cancel()
+	if err := client.Ping(pingCtx).Err(); err != nil {
+		_ = client.Close()
+		return nil, nil, fmt.Errorf("rate limit middleware: failed to connect to Redis at %s: %w",
+			s.config.SessionStorage.Address, err)
+	}
+
+	namespace := s.config.RateLimitNamespace
+	if namespace == "" {
+		namespace = os.Getenv("VMCP_NAMESPACE")
+	}
+	if namespace == "" {
+		namespace = "default"
+	}
+
+	limiter, err := ratelimit.NewLimiter(client, namespace, s.config.Name, s.config.RateLimiting)
+	if err != nil {
+		_ = client.Close()
+		return nil, nil, fmt.Errorf("failed to create rate limiter: %w", err)
+	}
+
+	slog.Info("rate limit middleware enabled for vMCP",
+		"namespace", namespace,
+		"name", s.config.Name,
+		"redis_addr", s.config.SessionStorage.Address,
+		"redis_db", s.config.SessionStorage.DB)
+
+	return ratelimit.HTTPMiddleware(limiter), func(context.Context) error {
+		return client.Close()
+	}, nil
+}
diff --git a/pkg/vmcp/server/ratelimit_test.go b/pkg/vmcp/server/ratelimit_test.go
new file mode 100644
index 0000000000..a804a1ace9
--- /dev/null
+++ b/pkg/vmcp/server/ratelimit_test.go
@@ -0,0 +1,76 @@
+// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+package server
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/alicebob/miniredis/v2"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	"github.com/stacklok/toolhive/pkg/auth"
+	mcpparser "github.com/stacklok/toolhive/pkg/mcp"
+	rlconfig "github.com/stacklok/toolhive/pkg/ratelimit/config"
+	vmcpconfig "github.com/stacklok/toolhive/pkg/vmcp/config"
+)
+
+func TestBuildRateLimitMiddleware_EnforcesToolsCall(t *testing.T) {
+	t.Parallel()
+
+	redisServer := miniredis.RunT(t)
+	srv := &Server{
+		config: &Config{
+			Name:               "vmcp-a",
+			RateLimitNamespace: "default",
+			SessionStorage: &vmcpconfig.SessionStorageConfig{
+				Provider: "redis",
+				Address:  redisServer.Addr(),
+			},
+			RateLimiting: &rlconfig.Config{
+				PerUser: &rlconfig.Bucket{
+					MaxTokens:    1,
+					RefillPeriod: metav1.Duration{Duration: time.Minute},
+				},
+			},
+		},
+	}
+
+	middleware, closeFn, err := srv.buildRateLimitMiddleware(context.Background())
+	require.NoError(t, err)
+	require.NotNil(t, middleware)
+	t.Cleanup(func() {
+		if closeFn != nil {
+			require.NoError(t, closeFn(context.Background()))
+		}
+	})
+
+	handler := middleware(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		w.WriteHeader(http.StatusOK)
+	}))
+
+	req := httptest.NewRequest(http.MethodPost, "/mcp", nil)
+	req = req.WithContext(auth.WithIdentity(req.Context(), &auth.Identity{
+		PrincipalInfo: auth.PrincipalInfo{Subject: "alice"},
+	}))
+	req = req.WithContext(context.WithValue(req.Context(), mcpparser.MCPRequestContextKey, &mcpparser.ParsedMCPRequest{
+		Method:     "tools/call",
+		ResourceID: "backend_a_echo",
+		ID:         1,
+		IsRequest:  true,
+	}))
+
+	first := httptest.NewRecorder()
+	handler.ServeHTTP(first, req)
+	assert.Equal(t, http.StatusOK, first.Code)
+
+	second := httptest.NewRecorder()
+	handler.ServeHTTP(second, req)
+	assert.Equal(t, http.StatusTooManyRequests, second.Code)
+}
diff --git a/pkg/vmcp/server/server.go b/pkg/vmcp/server/server.go
index 23c415a551..694e43b25d 100644
--- a/pkg/vmcp/server/server.go
+++ b/pkg/vmcp/server/server.go
@@ -28,6 +28,7 @@ import (
 	"github.com/stacklok/toolhive/pkg/auth"
 	asrunner "github.com/stacklok/toolhive/pkg/authserver/runner"
 	mcpparser "github.com/stacklok/toolhive/pkg/mcp"
+	rlconfig "github.com/stacklok/toolhive/pkg/ratelimit/config"
 	"github.com/stacklok/toolhive/pkg/recovery"
 	"github.com/stacklok/toolhive/pkg/telemetry"
 	transportmiddleware "github.com/stacklok/toolhive/pkg/transport/middleware"
@@ -181,6 +182,12 @@ type Config struct {
 	// session persistence; the Redis password is read from the
 	// THV_SESSION_REDIS_PASSWORD environment variable.
 	SessionStorage *vmcpconfig.SessionStorageConfig
+
+	// RateLimiting configures rate limiting for incoming tools/call requests.
+	RateLimiting *rlconfig.Config
+
+	// RateLimitNamespace is used with Name to derive Redis keys.
+	RateLimitNamespace string
 }
 
 // Server is the Virtual MCP Server that aggregates multiple backends.
@@ -525,7 +532,7 @@ func New(
 // Each call builds a fresh handler. The method is safe to call multiple times.
 // All returned handlers share the same underlying MCPServer and SessionManager,
 // so callers should not serve concurrent traffic through multiple handlers.
-func (s *Server) Handler(_ context.Context) (http.Handler, error) {
+func (s *Server) Handler(ctx context.Context) (http.Handler, error) {
 	// Create Streamable HTTP server with ToolHive session management
 	streamableServer := server.NewStreamableHTTPServer(
 		s.mcpServer,
@@ -537,6 +544,25 @@ func (s *Server) Handler(_ context.Context) (http.Handler, error) {
 	// Create HTTP mux with separated authenticated and unauthenticated routes
 	mux := http.NewServeMux()
 
+	s.registerHTTPRoutes(mux)
+
+	// MCP endpoint - apply middleware chain (wrapping order, execution happens in reverse):
+	// Code wraps: auth → MCP-parsing → rate-limit → audit → discovery →
+	//   annotation-enrichment → authz → backend-enrichment → telemetry
+	// Execution order: recovery → header-val → auth → MCP-parsing →
+	//   rate-limit → audit → discovery → annotation-enrichment → authz →
+	//   backend-enrichment → telemetry → handler
+	mcpHandler, err := s.buildMCPHandler(ctx, streamableServer)
+	if err != nil {
+		return nil, err
+	}
+
+	mux.Handle("/", mcpHandler)
+
+	return mux, nil
+}
+
+func (s *Server) registerHTTPRoutes(mux *http.ServeMux) {
 	// Unauthenticated health endpoints
 	mux.HandleFunc("/health", s.handleHealth)
 	mux.HandleFunc("/ping", s.handleHealth)
@@ -570,32 +596,56 @@ func (s *Server) Handler(_ context.Context) (http.Handler, error) {
 		s.config.AuthServer.RegisterHandlers(mux)
 		slog.Debug("embedded authorization server routes registered")
 	}
+}
 
-	// MCP endpoint - apply middleware chain (wrapping order, execution happens in reverse):
-	// Code wraps: auth+parser → audit → discovery → annotation-enrichment →
-	//   authz → backend-enrichment → MCP-parsing → telemetry
-	// Execution order: recovery → header-val → auth+parser → audit →
-	//   discovery → annotation-enrichment → authz → backend-enrichment →
-	//   MCP-parsing → telemetry → handler
+func (s *Server) buildMCPHandler(ctx context.Context, base http.Handler) (http.Handler, error) {
+	mcpHandler := s.wrapMCPHandler(base)
+
+	if err := s.applyAuditMiddleware(&mcpHandler); err != nil {
+		return nil, err
+	}
+	if err := s.applyRateLimitMiddleware(ctx, &mcpHandler); err != nil {
+		return nil, err
+	}
+
+	// Apply MCP parsing middleware to extract JSON-RPC method from request body.
+	// This must run before rate limiting so global limits also work when auth is
+	// disabled, and before telemetry so metrics can use the real MCP method.
+	mcpHandler = mcpparser.ParsingMiddleware(mcpHandler)
+
+	// Apply authentication middleware if configured (runs first in chain)
+	if s.config.AuthMiddleware != nil {
+		mcpHandler = s.config.AuthMiddleware(mcpHandler)
+		slog.Info("authentication middleware enabled for MCP endpoints")
+	}
+
+	// Apply Accept header validation (rejects GET requests without Accept: text/event-stream)
+	mcpHandler = headerValidatingMiddleware(mcpHandler)
 
-	var mcpHandler http.Handler = streamableServer
+	// Clear the write deadline for qualifying SSE connections (GET +
+	// Accept: text/event-stream + MCP endpoint path) so the server-level
+	// WriteTimeout does not kill long-lived SSE streams (see golang/go#16100).
+	// Non-qualifying requests are left untouched; http.Server.WriteTimeout
+	// (defaultWriteTimeout) remains in effect for them.
+	mcpHandler = transportmiddleware.WriteTimeout(s.config.EndpointPath)(mcpHandler)
+
+	// Apply recovery middleware as outermost (catches panics from all inner middleware)
+	mcpHandler = recovery.Middleware(mcpHandler)
+	slog.Info("recovery middleware enabled for MCP endpoints")
+
+	return mcpHandler, nil
+}
+
+func (s *Server) wrapMCPHandler(base http.Handler) http.Handler {
+	mcpHandler := base
 
 	if s.config.TelemetryProvider != nil {
 		mcpHandler = s.config.TelemetryProvider.Middleware(s.config.Name, "streamable-http")(mcpHandler)
 		slog.Info("telemetry middleware enabled for MCP endpoints")
 	}
 
-	// Apply MCP parsing middleware to extract JSON-RPC method from request body.
-	// This runs before telemetry so that recordMetrics can label metrics with the
-	// actual mcp_method (e.g. "tools/call", "initialize") instead of "unknown".
-	// Note: ParsingMiddleware is also composed inside the auth middleware (for audit/authz).
-	// The second application here is a no-op because the context already holds a
-	// ParsedMCPRequest; it exists only so the telemetry layer works correctly even
-	// when auth middleware is nil.
-	mcpHandler = mcpparser.ParsingMiddleware(mcpHandler)
-
-	// Apply backend enrichment middleware if audit is configured
-	// This runs after discovery populates the routing table, so it can extract backend names
+	// Apply backend enrichment middleware if audit is configured.
+	// This runs after discovery populates the routing table, so it can extract backend names.
 	if s.config.AuditConfig != nil {
 		mcpHandler = s.backendEnrichmentMiddleware(mcpHandler)
 		slog.Info("backend enrichment middleware enabled for audit events")
@@ -606,75 +656,63 @@ func (s *Server) Handler(_ context.Context) (http.Handler, error) {
 	if s.config.AuthzMiddleware != nil {
 		mcpHandler = s.config.AuthzMiddleware(mcpHandler)
 		slog.Info("authorization middleware enabled for MCP endpoints (post-discovery)")
-	}
-
-	// Apply annotation enrichment middleware (runs after discovery, before authz in execution).
-	// Reads tool annotations from discovered capabilities and injects them into the
-	// request context so the authz middleware can make annotation-aware decisions.
-	if s.config.AuthzMiddleware != nil {
 		mcpHandler = AnnotationEnrichmentMiddleware(mcpHandler)
 		slog.Info("annotation enrichment middleware enabled for MCP endpoints")
 	}
 
-	// Apply discovery middleware (runs after audit/auth middleware)
-	// Discovery middleware performs per-request capability aggregation with user context.
-	// vmcpSessionMgr (MultiSessionGetter) is used to retrieve the fully-formed MultiSession
-	// for subsequent requests so the routing table can be injected into context.
-	// The backend registry provides a dynamic backend list (supports DynamicRegistry for K8s).
-	// The health monitor enables filtering based on current health status (respects circuit breaker).
-	s.healthMonitorMu.RLock()
-	healthMon := s.healthMonitor
-	s.healthMonitorMu.RUnlock()
-
-	var healthStatusProvider health.StatusProvider
-	if healthMon != nil {
-		healthStatusProvider = healthMon
-	}
+	healthStatusProvider := s.getHealthStatusProvider()
 	mcpHandler = discovery.Middleware(
 		s.discoveryMgr, s.backendRegistry, s.vmcpSessionMgr, healthStatusProvider,
 		discovery.WithSessionScopedRouting(),
 	)(mcpHandler)
 	slog.Info("discovery middleware enabled for lazy per-user capability discovery")
 
-	// Apply audit middleware if configured (runs after auth, before discovery)
-	if s.config.AuditConfig != nil {
-		if err := s.config.AuditConfig.Validate(); err != nil {
-			return nil, fmt.Errorf("invalid audit configuration: %w", err)
-		}
-		auditor, err := audit.NewAuditorWithTransport(
-			s.config.AuditConfig,
-			"streamable-http", // vMCP uses streamable HTTP transport
-		)
-		if err != nil {
-			return nil, fmt.Errorf("failed to create auditor: %w", err)
-		}
-		mcpHandler = auditor.Middleware(mcpHandler)
-		slog.Info("audit middleware enabled for MCP endpoints")
-	}
-
-	// Apply authentication middleware if configured (runs first in chain)
-	if s.config.AuthMiddleware != nil {
-		mcpHandler = s.config.AuthMiddleware(mcpHandler)
-		slog.Info("authentication middleware enabled for MCP endpoints")
-	}
+	return mcpHandler
+}
 
-	// Apply Accept header validation (rejects GET requests without Accept: text/event-stream)
-	mcpHandler = headerValidatingMiddleware(mcpHandler)
+func (s *Server) getHealthStatusProvider() health.StatusProvider {
+	s.healthMonitorMu.RLock()
+	defer s.healthMonitorMu.RUnlock()
 
-	// Clear the write deadline for qualifying SSE connections (GET +
-	// Accept: text/event-stream + MCP endpoint path) so the server-level
-	// WriteTimeout does not kill long-lived SSE streams (see golang/go#16100).
-	// Non-qualifying requests are left untouched; http.Server.WriteTimeout
-	// (defaultWriteTimeout) remains in effect for them.
-	mcpHandler = transportmiddleware.WriteTimeout(s.config.EndpointPath)(mcpHandler)
+	if s.healthMonitor == nil {
+		return nil
+	}
+	return s.healthMonitor
+}
 
-	// Apply recovery middleware as outermost (catches panics from all inner middleware)
-	mcpHandler = recovery.Middleware(mcpHandler)
-	slog.Info("recovery middleware enabled for MCP endpoints")
+func (s *Server) applyAuditMiddleware(mcpHandler *http.Handler) error {
+	if s.config.AuditConfig == nil {
+		return nil
+	}
+	if err := s.config.AuditConfig.Validate(); err != nil {
+		return fmt.Errorf("invalid audit configuration: %w", err)
+	}
+	auditor, err := audit.NewAuditorWithTransport(
+		s.config.AuditConfig,
+		"streamable-http", // vMCP uses streamable HTTP transport
+	)
+	if err != nil {
+		return fmt.Errorf("failed to create auditor: %w", err)
+	}
+	*mcpHandler = auditor.Middleware(*mcpHandler)
+	slog.Info("audit middleware enabled for MCP endpoints")
+	return nil
+}
 
-	mux.Handle("/", mcpHandler)
+func (s *Server) applyRateLimitMiddleware(ctx context.Context, mcpHandler *http.Handler) error {
+	rateLimitMiddleware, closeRateLimit, err := s.buildRateLimitMiddleware(ctx)
+	if err != nil {
+		return err
+	}
+	if rateLimitMiddleware == nil {
+		return nil
+	}
 
-	return mux, nil
+	*mcpHandler = rateLimitMiddleware(*mcpHandler)
+	if closeRateLimit != nil {
+		s.shutdownFuncs = append(s.shutdownFuncs, closeRateLimit)
+	}
+	return nil
 }
 
 // Start starts the Virtual MCP Server and begins serving requests.
diff --git a/test/e2e/thv-operator/virtualmcp/virtualmcp_ratelimit_test.go b/test/e2e/thv-operator/virtualmcp/virtualmcp_ratelimit_test.go
new file mode 100644
index 0000000000..de9900ba6c
--- /dev/null
+++ b/test/e2e/thv-operator/virtualmcp/virtualmcp_ratelimit_test.go
@@ -0,0 +1,248 @@
+// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+package virtualmcp
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	mcpv1beta1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1beta1"
+	"github.com/stacklok/toolhive/test/e2e/images"
+)
+
+var _ = Describe("VirtualMCPServer Rate Limiting", Ordered, func() {
+	const (
+		timeout         = 3 * time.Minute
+		pollingInterval = 1 * time.Second
+	)
+
+	var (
+		httpClient   *http.Client
+		mcpGroupName string
+		backendName  string
+		vmcpName     string
+		oidcName     string
+		oidcConfig   string
+		redisName    string
+		oidcNodePort int32
+		vmcpNodePort int32
+		cleanupOIDC  func()
+	)
+
+	BeforeAll(func() {
+		httpClient = &http.Client{Timeout: 30 * time.Second}
+
+		timestamp := time.Now().UnixNano()
+		mcpGroupName = fmt.Sprintf("e2e-vmcp-rl-group-%d", timestamp)
+		backendName = fmt.Sprintf("e2e-vmcp-rl-backend-%d", timestamp)
+		vmcpName = fmt.Sprintf("e2e-vmcp-rl-%d", timestamp)
+		oidcName = fmt.Sprintf("e2e-vmcp-rl-oidc-%d", timestamp)
+		oidcConfig = fmt.Sprintf("e2e-vmcp-rl-oidc-config-%d", timestamp)
+		redisName = fmt.Sprintf("e2e-vmcp-rl-redis-%d", timestamp)
+
+		By("Deploying Redis for vMCP session storage and rate limiting")
+		deployRedis(redisName)
+
+		By("Deploying parameterized OIDC server for per-user identity")
+		var issuerURL string
+		issuerURL, oidcNodePort, cleanupOIDC = DeployParameterizedOIDCServer(
+			ctx, k8sClient, oidcName, defaultNamespace, timeout, pollingInterval)
+
+		By("Creating MCPOIDCConfig for vMCP incoming auth")
+		Expect(k8sClient.Create(ctx, &mcpv1beta1.MCPOIDCConfig{
+			ObjectMeta: metav1.ObjectMeta{Name: oidcConfig, Namespace: defaultNamespace},
+			Spec: mcpv1beta1.MCPOIDCConfigSpec{
+				Type: mcpv1beta1.MCPOIDCConfigTypeInline,
+				Inline: &mcpv1beta1.InlineOIDCSharedConfig{
+					Issuer:                          issuerURL,
+					InsecureAllowHTTP:               true,
+					JWKSAllowPrivateIP:              true,
+					ProtectedResourceAllowPrivateIP: true,
+				},
+			},
+		})).To(Succeed())
+
+		By("Creating MCPGroup and yardstick backend")
+		CreateMCPGroupAndWait(ctx, k8sClient, mcpGroupName, defaultNamespace,
+			"VirtualMCPServer rate limit e2e group", timeout, pollingInterval)
+		CreateMCPServerAndWait(ctx, k8sClient, backendName, defaultNamespace, mcpGroupName,
+			images.YardstickServerImage, timeout, pollingInterval)
+
+		By("Creating VirtualMCPServer with per-user rate limit")
+		redisAddr := fmt.Sprintf("%s.%s.svc.cluster.local:6379", redisName, defaultNamespace)
+		Expect(k8sClient.Create(ctx, &mcpv1beta1.VirtualMCPServer{
+			ObjectMeta: metav1.ObjectMeta{Name: vmcpName, Namespace: defaultNamespace},
+			Spec: mcpv1beta1.VirtualMCPServerSpec{
+				GroupRef: &mcpv1beta1.MCPGroupRef{Name: mcpGroupName},
+				IncomingAuth: &mcpv1beta1.IncomingAuthConfig{
+					Type: "oidc",
+					OIDCConfigRef: &mcpv1beta1.MCPOIDCConfigReference{
+						Name:     oidcConfig,
+						Audience: "vmcp-audience",
+					},
+				},
+				ServiceType: "NodePort",
+				SessionStorage: &mcpv1beta1.SessionStorageConfig{
+					Provider: mcpv1beta1.SessionStorageProviderRedis,
+					Address:  redisAddr,
+				},
+				RateLimiting: &mcpv1beta1.RateLimitConfig{
+					PerUser: &mcpv1beta1.RateLimitBucket{
+						MaxTokens:    2,
+						RefillPeriod: metav1.Duration{Duration: time.Minute},
+					},
+				},
+			},
+		})).To(Succeed())
+
+		By("Waiting for VirtualMCPServer to be ready")
+		WaitForVirtualMCPServerReady(ctx, k8sClient, vmcpName, defaultNamespace, timeout, pollingInterval)
+		WaitForCondition(ctx, k8sClient, vmcpName, defaultNamespace, "BackendsDiscovered", "True", timeout, pollingInterval)
+		vmcpNodePort = GetVMCPNodePort(ctx, k8sClient, vmcpName, defaultNamespace, timeout, pollingInterval)
+	})
+
+	AfterAll(func() {
+		_ = k8sClient.Delete(ctx, &mcpv1beta1.VirtualMCPServer{
+			ObjectMeta: metav1.ObjectMeta{Name: vmcpName, Namespace: defaultNamespace},
+		})
+		_ = k8sClient.Delete(ctx, &mcpv1beta1.MCPServer{
+			ObjectMeta: metav1.ObjectMeta{Name: backendName, Namespace: defaultNamespace},
+		})
+		_ = k8sClient.Delete(ctx, &mcpv1beta1.MCPGroup{
+			ObjectMeta: metav1.ObjectMeta{Name: mcpGroupName, Namespace: defaultNamespace},
+		})
+		_ = k8sClient.Delete(ctx, &mcpv1beta1.MCPOIDCConfig{
+			ObjectMeta: metav1.ObjectMeta{Name: oidcConfig, Namespace: defaultNamespace},
+		})
+		cleanupRedis(redisName)
+		if cleanupOIDC != nil {
+			cleanupOIDC()
+		}
+	})
+
+	It("rejects tools/call after the per-user limit is exceeded", func() {
+		token := getVMCPRateLimitOIDCToken(ctx, httpClient, oidcNodePort, "vmcp-rate-limit-user")
+		sessionID := sendVMCPRateLimitInitialize(ctx, httpClient, vmcpNodePort, token)
+
+		By("Sending requests within the per-user rate limit")
+		for i := range 2 {
+			status, body, _ := sendVMCPRateLimitToolCall(ctx, httpClient, vmcpNodePort, "echo", i+1, token, sessionID)
+			Expect(status).To(Equal(http.StatusOK),
+				"request %d should succeed, got status %d: %s", i+1, status, string(body))
+		}
+
+		By("Sending request that exceeds the per-user rate limit")
+		status, body, retryAfter := sendVMCPRateLimitToolCall(ctx, httpClient, vmcpNodePort, "echo", 3, token, sessionID)
+		Expect(status).To(Equal(http.StatusTooManyRequests),
+			"third request should be rate limited, body: %s", string(body))
+		Expect(retryAfter).ToNot(BeEmpty(), "Retry-After header should be set")
+
+		var rpcResp map[string]any
+		Expect(json.Unmarshal(body, &rpcResp)).To(Succeed())
+		errObj, ok := rpcResp["error"].(map[string]any)
+		Expect(ok).To(BeTrue(), "response should include JSON-RPC error")
+		Expect(errObj["code"]).To(BeEquivalentTo(-32029))
+		Expect(errObj["message"]).To(Equal("Rate limit exceeded"))
+	})
+})
+
+func getVMCPRateLimitOIDCToken(ctx context.Context, httpClient *http.Client, oidcNodePort int32, subject string) string {
+	url := fmt.Sprintf("http://localhost:%d/token?subject=%s", oidcNodePort, subject)
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, nil)
+	Expect(err).ToNot(HaveOccurred())
+
+	resp, err := httpClient.Do(req)
+	Expect(err).ToNot(HaveOccurred())
+	defer func() { _ = resp.Body.Close() }()
+
+	var tokenResp struct {
+		AccessToken string `json:"access_token"`
+	}
+	Expect(json.NewDecoder(resp.Body).Decode(&tokenResp)).To(Succeed())
+	Expect(tokenResp.AccessToken).ToNot(BeEmpty(), "OIDC server should return a token")
+	return tokenResp.AccessToken
+}
+
+func sendVMCPRateLimitInitialize(
+	ctx context.Context, httpClient *http.Client, port int32, bearerToken string,
+) string {
+	reqBody := map[string]any{
+		"jsonrpc": "2.0",
+		"id":      0,
+		"method":  "initialize",
+		"params": map[string]any{
+			"protocolVersion": "2025-03-26",
+			"capabilities":    map[string]any{},
+			"clientInfo": map[string]any{
+				"name":    "vmcp-rate-limit-e2e",
+				"version": "1.0.0",
+			},
+		},
+	}
+	bodyBytes, err := json.Marshal(reqBody)
+	Expect(err).ToNot(HaveOccurred())
+
+	url := fmt.Sprintf("http://localhost:%d/mcp", port)
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(bodyBytes))
+	Expect(err).ToNot(HaveOccurred())
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Accept", "application/json, text/event-stream")
+	req.Header.Set("Authorization", "Bearer "+bearerToken)
+
+	resp, err := httpClient.Do(req)
+	Expect(err).ToNot(HaveOccurred())
+	defer func() { _ = resp.Body.Close() }()
+	Expect(resp.StatusCode).To(Equal(http.StatusOK), "initialize should succeed")
+
+	sessionID := resp.Header.Get("Mcp-Session-Id")
+	Expect(sessionID).ToNot(BeEmpty(), "initialize response should include Mcp-Session-Id header")
+	return sessionID
+}
+
+func sendVMCPRateLimitToolCall(
+	ctx context.Context,
+	httpClient *http.Client,
+	port int32,
+	toolName string,
+	requestID int,
+	bearerToken string,
+	sessionID string,
+) (int, []byte, string) {
+	reqBody := map[string]any{
+		"jsonrpc": "2.0",
+		"id":      requestID,
+		"method":  "tools/call",
+		"params": map[string]any{
+			"name":      toolName,
+			"arguments": map[string]any{"input": "test"},
+		},
+	}
+	bodyBytes, err := json.Marshal(reqBody)
+	Expect(err).ToNot(HaveOccurred())
+
+	url := fmt.Sprintf("http://localhost:%d/mcp", port)
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(bodyBytes))
+	Expect(err).ToNot(HaveOccurred())
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Accept", "application/json, text/event-stream")
+	req.Header.Set("Authorization", "Bearer "+bearerToken)
+	req.Header.Set("Mcp-Session-Id", sessionID)
+
+	resp, err := httpClient.Do(req)
+	Expect(err).ToNot(HaveOccurred())
+	defer func() { _ = resp.Body.Close() }()
+
+	respBody, err := io.ReadAll(resp.Body)
+	Expect(err).ToNot(HaveOccurred())
+	return resp.StatusCode, respBody, resp.Header.Get("Retry-After")
+}

From 933b10640924e871e1f71967f8a3fa176b87ff43 Mon Sep 17 00:00:00 2001
From: Sanskarzz <sanskar.gur@gmail.com>
Date: Tue, 28 Apr 2026 23:56:58 +0530
Subject: [PATCH 2/4] Only schema changes, no runtime behavior

Signed-off-by: Sanskarzz <sanskar.gur@gmail.com>
---
 .../api/v1beta1/mcpserver_types.go            |  56 +---
 .../virtualmcpserver_vmcpconfig_test.go       |  20 +-
 cmd/thv-operator/pkg/vmcpconfig/converter.go  |  26 +-
 .../pkg/vmcpconfig/converter_test.go          |  63 ++++-
 .../embeddingserver_update_test.go            |   1 +
 ...irtualmcpserver_sessionstorage_cel_test.go |   7 +-
 docs/operator/crd-api.md                      |  38 ---
 pkg/ratelimit/config/config.go                | 147 -----------
 pkg/ratelimit/config/doc.go                   |   8 -
 pkg/ratelimit/limiter.go                      |  14 +-
 pkg/ratelimit/limiter_test.go                 | 112 ++------
 pkg/ratelimit/middleware.go                   |  34 +--
 pkg/ratelimit/middleware_test.go              |  51 +---
 pkg/runner/middleware.go                      |   2 +-
 pkg/vmcp/cli/serve.go                         |   2 -
 pkg/vmcp/config/config.go                     |   3 +-
 pkg/vmcp/config/config_test.go                |  27 ++
 pkg/vmcp/server/ratelimit.go                  |  72 -----
 pkg/vmcp/server/ratelimit_test.go             |  76 ------
 pkg/vmcp/server/server.go                     | 184 ++++++-------
 .../virtualmcp_circuit_breaker_test.go        |  24 +-
 .../virtualmcp/virtualmcp_ratelimit_test.go   | 248 ------------------
 22 files changed, 278 insertions(+), 937 deletions(-)
 delete mode 100644 pkg/ratelimit/config/config.go
 delete mode 100644 pkg/ratelimit/config/doc.go
 delete mode 100644 pkg/vmcp/server/ratelimit.go
 delete mode 100644 pkg/vmcp/server/ratelimit_test.go
 delete mode 100644 test/e2e/thv-operator/virtualmcp/virtualmcp_ratelimit_test.go

diff --git a/cmd/thv-operator/api/v1beta1/mcpserver_types.go b/cmd/thv-operator/api/v1beta1/mcpserver_types.go
index 6738e6e134..ad9dc96b0f 100644
--- a/cmd/thv-operator/api/v1beta1/mcpserver_types.go
+++ b/cmd/thv-operator/api/v1beta1/mcpserver_types.go
@@ -7,8 +7,6 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
-
-	rlconfig "github.com/stacklok/toolhive/pkg/ratelimit/config"
 )
 
 // Condition types for MCPServer
@@ -513,18 +511,18 @@ type SessionStorageConfig struct {
 type RateLimitConfig struct {
 	// Global is a token bucket shared across all users for the entire server.
 	// +optional
-	Global *RateLimitBucket `json:"global,omitempty"`
+	Global *RateLimitBucket `json:"global,omitempty" yaml:"global,omitempty"`
 
 	// Shared is a deprecated alias for Global. Use global instead.
 	// +optional
-	Shared *RateLimitBucket `json:"shared,omitempty"`
+	Shared *RateLimitBucket `json:"shared,omitempty" yaml:"shared,omitempty"`
 
 	// PerUser is a token bucket applied independently to each authenticated user
 	// at the server level. Requires authentication to be enabled.
 	// Each unique userID creates Redis keys that expire after 2x refillPeriod.
 	// Memory formula: unique_users_per_TTL_window * (1 + num_tools_with_per_user_limits) keys.
 	// +optional
-	PerUser *RateLimitBucket `json:"perUser,omitempty"`
+	PerUser *RateLimitBucket `json:"perUser,omitempty" yaml:"perUser,omitempty"`
 
 	// Tools defines per-tool rate limit overrides.
 	// Each entry applies additional rate limits to calls targeting a specific tool name.
@@ -532,7 +530,7 @@ type RateLimitConfig struct {
 	// +listType=map
 	// +listMapKey=name
 	// +optional
-	Tools []ToolRateLimitConfig `json:"tools,omitempty"`
+	Tools []ToolRateLimitConfig `json:"tools,omitempty" yaml:"tools,omitempty"`
 }
 
 // RateLimitBucket defines a token bucket configuration with a maximum capacity
@@ -543,13 +541,13 @@ type RateLimitBucket struct {
 	// instantaneously before the bucket is depleted.
 	// +kubebuilder:validation:Required
 	// +kubebuilder:validation:Minimum=1
-	MaxTokens int32 `json:"maxTokens"`
+	MaxTokens int32 `json:"maxTokens" yaml:"maxTokens"`
 
 	// RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
 	// The effective refill rate is maxTokens / refillPeriod tokens per second.
 	// Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
 	// +kubebuilder:validation:Required
-	RefillPeriod metav1.Duration `json:"refillPeriod"`
+	RefillPeriod metav1.Duration `json:"refillPeriod" yaml:"refillPeriod"`
 }
 
 // ToolRateLimitConfig defines rate limits for a specific tool.
@@ -562,53 +560,19 @@ type ToolRateLimitConfig struct {
 	// Name is the MCP tool name this limit applies to.
 	// +kubebuilder:validation:Required
 	// +kubebuilder:validation:MinLength=1
-	Name string `json:"name"`
+	Name string `json:"name" yaml:"name"`
 
 	// Global token bucket for this specific tool.
 	// +optional
-	Global *RateLimitBucket `json:"global,omitempty"`
+	Global *RateLimitBucket `json:"global,omitempty" yaml:"global,omitempty"`
 
 	// Shared is a deprecated alias for Global. Use global instead.
 	// +optional
-	Shared *RateLimitBucket `json:"shared,omitempty"`
+	Shared *RateLimitBucket `json:"shared,omitempty" yaml:"shared,omitempty"`
 
 	// PerUser token bucket configuration for this tool.
 	// +optional
-	PerUser *RateLimitBucket `json:"perUser,omitempty"`
-}
-
-// ToInternal converts the Kubernetes API rate limit config to the runtime-neutral representation.
-func (in *RateLimitConfig) ToInternal() *rlconfig.Config {
-	if in == nil {
-		return nil
-	}
-	out := &rlconfig.Config{
-		Global:  rateLimitBucketToInternal(in.Global),
-		Shared:  rateLimitBucketToInternal(in.Shared),
-		PerUser: rateLimitBucketToInternal(in.PerUser),
-	}
-	if len(in.Tools) > 0 {
-		out.Tools = make([]rlconfig.ToolConfig, 0, len(in.Tools))
-		for _, tool := range in.Tools {
-			out.Tools = append(out.Tools, rlconfig.ToolConfig{
-				Name:    tool.Name,
-				Global:  rateLimitBucketToInternal(tool.Global),
-				Shared:  rateLimitBucketToInternal(tool.Shared),
-				PerUser: rateLimitBucketToInternal(tool.PerUser),
-			})
-		}
-	}
-	return out
-}
-
-func rateLimitBucketToInternal(in *RateLimitBucket) *rlconfig.Bucket {
-	if in == nil {
-		return nil
-	}
-	return &rlconfig.Bucket{
-		MaxTokens:    in.MaxTokens,
-		RefillPeriod: in.RefillPeriod,
-	}
+	PerUser *RateLimitBucket `json:"perUser,omitempty" yaml:"perUser,omitempty"`
 }
 
 // Permission profile types
diff --git a/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig_test.go b/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig_test.go
index 76da31960f..e671e9083b 100644
--- a/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig_test.go
+++ b/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig_test.go
@@ -530,12 +530,20 @@ func TestEnsureVmcpConfigConfigMap(t *testing.T) {
 	var cfg vmcpconfig.Config
 	require.NoError(t, yaml.Unmarshal([]byte(cm.Data["config.yaml"]), &cfg))
 	require.NotNil(t, cfg.RateLimiting, "runtime config must include spec.rateLimiting")
-	require.NotNil(t, cfg.RateLimiting.PerUser)
-	assert.Equal(t, int32(2), cfg.RateLimiting.PerUser.MaxTokens)
-	require.Len(t, cfg.RateLimiting.Tools, 1)
-	assert.Equal(t, "backend_a_echo", cfg.RateLimiting.Tools[0].Name)
-	require.NotNil(t, cfg.RateLimiting.Tools[0].Global)
-	assert.Equal(t, int32(5), cfg.RateLimiting.Tools[0].Global.MaxTokens)
+
+	rateLimiting := cfg.RateLimiting.Get()
+	perUser, ok := rateLimiting["perUser"].(map[string]any)
+	require.True(t, ok)
+	assert.EqualValues(t, 2, perUser["maxTokens"])
+	tools, ok := rateLimiting["tools"].([]any)
+	require.True(t, ok)
+	require.Len(t, tools, 1)
+	tool, ok := tools[0].(map[string]any)
+	require.True(t, ok)
+	assert.Equal(t, "backend_a_echo", tool["name"])
+	global, ok := tool["global"].(map[string]any)
+	require.True(t, ok)
+	assert.EqualValues(t, 5, global["maxTokens"])
 }
 
 // TestSetAuthConfigConditions tests that auth config conditions reflect the current state
diff --git a/cmd/thv-operator/pkg/vmcpconfig/converter.go b/cmd/thv-operator/pkg/vmcpconfig/converter.go
index 6c9a4975e9..1b0a5ae140 100644
--- a/cmd/thv-operator/pkg/vmcpconfig/converter.go
+++ b/cmd/thv-operator/pkg/vmcpconfig/converter.go
@@ -6,6 +6,7 @@ package vmcpconfig
 
 import (
 	"context"
+	"encoding/json"
 	"fmt"
 
 	"github.com/go-logr/logr"
@@ -19,6 +20,7 @@ import (
 	"github.com/stacklok/toolhive/cmd/thv-operator/pkg/oidc"
 	"github.com/stacklok/toolhive/cmd/thv-operator/pkg/spectoconfig"
 	"github.com/stacklok/toolhive/pkg/authserver"
+	thvjson "github.com/stacklok/toolhive/pkg/json"
 	"github.com/stacklok/toolhive/pkg/telemetry"
 	"github.com/stacklok/toolhive/pkg/vmcp/auth/converters"
 	authtypes "github.com/stacklok/toolhive/pkg/vmcp/auth/types"
@@ -140,7 +142,13 @@ func (c *Converter) Convert(
 	}
 
 	config.SessionStorage = convertSessionStorage(vmcp)
-	config.RateLimiting = vmcp.Spec.RateLimiting.ToInternal()
+	if vmcp.Spec.RateLimiting != nil {
+		rateLimiting, err := convertRateLimiting(vmcp.Spec.RateLimiting)
+		if err != nil {
+			return nil, nil, fmt.Errorf("failed to convert rate limiting: %w", err)
+		}
+		config.RateLimiting = rateLimiting
+	}
 
 	// Apply operational defaults (fills missing values)
 	config.EnsureOperationalDefaults()
@@ -158,6 +166,22 @@ func (c *Converter) Convert(
 	return config, authServerRC, nil
 }
 
+func convertRateLimiting(rateLimiting *mcpv1beta1.RateLimitConfig) (*thvjson.Map, error) {
+	if rateLimiting == nil {
+		return nil, nil
+	}
+	raw, err := json.Marshal(rateLimiting)
+	if err != nil {
+		return nil, err
+	}
+	var value map[string]any
+	if err := json.Unmarshal(raw, &value); err != nil {
+		return nil, err
+	}
+	converted := thvjson.NewMap(value)
+	return &converted, nil
+}
+
 // convertIncomingAuth converts IncomingAuthConfig from CRD to vmcp config.
 func (c *Converter) convertIncomingAuth(
 	ctx context.Context,
diff --git a/cmd/thv-operator/pkg/vmcpconfig/converter_test.go b/cmd/thv-operator/pkg/vmcpconfig/converter_test.go
index 0fdb67443b..6e195e9b9e 100644
--- a/cmd/thv-operator/pkg/vmcpconfig/converter_test.go
+++ b/cmd/thv-operator/pkg/vmcpconfig/converter_test.go
@@ -127,10 +127,65 @@ func TestConverter_RateLimitingFromTopLevelSpec(t *testing.T) {
 	cfg, _, err := converter.Convert(context.Background(), vmcp, nil)
 	require.NoError(t, err)
 	require.NotNil(t, cfg.RateLimiting)
-	require.NotNil(t, cfg.RateLimiting.Global)
-	assert.Equal(t, int32(10), cfg.RateLimiting.Global.MaxTokens)
-	require.Len(t, cfg.RateLimiting.Tools, 1)
-	assert.Equal(t, "backend_a_echo", cfg.RateLimiting.Tools[0].Name)
+
+	rateLimiting := cfg.RateLimiting.Get()
+	global, ok := rateLimiting["global"].(map[string]any)
+	require.True(t, ok)
+	assert.EqualValues(t, 10, global["maxTokens"])
+	tools, ok := rateLimiting["tools"].([]any)
+	require.True(t, ok)
+	require.Len(t, tools, 1)
+	tool, ok := tools[0].(map[string]any)
+	require.True(t, ok)
+	assert.Equal(t, "backend_a_echo", tool["name"])
+}
+
+func TestConverter_RateLimitingPreservesSharedAliasAndCopiesInput(t *testing.T) {
+	t.Parallel()
+
+	vmcp := &mcpv1beta1.VirtualMCPServer{
+		ObjectMeta: metav1.ObjectMeta{Name: "test-vmcp", Namespace: "default"},
+		Spec: mcpv1beta1.VirtualMCPServerSpec{
+			GroupRef:     &mcpv1beta1.MCPGroupRef{Name: "test-group"},
+			IncomingAuth: &mcpv1beta1.IncomingAuthConfig{Type: "anonymous"},
+			RateLimiting: &mcpv1beta1.RateLimitConfig{
+				Shared: &mcpv1beta1.RateLimitBucket{
+					MaxTokens:    10,
+					RefillPeriod: metav1.Duration{Duration: time.Minute},
+				},
+				Tools: []mcpv1beta1.ToolRateLimitConfig{
+					{
+						Name: "backend_a_echo",
+						Shared: &mcpv1beta1.RateLimitBucket{
+							MaxTokens:    1,
+							RefillPeriod: metav1.Duration{Duration: time.Second},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	converter := newTestConverter(t, newNoOpMockResolver(t))
+	cfg, _, err := converter.Convert(context.Background(), vmcp, nil)
+	require.NoError(t, err)
+	require.NotNil(t, cfg.RateLimiting)
+
+	vmcp.Spec.RateLimiting.Shared.MaxTokens = 99
+	vmcp.Spec.RateLimiting.Tools[0].Shared.MaxTokens = 88
+
+	rateLimiting := cfg.RateLimiting.Get()
+	shared, ok := rateLimiting["shared"].(map[string]any)
+	require.True(t, ok)
+	assert.EqualValues(t, 10, shared["maxTokens"])
+	tools, ok := rateLimiting["tools"].([]any)
+	require.True(t, ok)
+	require.Len(t, tools, 1)
+	tool, ok := tools[0].(map[string]any)
+	require.True(t, ok)
+	toolShared, ok := tool["shared"].(map[string]any)
+	require.True(t, ok)
+	assert.EqualValues(t, 1, toolShared["maxTokens"])
 }
 
 func TestConverter_OIDCResolution(t *testing.T) {
diff --git a/cmd/thv-operator/test-integration/embedding-server/embeddingserver_update_test.go b/cmd/thv-operator/test-integration/embedding-server/embeddingserver_update_test.go
index b91f9d021d..c4a3484bd2 100644
--- a/cmd/thv-operator/test-integration/embedding-server/embeddingserver_update_test.go
+++ b/cmd/thv-operator/test-integration/embedding-server/embeddingserver_update_test.go
@@ -466,6 +466,7 @@ var _ = Describe("EmbeddingServer Controller Update Tests", func() {
 				Expect(k8sClient.Create(ctx, embeddingServer)).To(Succeed())
 				Eventually(func(g Gomega) {
 					g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(embeddingServer), &appsv1.StatefulSet{})).To(Succeed())
+					g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(embeddingServer), &corev1.Service{})).To(Succeed())
 				}, timeout, interval).Should(Succeed())
 			})
 
diff --git a/cmd/thv-operator/test-integration/virtualmcp/virtualmcpserver_sessionstorage_cel_test.go b/cmd/thv-operator/test-integration/virtualmcp/virtualmcpserver_sessionstorage_cel_test.go
index df614f32a6..dc00607240 100644
--- a/cmd/thv-operator/test-integration/virtualmcp/virtualmcpserver_sessionstorage_cel_test.go
+++ b/cmd/thv-operator/test-integration/virtualmcp/virtualmcpserver_sessionstorage_cel_test.go
@@ -147,8 +147,11 @@ var _ = Describe("CEL Validation for SessionStorageConfig on VirtualMCPServer",
 					Address:  "redis:6379",
 				})
 				vmcp.Spec.IncomingAuth = &mcpv1beta1.IncomingAuthConfig{
-					Type:          "oidc",
-					OIDCConfigRef: &mcpv1beta1.MCPOIDCConfigReference{Name: "oidc"},
+					Type: "oidc",
+					OIDCConfigRef: &mcpv1beta1.MCPOIDCConfigReference{
+						Name:     "oidc",
+						Audience: "test-audience",
+					},
 				}
 				vmcp.Spec.RateLimiting = &mcpv1beta1.RateLimitConfig{
 					PerUser: &mcpv1beta1.RateLimitBucket{
diff --git a/docs/operator/crd-api.md b/docs/operator/crd-api.md
index d7313e9135..2b66ac030d 100644
--- a/docs/operator/crd-api.md
+++ b/docs/operator/crd-api.md
@@ -233,25 +233,6 @@ _Appears in:_
 | `primaryUpstreamProvider` _string_ | PrimaryUpstreamProvider names the upstream IDP provider whose access<br />token should be used as the source of JWT claims for Cedar evaluation.<br />When empty, claims from the ToolHive-issued token are used.<br />Must match an upstream provider name configured in the embedded auth server<br />(e.g. "default", "github"). Only relevant when the embedded auth server is active. |  | Optional: \{\} <br /> |
 
 
-#### ratelimit.config.Bucket
-
-
-
-Bucket defines a token bucket configuration with a maximum capacity and a
-refill period.
-
-
-
-_Appears in:_
-- [ratelimit.config.Config](#ratelimitconfigconfig)
-- [ratelimit.config.ToolConfig](#ratelimitconfigtoolconfig)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `maxTokens` _integer_ | MaxTokens is the maximum number of tokens. |  |  |
-| `refillPeriod` _[Duration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#duration-v1-meta)_ | RefillPeriod is the duration to fully refill the bucket. |  |  |
-
-
 #### vmcp.config.CircuitBreakerConfig
 
 
@@ -659,25 +640,6 @@ _Appears in:_
 
 
 
-#### ratelimit.config.ToolConfig
-
-
-
-ToolConfig defines rate limits for a specific tool.
-
-
-
-_Appears in:_
-- [ratelimit.config.Config](#ratelimitconfigconfig)
-
-| Field | Description | Default | Validation |
-| --- | --- | --- | --- |
-| `name` _string_ | Name is the MCP tool name this limit applies to. |  |  |
-| `global` _[ratelimit.config.Bucket](#ratelimitconfigbucket)_ | Global is a token bucket shared across all users for this tool. |  |  |
-| `shared` _[ratelimit.config.Bucket](#ratelimitconfigbucket)_ | Shared is a deprecated alias for Global. |  |  |
-| `perUser` _[ratelimit.config.Bucket](#ratelimitconfigbucket)_ | PerUser is a token bucket applied independently to each authenticated user<br />for this tool. |  |  |
-
-
 #### vmcp.config.ToolConfigRef
 
 
diff --git a/pkg/ratelimit/config/config.go b/pkg/ratelimit/config/config.go
deleted file mode 100644
index 38aaa01780..0000000000
--- a/pkg/ratelimit/config/config.go
+++ /dev/null
@@ -1,147 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
-// SPDX-License-Identifier: Apache-2.0
-
-// Package config defines runtime-neutral rate limiting configuration.
-package config
-
-import metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
-// Config defines rate limiting configuration for an MCP server.
-// +gendoc
-type Config struct {
-	// Global is a token bucket shared across all users for the entire server.
-	Global *Bucket `json:"global,omitempty" yaml:"global,omitempty"`
-
-	// Shared is a deprecated alias for Global. It is kept for compatibility with
-	// older CRDs and config files that used "shared" before the RFC settled on
-	// "global".
-	Shared *Bucket `json:"shared,omitempty" yaml:"shared,omitempty"`
-
-	// PerUser is a token bucket applied independently to each authenticated user
-	// at the server level.
-	PerUser *Bucket `json:"perUser,omitempty" yaml:"perUser,omitempty"`
-
-	// Tools defines per-tool rate limits.
-	Tools []ToolConfig `json:"tools,omitempty" yaml:"tools,omitempty"`
-}
-
-// EffectiveGlobal returns the configured server-level global bucket, including
-// the deprecated shared alias.
-func (c *Config) EffectiveGlobal() *Bucket {
-	if c == nil {
-		return nil
-	}
-	if c.Global != nil {
-		return c.Global
-	}
-	return c.Shared
-}
-
-// Bucket defines a token bucket configuration with a maximum capacity and a
-// refill period.
-// +gendoc
-type Bucket struct {
-	// MaxTokens is the maximum number of tokens.
-	MaxTokens int32 `json:"maxTokens" yaml:"maxTokens"`
-
-	// RefillPeriod is the duration to fully refill the bucket.
-	RefillPeriod metav1.Duration `json:"refillPeriod" yaml:"refillPeriod"`
-}
-
-// ToolConfig defines rate limits for a specific tool.
-// +gendoc
-type ToolConfig struct {
-	// Name is the MCP tool name this limit applies to.
-	Name string `json:"name" yaml:"name"`
-
-	// Global is a token bucket shared across all users for this tool.
-	Global *Bucket `json:"global,omitempty" yaml:"global,omitempty"`
-
-	// Shared is a deprecated alias for Global.
-	Shared *Bucket `json:"shared,omitempty" yaml:"shared,omitempty"`
-
-	// PerUser is a token bucket applied independently to each authenticated user
-	// for this tool.
-	PerUser *Bucket `json:"perUser,omitempty" yaml:"perUser,omitempty"`
-}
-
-// EffectiveGlobal returns the configured tool-level global bucket, including
-// the deprecated shared alias.
-func (c *ToolConfig) EffectiveGlobal() *Bucket {
-	if c == nil {
-		return nil
-	}
-	if c.Global != nil {
-		return c.Global
-	}
-	return c.Shared
-}
-
-// DeepCopyInto copies the receiver into out.
-func (in *Config) DeepCopyInto(out *Config) {
-	*out = *in
-	if in.Global != nil {
-		out.Global = in.Global.DeepCopy()
-	}
-	if in.Shared != nil {
-		out.Shared = in.Shared.DeepCopy()
-	}
-	if in.PerUser != nil {
-		out.PerUser = in.PerUser.DeepCopy()
-	}
-	if in.Tools != nil {
-		out.Tools = make([]ToolConfig, len(in.Tools))
-		for i := range in.Tools {
-			in.Tools[i].DeepCopyInto(&out.Tools[i])
-		}
-	}
-}
-
-// DeepCopy returns a copy of the receiver.
-func (in *Config) DeepCopy() *Config {
-	if in == nil {
-		return nil
-	}
-	out := new(Config)
-	in.DeepCopyInto(out)
-	return out
-}
-
-// DeepCopyInto copies the receiver into out.
-func (in *Bucket) DeepCopyInto(out *Bucket) {
-	*out = *in
-}
-
-// DeepCopy returns a copy of the receiver.
-func (in *Bucket) DeepCopy() *Bucket {
-	if in == nil {
-		return nil
-	}
-	out := new(Bucket)
-	in.DeepCopyInto(out)
-	return out
-}
-
-// DeepCopyInto copies the receiver into out.
-func (in *ToolConfig) DeepCopyInto(out *ToolConfig) {
-	*out = *in
-	if in.Global != nil {
-		out.Global = in.Global.DeepCopy()
-	}
-	if in.Shared != nil {
-		out.Shared = in.Shared.DeepCopy()
-	}
-	if in.PerUser != nil {
-		out.PerUser = in.PerUser.DeepCopy()
-	}
-}
-
-// DeepCopy returns a copy of the receiver.
-func (in *ToolConfig) DeepCopy() *ToolConfig {
-	if in == nil {
-		return nil
-	}
-	out := new(ToolConfig)
-	in.DeepCopyInto(out)
-	return out
-}
diff --git a/pkg/ratelimit/config/doc.go b/pkg/ratelimit/config/doc.go
deleted file mode 100644
index ef361de965..0000000000
--- a/pkg/ratelimit/config/doc.go
+++ /dev/null
@@ -1,8 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
-// SPDX-License-Identifier: Apache-2.0
-
-// Package config defines runtime-neutral rate limiting configuration.
-//
-// +groupName=toolhive.stacklok.dev
-// +versionName=config
-package config
diff --git a/pkg/ratelimit/limiter.go b/pkg/ratelimit/limiter.go
index 4a5e654912..6679d6db4a 100644
--- a/pkg/ratelimit/limiter.go
+++ b/pkg/ratelimit/limiter.go
@@ -14,7 +14,7 @@ import (
 
 	"github.com/redis/go-redis/v9"
 
-	rlconfig "github.com/stacklok/toolhive/pkg/ratelimit/config"
+	v1beta1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1beta1"
 	"github.com/stacklok/toolhive/pkg/ratelimit/internal/bucket"
 )
 
@@ -39,7 +39,7 @@ type Decision struct {
 // NewLimiter constructs a Limiter from CRD configuration.
 // Returns a no-op limiter (always allows) when crd is nil.
 // namespace and name identify the MCP server for Redis key derivation.
-func NewLimiter(client redis.Cmdable, namespace, name string, crd *rlconfig.Config) (Limiter, error) {
+func NewLimiter(client redis.Cmdable, namespace, name string, crd *v1beta1.RateLimitConfig) (Limiter, error) {
 	if crd == nil {
 		return noopLimiter{}, nil
 	}
@@ -88,7 +88,7 @@ func NewLimiter(client redis.Cmdable, namespace, name string, crd *rlconfig.Conf
 	return l, nil
 }
 
-func serverGlobalBucket(crd *rlconfig.Config) (*rlconfig.Bucket, string) {
+func serverGlobalBucket(crd *v1beta1.RateLimitConfig) (*v1beta1.RateLimitBucket, string) {
 	if crd.Global != nil {
 		return crd.Global, "global"
 	}
@@ -98,7 +98,7 @@ func serverGlobalBucket(crd *rlconfig.Config) (*rlconfig.Bucket, string) {
 	return nil, ""
 }
 
-func toolGlobalBucket(tool *rlconfig.ToolConfig) (*rlconfig.Bucket, string) {
+func toolGlobalBucket(tool *v1beta1.ToolRateLimitConfig) (*v1beta1.RateLimitBucket, string) {
 	if tool.Global != nil {
 		return tool.Global, "global"
 	}
@@ -197,7 +197,7 @@ func (noopLimiter) Allow(context.Context, string, string) (*Decision, error) {
 }
 
 // validateBucketCRD checks that a CRD bucket spec has valid parameters.
-func validateBucketCRD(b *rlconfig.Bucket) (int32, time.Duration, error) {
+func validateBucketCRD(b *v1beta1.RateLimitBucket) (int32, time.Duration, error) {
 	if b.MaxTokens < 1 {
 		return 0, 0, fmt.Errorf("maxTokens must be >= 1, got %d", b.MaxTokens)
 	}
@@ -209,7 +209,7 @@ func validateBucketCRD(b *rlconfig.Bucket) (int32, time.Duration, error) {
 }
 
 // newBucket validates a CRD bucket spec and creates a TokenBucket.
-func newBucket(namespace, serverName, suffix string, b *rlconfig.Bucket) (*bucket.TokenBucket, error) {
+func newBucket(namespace, serverName, suffix string, b *v1beta1.RateLimitBucket) (*bucket.TokenBucket, error) {
 	maxTokens, refillPeriod, err := validateBucketCRD(b)
 	if err != nil {
 		return nil, err
@@ -219,7 +219,7 @@ func newBucket(namespace, serverName, suffix string, b *rlconfig.Bucket) (*bucke
 
 // newBucketSpec validates a CRD bucket spec and creates a deferred bucketSpec
 // for per-user buckets that are materialized at Allow() time.
-func newBucketSpec(namespace, serverName string, b *rlconfig.Bucket) (bucketSpec, error) {
+func newBucketSpec(namespace, serverName string, b *v1beta1.RateLimitBucket) (bucketSpec, error) {
 	maxTokens, refillPeriod, err := validateBucketCRD(b)
 	if err != nil {
 		return bucketSpec{}, err
diff --git a/pkg/ratelimit/limiter_test.go b/pkg/ratelimit/limiter_test.go
index e5e921aea8..aea2b1636a 100644
--- a/pkg/ratelimit/limiter_test.go
+++ b/pkg/ratelimit/limiter_test.go
@@ -49,7 +49,7 @@ func TestNewLimiter_ZeroMaxTokens(t *testing.T) {
 		},
 	}
 
-	_, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
+	_, err := NewLimiter(client, "ns", "srv", crd)
 	assert.Error(t, err)
 	assert.Contains(t, err.Error(), "maxTokens must be >= 1")
 }
@@ -65,7 +65,7 @@ func TestNewLimiter_ZeroDuration(t *testing.T) {
 		},
 	}
 
-	_, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
+	_, err := NewLimiter(client, "ns", "srv", crd)
 	assert.Error(t, err)
 	assert.Contains(t, err.Error(), "refillPeriod must be positive")
 }
@@ -78,7 +78,7 @@ func TestLimiter_ServerGlobalExhausted(t *testing.T) {
 	crd := &v1beta1.RateLimitConfig{
 		Shared: &v1beta1.RateLimitBucket{MaxTokens: 2, RefillPeriod: metav1.Duration{Duration: time.Minute}},
 	}
-	l, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
+	l, err := NewLimiter(client, "ns", "srv", crd)
 	require.NoError(t, err)
 
 	for range 2 {
@@ -107,7 +107,7 @@ func TestLimiter_SharedAliasUsesLegacyRedisKeys(t *testing.T) {
 			},
 		},
 	}
-	l, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
+	l, err := NewLimiter(client, "ns", "srv", crd)
 	require.NoError(t, err)
 
 	d, err := l.Allow(ctx, "search", "")
@@ -142,7 +142,7 @@ func TestLimiter_GlobalUsesGlobalRedisKeys(t *testing.T) {
 			},
 		},
 	}
-	l, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
+	l, err := NewLimiter(client, "ns", "srv", crd)
 	require.NoError(t, err)
 
 	d, err := l.Allow(ctx, "search", "")
@@ -170,7 +170,7 @@ func TestLimiter_PerToolIsolation(t *testing.T) {
 			},
 		},
 	}
-	l, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
+	l, err := NewLimiter(client, "ns", "srv", crd)
 	require.NoError(t, err)
 
 	d, err := l.Allow(ctx, "search", "")
@@ -201,7 +201,7 @@ func TestLimiter_ServerAndPerToolBothRequired(t *testing.T) {
 			},
 		},
 	}
-	l, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
+	l, err := NewLimiter(client, "ns", "srv", crd)
 	require.NoError(t, err)
 
 	for range 2 {
@@ -228,7 +228,7 @@ func TestLimiter_RedisUnavailableReturnsError(t *testing.T) {
 	crd := &v1beta1.RateLimitConfig{
 		Shared: &v1beta1.RateLimitBucket{MaxTokens: 10, RefillPeriod: metav1.Duration{Duration: time.Minute}},
 	}
-	l, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
+	l, err := NewLimiter(client, "ns", "srv", crd)
 	require.NoError(t, err)
 
 	mr.Close()
@@ -245,7 +245,7 @@ func TestLimiter_PerUserServerLevel(t *testing.T) {
 	crd := &v1beta1.RateLimitConfig{
 		PerUser: &v1beta1.RateLimitBucket{MaxTokens: 2, RefillPeriod: metav1.Duration{Duration: time.Minute}},
 	}
-	l, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
+	l, err := NewLimiter(client, "ns", "srv", crd)
 	require.NoError(t, err)
 
 	// User A exhausts their 2 tokens.
@@ -265,28 +265,6 @@ func TestLimiter_PerUserServerLevel(t *testing.T) {
 	assert.True(t, d.Allowed)
 }
 
-func TestLimiter_PerUserServerLevelSharedAcrossBackendTools(t *testing.T) {
-	t.Parallel()
-	client, _ := newTestClient(t)
-	ctx := t.Context()
-
-	crd := &v1beta1.RateLimitConfig{
-		PerUser: &v1beta1.RateLimitBucket{MaxTokens: 2, RefillPeriod: metav1.Duration{Duration: time.Minute}},
-	}
-	l, err := NewLimiter(client, "ns", "vmcp", crd.ToInternal())
-	require.NoError(t, err)
-
-	for _, toolName := range []string{"backend_a_echo", "backend_b_search"} {
-		d, allowErr := l.Allow(ctx, toolName, "user-a")
-		require.NoError(t, allowErr)
-		require.True(t, d.Allowed)
-	}
-
-	d, err := l.Allow(ctx, "backend_c_status", "user-a")
-	require.NoError(t, err)
-	assert.False(t, d.Allowed, "server-level per-user limit must be shared across all backend tools")
-}
-
 func TestLimiter_PerToolPerUserIsolation(t *testing.T) {
 	t.Parallel()
 	client, _ := newTestClient(t)
@@ -300,7 +278,7 @@ func TestLimiter_PerToolPerUserIsolation(t *testing.T) {
 			},
 		},
 	}
-	l, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
+	l, err := NewLimiter(client, "ns", "srv", crd)
 	require.NoError(t, err)
 
 	// User A uses their 1 token for "search".
@@ -338,7 +316,7 @@ func TestLimiter_ServerAndToolPerUserBothRequired(t *testing.T) {
 			},
 		},
 	}
-	l, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
+	l, err := NewLimiter(client, "ns", "srv", crd)
 	require.NoError(t, err)
 
 	// User A makes 2 "search" calls — both pass.
@@ -359,66 +337,6 @@ func TestLimiter_ServerAndToolPerUserBothRequired(t *testing.T) {
 	assert.True(t, d.Allowed)
 }
 
-func TestLimiter_PerToolUsesPostAggregationName(t *testing.T) {
-	t.Parallel()
-	client, _ := newTestClient(t)
-	ctx := t.Context()
-
-	crd := &v1beta1.RateLimitConfig{
-		Tools: []v1beta1.ToolRateLimitConfig{
-			{
-				Name:   "backend_a_expensive_tool",
-				Global: &v1beta1.RateLimitBucket{MaxTokens: 1, RefillPeriod: metav1.Duration{Duration: time.Minute}},
-			},
-		},
-	}
-	l, err := NewLimiter(client, "ns", "vmcp", crd.ToInternal())
-	require.NoError(t, err)
-
-	d, err := l.Allow(ctx, "backend_a_expensive_tool", "user-a")
-	require.NoError(t, err)
-	require.True(t, d.Allowed)
-
-	d, err = l.Allow(ctx, "backend_a_expensive_tool", "user-b")
-	require.NoError(t, err)
-	assert.False(t, d.Allowed, "global per-tool limit should apply to the post-aggregation tool name")
-
-	d, err = l.Allow(ctx, "expensive_tool", "user-a")
-	require.NoError(t, err)
-	assert.True(t, d.Allowed, "unprefixed backend name should not match the vMCP post-aggregation limit")
-}
-
-func TestLimiter_GlobalAndPerUserBothEnforced(t *testing.T) {
-	t.Parallel()
-	client, _ := newTestClient(t)
-	ctx := t.Context()
-
-	crd := &v1beta1.RateLimitConfig{
-		Global:  &v1beta1.RateLimitBucket{MaxTokens: 3, RefillPeriod: metav1.Duration{Duration: time.Minute}},
-		PerUser: &v1beta1.RateLimitBucket{MaxTokens: 2, RefillPeriod: metav1.Duration{Duration: time.Minute}},
-	}
-	l, err := NewLimiter(client, "ns", "vmcp", crd.ToInternal())
-	require.NoError(t, err)
-
-	for range 2 {
-		d, allowErr := l.Allow(ctx, "backend_a_echo", "user-a")
-		require.NoError(t, allowErr)
-		require.True(t, d.Allowed)
-	}
-
-	d, err := l.Allow(ctx, "backend_a_echo", "user-a")
-	require.NoError(t, err)
-	assert.False(t, d.Allowed, "per-user limit should reject user-a's third call")
-
-	d, err = l.Allow(ctx, "backend_b_echo", "user-b")
-	require.NoError(t, err)
-	require.True(t, d.Allowed, "global bucket should still have one token because rejected calls do not drain")
-
-	d, err = l.Allow(ctx, "backend_b_echo", "user-c")
-	require.NoError(t, err)
-	assert.False(t, d.Allowed, "global limit should reject after total allowed calls are exhausted")
-}
-
 func TestLimiter_PerUserRejectionDoesNotDrainShared(t *testing.T) {
 	t.Parallel()
 	client, _ := newTestClient(t)
@@ -430,7 +348,7 @@ func TestLimiter_PerUserRejectionDoesNotDrainShared(t *testing.T) {
 		Shared:  &v1beta1.RateLimitBucket{MaxTokens: 3, RefillPeriod: metav1.Duration{Duration: time.Minute}},
 		PerUser: &v1beta1.RateLimitBucket{MaxTokens: 1, RefillPeriod: metav1.Duration{Duration: time.Minute}},
 	}
-	l, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
+	l, err := NewLimiter(client, "ns", "srv", crd)
 	require.NoError(t, err)
 
 	// User A: first call passes (shared=2, user-a=0).
@@ -465,7 +383,7 @@ func TestLimiter_RedisUnavailablePerUser(t *testing.T) {
 	crd := &v1beta1.RateLimitConfig{
 		PerUser: &v1beta1.RateLimitBucket{MaxTokens: 10, RefillPeriod: metav1.Duration{Duration: time.Minute}},
 	}
-	l, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
+	l, err := NewLimiter(client, "ns", "srv", crd)
 	require.NoError(t, err)
 
 	mr.Close()
@@ -481,7 +399,7 @@ func TestNewLimiter_PerUserZeroMaxTokens(t *testing.T) {
 	crd := &v1beta1.RateLimitConfig{
 		PerUser: &v1beta1.RateLimitBucket{MaxTokens: 0, RefillPeriod: metav1.Duration{Duration: time.Minute}},
 	}
-	_, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
+	_, err := NewLimiter(client, "ns", "srv", crd)
 	assert.Error(t, err)
 	assert.Contains(t, err.Error(), "perUser bucket: maxTokens must be >= 1")
 }
@@ -498,7 +416,7 @@ func TestNewLimiter_ToolPerUserZeroDuration(t *testing.T) {
 			},
 		},
 	}
-	_, err := NewLimiter(client, "ns", "srv", crd.ToInternal())
+	_, err := NewLimiter(client, "ns", "srv", crd)
 	assert.Error(t, err)
 	assert.Contains(t, err.Error(), `tool "search" perUser bucket: refillPeriod must be positive`)
 }
diff --git a/pkg/ratelimit/middleware.go b/pkg/ratelimit/middleware.go
index a472923b59..ee108fdf87 100644
--- a/pkg/ratelimit/middleware.go
+++ b/pkg/ratelimit/middleware.go
@@ -15,11 +15,10 @@ import (
 
 	"github.com/redis/go-redis/v9"
 
+	v1beta1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1beta1"
 	"github.com/stacklok/toolhive/pkg/auth"
 	"github.com/stacklok/toolhive/pkg/mcp"
-	rlconfig "github.com/stacklok/toolhive/pkg/ratelimit/config"
 	"github.com/stacklok/toolhive/pkg/transport/types"
-	"github.com/stacklok/toolhive/pkg/vmcp/session/optimizerdec"
 )
 
 const (
@@ -40,11 +39,11 @@ const (
 
 // MiddlewareParams holds the parameters for the rate limit middleware factory.
 type MiddlewareParams struct {
-	Namespace  string           `json:"namespace"`
-	ServerName string           `json:"server_name"`
-	Config     *rlconfig.Config `json:"config"`
-	RedisAddr  string           `json:"redis_addr,omitempty"`
-	RedisDB    int32            `json:"redis_db,omitempty"`
+	Namespace  string                   `json:"namespace"`
+	ServerName string                   `json:"server_name"`
+	Config     *v1beta1.RateLimitConfig `json:"config"`
+	RedisAddr  string                   `json:"redis_addr,omitempty"`
+	RedisDB    int32                    `json:"redis_db,omitempty"`
 }
 
 // rateLimitMiddleware wraps rate limiting functionality for the factory pattern.
@@ -100,16 +99,16 @@ func CreateMiddleware(config *types.MiddlewareConfig, runner types.MiddlewareRun
 	}
 
 	mw := &rateLimitMiddleware{
-		handler: HTTPMiddleware(limiter),
+		handler: rateLimitHandler(limiter),
 		client:  client,
 	}
 	runner.AddMiddleware(MiddlewareType, mw)
 	return nil
 }
 
-// HTTPMiddleware returns a middleware function that enforces rate limits
+// rateLimitHandler returns a middleware function that enforces rate limits
 // on tools/call requests.
-func HTTPMiddleware(limiter Limiter) types.MiddlewareFunction {
+func rateLimitHandler(limiter Limiter) types.MiddlewareFunction {
 	return func(next http.Handler) http.Handler {
 		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 			// Rate limits only apply to parsed tools/call requests.
@@ -128,8 +127,7 @@ func HTTPMiddleware(limiter Limiter) types.MiddlewareFunction {
 			if identity, ok := auth.IdentityFromContext(r.Context()); ok {
 				userID = identity.Subject
 			}
-			toolName := rateLimitToolName(parsed)
-			decision, err := limiter.Allow(r.Context(), toolName, userID)
+			decision, err := limiter.Allow(r.Context(), parsed.ResourceID, userID)
 			if err != nil {
 				slog.Warn("rate limit check failed, allowing request", "error", err)
 				next.ServeHTTP(w, r)
@@ -144,18 +142,6 @@ func HTTPMiddleware(limiter Limiter) types.MiddlewareFunction {
 	}
 }
 
-func rateLimitToolName(parsed *mcp.ParsedMCPRequest) string {
-	if parsed == nil {
-		return ""
-	}
-	if parsed.ResourceID == optimizerdec.CallToolName && parsed.Arguments != nil {
-		if toolName, ok := parsed.Arguments[optimizerdec.CallToolArgToolName].(string); ok && toolName != "" {
-			return toolName
-		}
-	}
-	return parsed.ResourceID
-}
-
 // writeRateLimited writes an HTTP 429 response with a JSON-RPC error body.
 func writeRateLimited(w http.ResponseWriter, requestID any, retryAfter time.Duration) {
 	retrySeconds := int(math.Ceil(retryAfter.Seconds()))
diff --git a/pkg/ratelimit/middleware_test.go b/pkg/ratelimit/middleware_test.go
index c526665783..ed76e72e0c 100644
--- a/pkg/ratelimit/middleware_test.go
+++ b/pkg/ratelimit/middleware_test.go
@@ -61,23 +61,11 @@ func withParsedMCPRequest(r *http.Request, method, resourceID string, id any) *h
 	return r.WithContext(ctx)
 }
 
-func withParsedToolCall(r *http.Request, resourceID string, arguments map[string]interface{}, id any) *http.Request {
-	parsed := &mcp.ParsedMCPRequest{
-		Method:     "tools/call",
-		ResourceID: resourceID,
-		Arguments:  arguments,
-		ID:         id,
-		IsRequest:  true,
-	}
-	ctx := context.WithValue(r.Context(), mcp.MCPRequestContextKey, parsed)
-	return r.WithContext(ctx)
-}
-
 func TestRateLimitHandler_ToolCallAllowed(t *testing.T) {
 	t.Parallel()
 
 	limiter := &dummyLimiter{decision: &Decision{Allowed: true}}
-	handler := HTTPMiddleware(limiter)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+	handler := rateLimitHandler(limiter)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
 		w.WriteHeader(http.StatusOK)
 	}))
 
@@ -94,7 +82,7 @@ func TestRateLimitHandler_ToolCallRejected(t *testing.T) {
 	t.Parallel()
 
 	limiter := &dummyLimiter{decision: &Decision{Allowed: false, RetryAfter: 5 * time.Second}}
-	handler := HTTPMiddleware(limiter)(http.HandlerFunc(func(http.ResponseWriter, *http.Request) {
+	handler := rateLimitHandler(limiter)(http.HandlerFunc(func(http.ResponseWriter, *http.Request) {
 		t.Fatal("next handler should not be called when rate limited")
 	}))
 
@@ -126,7 +114,7 @@ func TestRateLimitHandler_RedisErrorFailOpen(t *testing.T) {
 
 	limiter := &dummyLimiter{err: errors.New("redis connection refused")}
 	nextCalled := false
-	handler := HTTPMiddleware(limiter)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+	handler := rateLimitHandler(limiter)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
 		nextCalled = true
 		w.WriteHeader(http.StatusOK)
 	}))
@@ -146,7 +134,7 @@ func TestRateLimitHandler_NoParsedMCPRequest_PassesThrough(t *testing.T) {
 
 	limiter := &dummyLimiter{decision: &Decision{Allowed: false}}
 	nextCalled := false
-	handler := HTTPMiddleware(limiter)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+	handler := rateLimitHandler(limiter)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
 		nextCalled = true
 		w.WriteHeader(http.StatusOK)
 	}))
@@ -166,7 +154,7 @@ func TestRateLimitHandler_NonToolCallPassesThrough(t *testing.T) {
 
 	limiter := &dummyLimiter{decision: &Decision{Allowed: false, RetryAfter: time.Second}}
 	nextCalled := false
-	handler := HTTPMiddleware(limiter)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+	handler := rateLimitHandler(limiter)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
 		nextCalled = true
 		w.WriteHeader(http.StatusOK)
 	}))
@@ -185,7 +173,7 @@ func TestRateLimitHandler_PassesUserID(t *testing.T) {
 	t.Parallel()
 
 	recorder := &recordingLimiter{}
-	handler := HTTPMiddleware(recorder)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+	handler := rateLimitHandler(recorder)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
 		w.WriteHeader(http.StatusOK)
 	}))
 
@@ -201,36 +189,11 @@ func TestRateLimitHandler_PassesUserID(t *testing.T) {
 	assert.Equal(t, "alice@example.com", recorder.userID)
 }
 
-func TestRateLimitHandler_OptimizerCallToolUsesInnerToolName(t *testing.T) {
-	t.Parallel()
-
-	recorder := &recordingLimiter{}
-	handler := HTTPMiddleware(recorder)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
-		w.WriteHeader(http.StatusOK)
-	}))
-
-	req := httptest.NewRequest(http.MethodPost, "/mcp", nil)
-	req = withParsedToolCall(req, "call_tool", map[string]interface{}{
-		"tool_name": "backend_a_expensive_tool",
-		"parameters": map[string]interface{}{
-			"query": "test",
-		},
-	}, 1)
-	req = withIdentity(req, "alice@example.com")
-	w := httptest.NewRecorder()
-
-	handler.ServeHTTP(w, req)
-
-	assert.Equal(t, http.StatusOK, w.Code)
-	assert.Equal(t, "backend_a_expensive_tool", recorder.toolName)
-	assert.Equal(t, "alice@example.com", recorder.userID)
-}
-
 func TestRateLimitHandler_NoIdentityPassesEmptyUserID(t *testing.T) {
 	t.Parallel()
 
 	recorder := &recordingLimiter{}
-	handler := HTTPMiddleware(recorder)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+	handler := rateLimitHandler(recorder)(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
 		w.WriteHeader(http.StatusOK)
 	}))
 
diff --git a/pkg/runner/middleware.go b/pkg/runner/middleware.go
index 0d711e4439..be9dd33506 100644
--- a/pkg/runner/middleware.go
+++ b/pkg/runner/middleware.go
@@ -434,7 +434,7 @@ func addRateLimitMiddleware(middlewares []types.MiddlewareConfig, config *RunCon
 	params := ratelimit.MiddlewareParams{
 		Namespace:  config.RateLimitNamespace,
 		ServerName: config.Name,
-		Config:     config.RateLimitConfig.ToInternal(),
+		Config:     config.RateLimitConfig,
 		RedisAddr:  redisAddr,
 		RedisDB:    redisDB,
 	}
diff --git a/pkg/vmcp/cli/serve.go b/pkg/vmcp/cli/serve.go
index 72b72148e6..ffb501115c 100644
--- a/pkg/vmcp/cli/serve.go
+++ b/pkg/vmcp/cli/serve.go
@@ -386,8 +386,6 @@ func Serve(ctx context.Context, cfg ServeConfig) error {
 		OptimizerConfig:         optCfg,
 		SessionFactory:          sessionFactory,
 		SessionStorage:          vmcpCfg.SessionStorage,
-		RateLimiting:            vmcpCfg.RateLimiting,
-		RateLimitNamespace:      os.Getenv("VMCP_NAMESPACE"),
 	}
 
 	// Assign Watcher only when backendWatcher is non-nil. A typed nil
diff --git a/pkg/vmcp/config/config.go b/pkg/vmcp/config/config.go
index 9deabf6cd0..4d46cf9032 100644
--- a/pkg/vmcp/config/config.go
+++ b/pkg/vmcp/config/config.go
@@ -15,7 +15,6 @@ import (
 
 	"github.com/stacklok/toolhive/pkg/audit"
 	thvjson "github.com/stacklok/toolhive/pkg/json"
-	rlconfig "github.com/stacklok/toolhive/pkg/ratelimit/config"
 	"github.com/stacklok/toolhive/pkg/telemetry"
 	"github.com/stacklok/toolhive/pkg/vmcp"
 	authtypes "github.com/stacklok/toolhive/pkg/vmcp/auth/types"
@@ -181,7 +180,7 @@ type Config struct {
 	// This remains YAML-only so it can be written to the runtime config ConfigMap
 	// without exposing spec.config.rateLimiting in the Kubernetes API.
 	// +optional
-	RateLimiting *rlconfig.Config `json:"-" yaml:"rateLimiting,omitempty"`
+	RateLimiting *thvjson.Map `json:"-" yaml:"rateLimiting,omitempty"`
 }
 
 // IncomingAuthConfig configures client authentication to the virtual MCP server.
diff --git a/pkg/vmcp/config/config_test.go b/pkg/vmcp/config/config_test.go
index 5f652ebce0..da9e88aeda 100644
--- a/pkg/vmcp/config/config_test.go
+++ b/pkg/vmcp/config/config_test.go
@@ -4,6 +4,7 @@
 package config
 
 import (
+	"encoding/json"
 	"fmt"
 	"os"
 	"path/filepath"
@@ -15,10 +16,36 @@ import (
 
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
+	"gopkg.in/yaml.v3"
 
+	thvjson "github.com/stacklok/toolhive/pkg/json"
 	authtypes "github.com/stacklok/toolhive/pkg/vmcp/auth/types"
 )
 
+func TestConfigRateLimitingIsYAMLOnly(t *testing.T) {
+	t.Parallel()
+
+	rateLimiting := thvjson.NewMap(map[string]any{
+		"global": map[string]any{
+			"maxTokens":    10,
+			"refillPeriod": "1m0s",
+		},
+	})
+	cfg := Config{
+		Name:         "test-vmcp",
+		RateLimiting: &rateLimiting,
+	}
+
+	jsonBytes, err := json.Marshal(cfg)
+	require.NoError(t, err)
+	assert.NotContains(t, string(jsonBytes), "rateLimiting")
+
+	yamlBytes, err := yaml.Marshal(cfg)
+	require.NoError(t, err)
+	assert.Contains(t, string(yamlBytes), "rateLimiting:")
+	assert.Contains(t, string(yamlBytes), "maxTokens: 10")
+}
+
 func TestOutgoingAuthConfig_ResolveForBackend(t *testing.T) {
 	t.Parallel()
 
diff --git a/pkg/vmcp/server/ratelimit.go b/pkg/vmcp/server/ratelimit.go
deleted file mode 100644
index 95d0c2f16e..0000000000
--- a/pkg/vmcp/server/ratelimit.go
+++ /dev/null
@@ -1,72 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
-// SPDX-License-Identifier: Apache-2.0
-
-package server
-
-import (
-	"context"
-	"fmt"
-	"log/slog"
-	"os"
-	"strings"
-	"time"
-
-	"github.com/redis/go-redis/v9"
-
-	"github.com/stacklok/toolhive/pkg/ratelimit"
-	transporttypes "github.com/stacklok/toolhive/pkg/transport/types"
-	vmcpconfig "github.com/stacklok/toolhive/pkg/vmcp/config"
-)
-
-type httpMiddleware = transporttypes.MiddlewareFunction
-
-type shutdownFunc func(context.Context) error
-
-func (s *Server) buildRateLimitMiddleware(ctx context.Context) (httpMiddleware, shutdownFunc, error) {
-	if s.config.RateLimiting == nil {
-		return nil, nil, nil
-	}
-	if s.config.SessionStorage == nil ||
-		!strings.EqualFold(s.config.SessionStorage.Provider, "redis") ||
-		s.config.SessionStorage.Address == "" {
-		return nil, nil, fmt.Errorf("rate limiting requires sessionStorage with provider redis")
-	}
-
-	client := redis.NewClient(&redis.Options{
-		Addr:     s.config.SessionStorage.Address,
-		DB:       int(s.config.SessionStorage.DB),
-		Password: os.Getenv(vmcpconfig.RedisPasswordEnvVar),
-	})
-
-	pingCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
-	defer cancel()
-	if err := client.Ping(pingCtx).Err(); err != nil {
-		_ = client.Close()
-		return nil, nil, fmt.Errorf("rate limit middleware: failed to connect to Redis at %s: %w",
-			s.config.SessionStorage.Address, err)
-	}
-
-	namespace := s.config.RateLimitNamespace
-	if namespace == "" {
-		namespace = os.Getenv("VMCP_NAMESPACE")
-	}
-	if namespace == "" {
-		namespace = "default"
-	}
-
-	limiter, err := ratelimit.NewLimiter(client, namespace, s.config.Name, s.config.RateLimiting)
-	if err != nil {
-		_ = client.Close()
-		return nil, nil, fmt.Errorf("failed to create rate limiter: %w", err)
-	}
-
-	slog.Info("rate limit middleware enabled for vMCP",
-		"namespace", namespace,
-		"name", s.config.Name,
-		"redis_addr", s.config.SessionStorage.Address,
-		"redis_db", s.config.SessionStorage.DB)
-
-	return ratelimit.HTTPMiddleware(limiter), func(context.Context) error {
-		return client.Close()
-	}, nil
-}
diff --git a/pkg/vmcp/server/ratelimit_test.go b/pkg/vmcp/server/ratelimit_test.go
deleted file mode 100644
index a804a1ace9..0000000000
--- a/pkg/vmcp/server/ratelimit_test.go
+++ /dev/null
@@ -1,76 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
-// SPDX-License-Identifier: Apache-2.0
-
-package server
-
-import (
-	"context"
-	"net/http"
-	"net/http/httptest"
-	"testing"
-	"time"
-
-	"github.com/alicebob/miniredis/v2"
-	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
-	"github.com/stacklok/toolhive/pkg/auth"
-	mcpparser "github.com/stacklok/toolhive/pkg/mcp"
-	rlconfig "github.com/stacklok/toolhive/pkg/ratelimit/config"
-	vmcpconfig "github.com/stacklok/toolhive/pkg/vmcp/config"
-)
-
-func TestBuildRateLimitMiddleware_EnforcesToolsCall(t *testing.T) {
-	t.Parallel()
-
-	redisServer := miniredis.RunT(t)
-	srv := &Server{
-		config: &Config{
-			Name:               "vmcp-a",
-			RateLimitNamespace: "default",
-			SessionStorage: &vmcpconfig.SessionStorageConfig{
-				Provider: "redis",
-				Address:  redisServer.Addr(),
-			},
-			RateLimiting: &rlconfig.Config{
-				PerUser: &rlconfig.Bucket{
-					MaxTokens:    1,
-					RefillPeriod: metav1.Duration{Duration: time.Minute},
-				},
-			},
-		},
-	}
-
-	middleware, closeFn, err := srv.buildRateLimitMiddleware(context.Background())
-	require.NoError(t, err)
-	require.NotNil(t, middleware)
-	t.Cleanup(func() {
-		if closeFn != nil {
-			require.NoError(t, closeFn(context.Background()))
-		}
-	})
-
-	handler := middleware(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
-		w.WriteHeader(http.StatusOK)
-	}))
-
-	req := httptest.NewRequest(http.MethodPost, "/mcp", nil)
-	req = req.WithContext(auth.WithIdentity(req.Context(), &auth.Identity{
-		PrincipalInfo: auth.PrincipalInfo{Subject: "alice"},
-	}))
-	req = req.WithContext(context.WithValue(req.Context(), mcpparser.MCPRequestContextKey, &mcpparser.ParsedMCPRequest{
-		Method:     "tools/call",
-		ResourceID: "backend_a_echo",
-		ID:         1,
-		IsRequest:  true,
-	}))
-
-	first := httptest.NewRecorder()
-	handler.ServeHTTP(first, req)
-	assert.Equal(t, http.StatusOK, first.Code)
-
-	second := httptest.NewRecorder()
-	handler.ServeHTTP(second, req)
-	assert.Equal(t, http.StatusTooManyRequests, second.Code)
-}
diff --git a/pkg/vmcp/server/server.go b/pkg/vmcp/server/server.go
index 694e43b25d..23c415a551 100644
--- a/pkg/vmcp/server/server.go
+++ b/pkg/vmcp/server/server.go
@@ -28,7 +28,6 @@ import (
 	"github.com/stacklok/toolhive/pkg/auth"
 	asrunner "github.com/stacklok/toolhive/pkg/authserver/runner"
 	mcpparser "github.com/stacklok/toolhive/pkg/mcp"
-	rlconfig "github.com/stacklok/toolhive/pkg/ratelimit/config"
 	"github.com/stacklok/toolhive/pkg/recovery"
 	"github.com/stacklok/toolhive/pkg/telemetry"
 	transportmiddleware "github.com/stacklok/toolhive/pkg/transport/middleware"
@@ -182,12 +181,6 @@ type Config struct {
 	// session persistence; the Redis password is read from the
 	// THV_SESSION_REDIS_PASSWORD environment variable.
 	SessionStorage *vmcpconfig.SessionStorageConfig
-
-	// RateLimiting configures rate limiting for incoming tools/call requests.
-	RateLimiting *rlconfig.Config
-
-	// RateLimitNamespace is used with Name to derive Redis keys.
-	RateLimitNamespace string
 }
 
 // Server is the Virtual MCP Server that aggregates multiple backends.
@@ -532,7 +525,7 @@ func New(
 // Each call builds a fresh handler. The method is safe to call multiple times.
 // All returned handlers share the same underlying MCPServer and SessionManager,
 // so callers should not serve concurrent traffic through multiple handlers.
-func (s *Server) Handler(ctx context.Context) (http.Handler, error) {
+func (s *Server) Handler(_ context.Context) (http.Handler, error) {
 	// Create Streamable HTTP server with ToolHive session management
 	streamableServer := server.NewStreamableHTTPServer(
 		s.mcpServer,
@@ -544,25 +537,6 @@ func (s *Server) Handler(ctx context.Context) (http.Handler, error) {
 	// Create HTTP mux with separated authenticated and unauthenticated routes
 	mux := http.NewServeMux()
 
-	s.registerHTTPRoutes(mux)
-
-	// MCP endpoint - apply middleware chain (wrapping order, execution happens in reverse):
-	// Code wraps: auth → MCP-parsing → rate-limit → audit → discovery →
-	//   annotation-enrichment → authz → backend-enrichment → telemetry
-	// Execution order: recovery → header-val → auth → MCP-parsing →
-	//   rate-limit → audit → discovery → annotation-enrichment → authz →
-	//   backend-enrichment → telemetry → handler
-	mcpHandler, err := s.buildMCPHandler(ctx, streamableServer)
-	if err != nil {
-		return nil, err
-	}
-
-	mux.Handle("/", mcpHandler)
-
-	return mux, nil
-}
-
-func (s *Server) registerHTTPRoutes(mux *http.ServeMux) {
 	// Unauthenticated health endpoints
 	mux.HandleFunc("/health", s.handleHealth)
 	mux.HandleFunc("/ping", s.handleHealth)
@@ -596,56 +570,32 @@ func (s *Server) registerHTTPRoutes(mux *http.ServeMux) {
 		s.config.AuthServer.RegisterHandlers(mux)
 		slog.Debug("embedded authorization server routes registered")
 	}
-}
-
-func (s *Server) buildMCPHandler(ctx context.Context, base http.Handler) (http.Handler, error) {
-	mcpHandler := s.wrapMCPHandler(base)
-
-	if err := s.applyAuditMiddleware(&mcpHandler); err != nil {
-		return nil, err
-	}
-	if err := s.applyRateLimitMiddleware(ctx, &mcpHandler); err != nil {
-		return nil, err
-	}
-
-	// Apply MCP parsing middleware to extract JSON-RPC method from request body.
-	// This must run before rate limiting so global limits also work when auth is
-	// disabled, and before telemetry so metrics can use the real MCP method.
-	mcpHandler = mcpparser.ParsingMiddleware(mcpHandler)
-
-	// Apply authentication middleware if configured (runs first in chain)
-	if s.config.AuthMiddleware != nil {
-		mcpHandler = s.config.AuthMiddleware(mcpHandler)
-		slog.Info("authentication middleware enabled for MCP endpoints")
-	}
-
-	// Apply Accept header validation (rejects GET requests without Accept: text/event-stream)
-	mcpHandler = headerValidatingMiddleware(mcpHandler)
 
-	// Clear the write deadline for qualifying SSE connections (GET +
-	// Accept: text/event-stream + MCP endpoint path) so the server-level
-	// WriteTimeout does not kill long-lived SSE streams (see golang/go#16100).
-	// Non-qualifying requests are left untouched; http.Server.WriteTimeout
-	// (defaultWriteTimeout) remains in effect for them.
-	mcpHandler = transportmiddleware.WriteTimeout(s.config.EndpointPath)(mcpHandler)
-
-	// Apply recovery middleware as outermost (catches panics from all inner middleware)
-	mcpHandler = recovery.Middleware(mcpHandler)
-	slog.Info("recovery middleware enabled for MCP endpoints")
-
-	return mcpHandler, nil
-}
+	// MCP endpoint - apply middleware chain (wrapping order, execution happens in reverse):
+	// Code wraps: auth+parser → audit → discovery → annotation-enrichment →
+	//   authz → backend-enrichment → MCP-parsing → telemetry
+	// Execution order: recovery → header-val → auth+parser → audit →
+	//   discovery → annotation-enrichment → authz → backend-enrichment →
+	//   MCP-parsing → telemetry → handler
 
-func (s *Server) wrapMCPHandler(base http.Handler) http.Handler {
-	mcpHandler := base
+	var mcpHandler http.Handler = streamableServer
 
 	if s.config.TelemetryProvider != nil {
 		mcpHandler = s.config.TelemetryProvider.Middleware(s.config.Name, "streamable-http")(mcpHandler)
 		slog.Info("telemetry middleware enabled for MCP endpoints")
 	}
 
-	// Apply backend enrichment middleware if audit is configured.
-	// This runs after discovery populates the routing table, so it can extract backend names.
+	// Apply MCP parsing middleware to extract JSON-RPC method from request body.
+	// This runs before telemetry so that recordMetrics can label metrics with the
+	// actual mcp_method (e.g. "tools/call", "initialize") instead of "unknown".
+	// Note: ParsingMiddleware is also composed inside the auth middleware (for audit/authz).
+	// The second application here is a no-op because the context already holds a
+	// ParsedMCPRequest; it exists only so the telemetry layer works correctly even
+	// when auth middleware is nil.
+	mcpHandler = mcpparser.ParsingMiddleware(mcpHandler)
+
+	// Apply backend enrichment middleware if audit is configured
+	// This runs after discovery populates the routing table, so it can extract backend names
 	if s.config.AuditConfig != nil {
 		mcpHandler = s.backendEnrichmentMiddleware(mcpHandler)
 		slog.Info("backend enrichment middleware enabled for audit events")
@@ -656,63 +606,75 @@ func (s *Server) wrapMCPHandler(base http.Handler) http.Handler {
 	if s.config.AuthzMiddleware != nil {
 		mcpHandler = s.config.AuthzMiddleware(mcpHandler)
 		slog.Info("authorization middleware enabled for MCP endpoints (post-discovery)")
+	}
+
+	// Apply annotation enrichment middleware (runs after discovery, before authz in execution).
+	// Reads tool annotations from discovered capabilities and injects them into the
+	// request context so the authz middleware can make annotation-aware decisions.
+	if s.config.AuthzMiddleware != nil {
 		mcpHandler = AnnotationEnrichmentMiddleware(mcpHandler)
 		slog.Info("annotation enrichment middleware enabled for MCP endpoints")
 	}
 
-	healthStatusProvider := s.getHealthStatusProvider()
+	// Apply discovery middleware (runs after audit/auth middleware)
+	// Discovery middleware performs per-request capability aggregation with user context.
+	// vmcpSessionMgr (MultiSessionGetter) is used to retrieve the fully-formed MultiSession
+	// for subsequent requests so the routing table can be injected into context.
+	// The backend registry provides a dynamic backend list (supports DynamicRegistry for K8s).
+	// The health monitor enables filtering based on current health status (respects circuit breaker).
+	s.healthMonitorMu.RLock()
+	healthMon := s.healthMonitor
+	s.healthMonitorMu.RUnlock()
+
+	var healthStatusProvider health.StatusProvider
+	if healthMon != nil {
+		healthStatusProvider = healthMon
+	}
 	mcpHandler = discovery.Middleware(
 		s.discoveryMgr, s.backendRegistry, s.vmcpSessionMgr, healthStatusProvider,
 		discovery.WithSessionScopedRouting(),
 	)(mcpHandler)
 	slog.Info("discovery middleware enabled for lazy per-user capability discovery")
 
-	return mcpHandler
-}
-
-func (s *Server) getHealthStatusProvider() health.StatusProvider {
-	s.healthMonitorMu.RLock()
-	defer s.healthMonitorMu.RUnlock()
-
-	if s.healthMonitor == nil {
-		return nil
+	// Apply audit middleware if configured (runs after auth, before discovery)
+	if s.config.AuditConfig != nil {
+		if err := s.config.AuditConfig.Validate(); err != nil {
+			return nil, fmt.Errorf("invalid audit configuration: %w", err)
+		}
+		auditor, err := audit.NewAuditorWithTransport(
+			s.config.AuditConfig,
+			"streamable-http", // vMCP uses streamable HTTP transport
+		)
+		if err != nil {
+			return nil, fmt.Errorf("failed to create auditor: %w", err)
+		}
+		mcpHandler = auditor.Middleware(mcpHandler)
+		slog.Info("audit middleware enabled for MCP endpoints")
 	}
-	return s.healthMonitor
-}
 
-func (s *Server) applyAuditMiddleware(mcpHandler *http.Handler) error {
-	if s.config.AuditConfig == nil {
-		return nil
-	}
-	if err := s.config.AuditConfig.Validate(); err != nil {
-		return fmt.Errorf("invalid audit configuration: %w", err)
-	}
-	auditor, err := audit.NewAuditorWithTransport(
-		s.config.AuditConfig,
-		"streamable-http", // vMCP uses streamable HTTP transport
-	)
-	if err != nil {
-		return fmt.Errorf("failed to create auditor: %w", err)
+	// Apply authentication middleware if configured (runs first in chain)
+	if s.config.AuthMiddleware != nil {
+		mcpHandler = s.config.AuthMiddleware(mcpHandler)
+		slog.Info("authentication middleware enabled for MCP endpoints")
 	}
-	*mcpHandler = auditor.Middleware(*mcpHandler)
-	slog.Info("audit middleware enabled for MCP endpoints")
-	return nil
-}
 
-func (s *Server) applyRateLimitMiddleware(ctx context.Context, mcpHandler *http.Handler) error {
-	rateLimitMiddleware, closeRateLimit, err := s.buildRateLimitMiddleware(ctx)
-	if err != nil {
-		return err
-	}
-	if rateLimitMiddleware == nil {
-		return nil
-	}
+	// Apply Accept header validation (rejects GET requests without Accept: text/event-stream)
+	mcpHandler = headerValidatingMiddleware(mcpHandler)
 
-	*mcpHandler = rateLimitMiddleware(*mcpHandler)
-	if closeRateLimit != nil {
-		s.shutdownFuncs = append(s.shutdownFuncs, closeRateLimit)
-	}
-	return nil
+	// Clear the write deadline for qualifying SSE connections (GET +
+	// Accept: text/event-stream + MCP endpoint path) so the server-level
+	// WriteTimeout does not kill long-lived SSE streams (see golang/go#16100).
+	// Non-qualifying requests are left untouched; http.Server.WriteTimeout
+	// (defaultWriteTimeout) remains in effect for them.
+	mcpHandler = transportmiddleware.WriteTimeout(s.config.EndpointPath)(mcpHandler)
+
+	// Apply recovery middleware as outermost (catches panics from all inner middleware)
+	mcpHandler = recovery.Middleware(mcpHandler)
+	slog.Info("recovery middleware enabled for MCP endpoints")
+
+	mux.Handle("/", mcpHandler)
+
+	return mux, nil
 }
 
 // Start starts the Virtual MCP Server and begins serving requests.
diff --git a/test/e2e/thv-operator/virtualmcp/virtualmcp_circuit_breaker_test.go b/test/e2e/thv-operator/virtualmcp/virtualmcp_circuit_breaker_test.go
index 12cebf47d2..61a24c86be 100644
--- a/test/e2e/thv-operator/virtualmcp/virtualmcp_circuit_breaker_test.go
+++ b/test/e2e/thv-operator/virtualmcp/virtualmcp_circuit_breaker_test.go
@@ -10,6 +10,7 @@ import (
 
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
+	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
@@ -460,9 +461,30 @@ var _ = Describe("VirtualMCPServer Circuit Breaker Lifecycle", Ordered, func() {
 		backend.Spec.Image = images.YardstickServerImage
 		Expect(k8sClient.Update(ctx, backend)).To(Succeed())
 
+		By("Waiting for backend StatefulSet template to use the fixed image")
+		Eventually(func() error {
+			sts := &appsv1.StatefulSet{}
+			if err := k8sClient.Get(ctx, types.NamespacedName{
+				Name:      backend2Name,
+				Namespace: testNamespace,
+			}, sts); err != nil {
+				return err
+			}
+			for _, container := range sts.Spec.Template.Spec.Containers {
+				if container.Name == "mcp" {
+					if container.Image != images.YardstickServerImage {
+						return fmt.Errorf("statefulset still has image %q", container.Image)
+					}
+					return nil
+				}
+			}
+			return fmt.Errorf("mcp container not found in statefulset template")
+		}, timeout, pollingInterval).Should(Succeed())
+
 		By("Deleting stuck pods to force recreation with fixed image")
 		// Pods in ImagePullBackOff don't automatically recreate when image is fixed
-		// Delete them to force the statefulset to create new pods with the correct image
+		// Delete them after the statefulset template is updated, otherwise the old template
+		// can immediately recreate the pod with the broken image again.
 		podList := &corev1.PodList{}
 		Expect(k8sClient.List(ctx, podList,
 			client.InNamespace(testNamespace),
diff --git a/test/e2e/thv-operator/virtualmcp/virtualmcp_ratelimit_test.go b/test/e2e/thv-operator/virtualmcp/virtualmcp_ratelimit_test.go
deleted file mode 100644
index de9900ba6c..0000000000
--- a/test/e2e/thv-operator/virtualmcp/virtualmcp_ratelimit_test.go
+++ /dev/null
@@ -1,248 +0,0 @@
-// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
-// SPDX-License-Identifier: Apache-2.0
-
-package virtualmcp
-
-import (
-	"bytes"
-	"context"
-	"encoding/json"
-	"fmt"
-	"io"
-	"net/http"
-	"time"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-
-	mcpv1beta1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1beta1"
-	"github.com/stacklok/toolhive/test/e2e/images"
-)
-
-var _ = Describe("VirtualMCPServer Rate Limiting", Ordered, func() {
-	const (
-		timeout         = 3 * time.Minute
-		pollingInterval = 1 * time.Second
-	)
-
-	var (
-		httpClient   *http.Client
-		mcpGroupName string
-		backendName  string
-		vmcpName     string
-		oidcName     string
-		oidcConfig   string
-		redisName    string
-		oidcNodePort int32
-		vmcpNodePort int32
-		cleanupOIDC  func()
-	)
-
-	BeforeAll(func() {
-		httpClient = &http.Client{Timeout: 30 * time.Second}
-
-		timestamp := time.Now().UnixNano()
-		mcpGroupName = fmt.Sprintf("e2e-vmcp-rl-group-%d", timestamp)
-		backendName = fmt.Sprintf("e2e-vmcp-rl-backend-%d", timestamp)
-		vmcpName = fmt.Sprintf("e2e-vmcp-rl-%d", timestamp)
-		oidcName = fmt.Sprintf("e2e-vmcp-rl-oidc-%d", timestamp)
-		oidcConfig = fmt.Sprintf("e2e-vmcp-rl-oidc-config-%d", timestamp)
-		redisName = fmt.Sprintf("e2e-vmcp-rl-redis-%d", timestamp)
-
-		By("Deploying Redis for vMCP session storage and rate limiting")
-		deployRedis(redisName)
-
-		By("Deploying parameterized OIDC server for per-user identity")
-		var issuerURL string
-		issuerURL, oidcNodePort, cleanupOIDC = DeployParameterizedOIDCServer(
-			ctx, k8sClient, oidcName, defaultNamespace, timeout, pollingInterval)
-
-		By("Creating MCPOIDCConfig for vMCP incoming auth")
-		Expect(k8sClient.Create(ctx, &mcpv1beta1.MCPOIDCConfig{
-			ObjectMeta: metav1.ObjectMeta{Name: oidcConfig, Namespace: defaultNamespace},
-			Spec: mcpv1beta1.MCPOIDCConfigSpec{
-				Type: mcpv1beta1.MCPOIDCConfigTypeInline,
-				Inline: &mcpv1beta1.InlineOIDCSharedConfig{
-					Issuer:                          issuerURL,
-					InsecureAllowHTTP:               true,
-					JWKSAllowPrivateIP:              true,
-					ProtectedResourceAllowPrivateIP: true,
-				},
-			},
-		})).To(Succeed())
-
-		By("Creating MCPGroup and yardstick backend")
-		CreateMCPGroupAndWait(ctx, k8sClient, mcpGroupName, defaultNamespace,
-			"VirtualMCPServer rate limit e2e group", timeout, pollingInterval)
-		CreateMCPServerAndWait(ctx, k8sClient, backendName, defaultNamespace, mcpGroupName,
-			images.YardstickServerImage, timeout, pollingInterval)
-
-		By("Creating VirtualMCPServer with per-user rate limit")
-		redisAddr := fmt.Sprintf("%s.%s.svc.cluster.local:6379", redisName, defaultNamespace)
-		Expect(k8sClient.Create(ctx, &mcpv1beta1.VirtualMCPServer{
-			ObjectMeta: metav1.ObjectMeta{Name: vmcpName, Namespace: defaultNamespace},
-			Spec: mcpv1beta1.VirtualMCPServerSpec{
-				GroupRef: &mcpv1beta1.MCPGroupRef{Name: mcpGroupName},
-				IncomingAuth: &mcpv1beta1.IncomingAuthConfig{
-					Type: "oidc",
-					OIDCConfigRef: &mcpv1beta1.MCPOIDCConfigReference{
-						Name:     oidcConfig,
-						Audience: "vmcp-audience",
-					},
-				},
-				ServiceType: "NodePort",
-				SessionStorage: &mcpv1beta1.SessionStorageConfig{
-					Provider: mcpv1beta1.SessionStorageProviderRedis,
-					Address:  redisAddr,
-				},
-				RateLimiting: &mcpv1beta1.RateLimitConfig{
-					PerUser: &mcpv1beta1.RateLimitBucket{
-						MaxTokens:    2,
-						RefillPeriod: metav1.Duration{Duration: time.Minute},
-					},
-				},
-			},
-		})).To(Succeed())
-
-		By("Waiting for VirtualMCPServer to be ready")
-		WaitForVirtualMCPServerReady(ctx, k8sClient, vmcpName, defaultNamespace, timeout, pollingInterval)
-		WaitForCondition(ctx, k8sClient, vmcpName, defaultNamespace, "BackendsDiscovered", "True", timeout, pollingInterval)
-		vmcpNodePort = GetVMCPNodePort(ctx, k8sClient, vmcpName, defaultNamespace, timeout, pollingInterval)
-	})
-
-	AfterAll(func() {
-		_ = k8sClient.Delete(ctx, &mcpv1beta1.VirtualMCPServer{
-			ObjectMeta: metav1.ObjectMeta{Name: vmcpName, Namespace: defaultNamespace},
-		})
-		_ = k8sClient.Delete(ctx, &mcpv1beta1.MCPServer{
-			ObjectMeta: metav1.ObjectMeta{Name: backendName, Namespace: defaultNamespace},
-		})
-		_ = k8sClient.Delete(ctx, &mcpv1beta1.MCPGroup{
-			ObjectMeta: metav1.ObjectMeta{Name: mcpGroupName, Namespace: defaultNamespace},
-		})
-		_ = k8sClient.Delete(ctx, &mcpv1beta1.MCPOIDCConfig{
-			ObjectMeta: metav1.ObjectMeta{Name: oidcConfig, Namespace: defaultNamespace},
-		})
-		cleanupRedis(redisName)
-		if cleanupOIDC != nil {
-			cleanupOIDC()
-		}
-	})
-
-	It("rejects tools/call after the per-user limit is exceeded", func() {
-		token := getVMCPRateLimitOIDCToken(ctx, httpClient, oidcNodePort, "vmcp-rate-limit-user")
-		sessionID := sendVMCPRateLimitInitialize(ctx, httpClient, vmcpNodePort, token)
-
-		By("Sending requests within the per-user rate limit")
-		for i := range 2 {
-			status, body, _ := sendVMCPRateLimitToolCall(ctx, httpClient, vmcpNodePort, "echo", i+1, token, sessionID)
-			Expect(status).To(Equal(http.StatusOK),
-				"request %d should succeed, got status %d: %s", i+1, status, string(body))
-		}
-
-		By("Sending request that exceeds the per-user rate limit")
-		status, body, retryAfter := sendVMCPRateLimitToolCall(ctx, httpClient, vmcpNodePort, "echo", 3, token, sessionID)
-		Expect(status).To(Equal(http.StatusTooManyRequests),
-			"third request should be rate limited, body: %s", string(body))
-		Expect(retryAfter).ToNot(BeEmpty(), "Retry-After header should be set")
-
-		var rpcResp map[string]any
-		Expect(json.Unmarshal(body, &rpcResp)).To(Succeed())
-		errObj, ok := rpcResp["error"].(map[string]any)
-		Expect(ok).To(BeTrue(), "response should include JSON-RPC error")
-		Expect(errObj["code"]).To(BeEquivalentTo(-32029))
-		Expect(errObj["message"]).To(Equal("Rate limit exceeded"))
-	})
-})
-
-func getVMCPRateLimitOIDCToken(ctx context.Context, httpClient *http.Client, oidcNodePort int32, subject string) string {
-	url := fmt.Sprintf("http://localhost:%d/token?subject=%s", oidcNodePort, subject)
-	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, nil)
-	Expect(err).ToNot(HaveOccurred())
-
-	resp, err := httpClient.Do(req)
-	Expect(err).ToNot(HaveOccurred())
-	defer func() { _ = resp.Body.Close() }()
-
-	var tokenResp struct {
-		AccessToken string `json:"access_token"`
-	}
-	Expect(json.NewDecoder(resp.Body).Decode(&tokenResp)).To(Succeed())
-	Expect(tokenResp.AccessToken).ToNot(BeEmpty(), "OIDC server should return a token")
-	return tokenResp.AccessToken
-}
-
-func sendVMCPRateLimitInitialize(
-	ctx context.Context, httpClient *http.Client, port int32, bearerToken string,
-) string {
-	reqBody := map[string]any{
-		"jsonrpc": "2.0",
-		"id":      0,
-		"method":  "initialize",
-		"params": map[string]any{
-			"protocolVersion": "2025-03-26",
-			"capabilities":    map[string]any{},
-			"clientInfo": map[string]any{
-				"name":    "vmcp-rate-limit-e2e",
-				"version": "1.0.0",
-			},
-		},
-	}
-	bodyBytes, err := json.Marshal(reqBody)
-	Expect(err).ToNot(HaveOccurred())
-
-	url := fmt.Sprintf("http://localhost:%d/mcp", port)
-	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(bodyBytes))
-	Expect(err).ToNot(HaveOccurred())
-	req.Header.Set("Content-Type", "application/json")
-	req.Header.Set("Accept", "application/json, text/event-stream")
-	req.Header.Set("Authorization", "Bearer "+bearerToken)
-
-	resp, err := httpClient.Do(req)
-	Expect(err).ToNot(HaveOccurred())
-	defer func() { _ = resp.Body.Close() }()
-	Expect(resp.StatusCode).To(Equal(http.StatusOK), "initialize should succeed")
-
-	sessionID := resp.Header.Get("Mcp-Session-Id")
-	Expect(sessionID).ToNot(BeEmpty(), "initialize response should include Mcp-Session-Id header")
-	return sessionID
-}
-
-func sendVMCPRateLimitToolCall(
-	ctx context.Context,
-	httpClient *http.Client,
-	port int32,
-	toolName string,
-	requestID int,
-	bearerToken string,
-	sessionID string,
-) (int, []byte, string) {
-	reqBody := map[string]any{
-		"jsonrpc": "2.0",
-		"id":      requestID,
-		"method":  "tools/call",
-		"params": map[string]any{
-			"name":      toolName,
-			"arguments": map[string]any{"input": "test"},
-		},
-	}
-	bodyBytes, err := json.Marshal(reqBody)
-	Expect(err).ToNot(HaveOccurred())
-
-	url := fmt.Sprintf("http://localhost:%d/mcp", port)
-	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(bodyBytes))
-	Expect(err).ToNot(HaveOccurred())
-	req.Header.Set("Content-Type", "application/json")
-	req.Header.Set("Accept", "application/json, text/event-stream")
-	req.Header.Set("Authorization", "Bearer "+bearerToken)
-	req.Header.Set("Mcp-Session-Id", sessionID)
-
-	resp, err := httpClient.Do(req)
-	Expect(err).ToNot(HaveOccurred())
-	defer func() { _ = resp.Body.Close() }()
-
-	respBody, err := io.ReadAll(resp.Body)
-	Expect(err).ToNot(HaveOccurred())
-	return resp.StatusCode, respBody, resp.Header.Get("Retry-After")
-}

From f96d0af3518ba34d8130a2e1d7e37c0d0faa340e Mon Sep 17 00:00:00 2001
From: Sanskarzz <sanskar.gur@gmail.com>
Date: Thu, 30 Apr 2026 03:41:59 +0530
Subject: [PATCH 3/4] Address vMCP rate limit review feedback

Signed-off-by: Sanskarzz <sanskar.gur@gmail.com>
---
 .../api/v1beta1/mcpserver_types.go            |  22 +--
 .../api/v1beta1/mcpserver_types_test.go       |  13 +-
 .../api/v1beta1/zz_generated.deepcopy.go      |  12 +-
 .../virtualmcpserver_vmcpconfig.go            |  13 +-
 .../virtualmcpserver_vmcpconfig_test.go       |  23 ++--
 cmd/thv-operator/pkg/vmcpconfig/converter.go  |  25 ----
 .../pkg/vmcpconfig/converter_test.go          |  91 -------------
 ...irtualmcpserver_sessionstorage_cel_test.go |   2 +-
 .../toolhive.stacklok.dev_mcpservers.yaml     | 128 +++---------------
 ...olhive.stacklok.dev_virtualmcpservers.yaml | 128 +++---------------
 .../toolhive.stacklok.dev_mcpservers.yaml     | 128 +++---------------
 ...olhive.stacklok.dev_virtualmcpservers.yaml | 128 +++---------------
 docs/operator/crd-api.md                      |  10 +-
 docs/server/docs.go                           |   6 -
 docs/server/swagger.json                      |   6 -
 docs/server/swagger.yaml                      |   4 -
 pkg/ratelimit/internal/bucket/bucket.go       |   2 +-
 pkg/ratelimit/limiter.go                      |  32 +----
 pkg/ratelimit/limiter_test.go                 |  45 +-----
 pkg/vmcp/config/config.go                     |   8 --
 pkg/vmcp/config/config_test.go                |  27 ----
 pkg/vmcp/config/zz_generated.deepcopy.go      |   4 -
 22 files changed, 120 insertions(+), 737 deletions(-)

diff --git a/cmd/thv-operator/api/v1beta1/mcpserver_types.go b/cmd/thv-operator/api/v1beta1/mcpserver_types.go
index ad9dc96b0f..35fb76fe5b 100644
--- a/cmd/thv-operator/api/v1beta1/mcpserver_types.go
+++ b/cmd/thv-operator/api/v1beta1/mcpserver_types.go
@@ -505,15 +505,11 @@ type SessionStorageConfig struct {
 // RateLimitConfig defines rate limiting configuration for an MCP server.
 // At least one of shared, perUser, or tools must be configured.
 //
-// +kubebuilder:validation:XValidation:rule="has(self.global) || has(self.shared) || has(self.perUser) || (has(self.tools) && size(self.tools) > 0)",message="at least one of global, shared, perUser, or tools must be configured"
+// +kubebuilder:validation:XValidation:rule="has(self.shared) || has(self.perUser) || (has(self.tools) && size(self.tools) > 0)",message="at least one of shared, perUser, or tools must be configured"
 //
-//nolint:lll // CEL validation rules exceed line length limit
+//nolint:lll // kubebuilder marker exceeds line length
 type RateLimitConfig struct {
-	// Global is a token bucket shared across all users for the entire server.
-	// +optional
-	Global *RateLimitBucket `json:"global,omitempty" yaml:"global,omitempty"`
-
-	// Shared is a deprecated alias for Global. Use global instead.
+	// Shared is a token bucket shared across all users for the entire server.
 	// +optional
 	Shared *RateLimitBucket `json:"shared,omitempty" yaml:"shared,omitempty"`
 
@@ -534,7 +530,7 @@ type RateLimitConfig struct {
 }
 
 // RateLimitBucket defines a token bucket configuration with a maximum capacity
-// and a refill period. Used by both shared (global) and per-user rate limits.
+// and a refill period. Used by both shared and per-user rate limits.
 type RateLimitBucket struct {
 	// MaxTokens is the maximum number of tokens (bucket capacity).
 	// This is also the burst size: the maximum number of requests that can be served
@@ -551,9 +547,9 @@ type RateLimitBucket struct {
 }
 
 // ToolRateLimitConfig defines rate limits for a specific tool.
-// At least one of global, shared, or perUser must be configured.
+// At least one of shared or perUser must be configured.
 //
-// +kubebuilder:validation:XValidation:rule="has(self.global) || has(self.shared) || has(self.perUser)",message="at least one of global, shared, or perUser must be configured"
+// +kubebuilder:validation:XValidation:rule="has(self.shared) || has(self.perUser)",message="at least one of shared or perUser must be configured"
 //
 //nolint:lll // kubebuilder marker exceeds line length
 type ToolRateLimitConfig struct {
@@ -562,11 +558,7 @@ type ToolRateLimitConfig struct {
 	// +kubebuilder:validation:MinLength=1
 	Name string `json:"name" yaml:"name"`
 
-	// Global token bucket for this specific tool.
-	// +optional
-	Global *RateLimitBucket `json:"global,omitempty" yaml:"global,omitempty"`
-
-	// Shared is a deprecated alias for Global. Use global instead.
+	// Shared token bucket for this specific tool.
 	// +optional
 	Shared *RateLimitBucket `json:"shared,omitempty" yaml:"shared,omitempty"`
 
diff --git a/cmd/thv-operator/api/v1beta1/mcpserver_types_test.go b/cmd/thv-operator/api/v1beta1/mcpserver_types_test.go
index 25506fb4dd..4a886bb74f 100644
--- a/cmd/thv-operator/api/v1beta1/mcpserver_types_test.go
+++ b/cmd/thv-operator/api/v1beta1/mcpserver_types_test.go
@@ -75,13 +75,6 @@ func TestRateLimitConfigJSONRoundtrip(t *testing.T) {
 		input    RateLimitConfig
 		wantJSON string
 	}{
-		{
-			name: "global only",
-			input: RateLimitConfig{
-				Global: &RateLimitBucket{MaxTokens: 100, RefillPeriod: metav1.Duration{Duration: time.Minute}},
-			},
-			wantJSON: `{"global":{"maxTokens":100,"refillPeriod":"1m0s"}}`,
-		},
 		{
 			name: "shared only",
 			input: RateLimitConfig{
@@ -134,7 +127,7 @@ func TestVirtualMCPServerSpecRateLimitingJSONRoundtrip(t *testing.T) {
 			Address:  "redis.default.svc.cluster.local:6379",
 		},
 		RateLimiting: &RateLimitConfig{
-			Global: &RateLimitBucket{MaxTokens: 10, RefillPeriod: metav1.Duration{Duration: time.Minute}},
+			Shared: &RateLimitBucket{MaxTokens: 10, RefillPeriod: metav1.Duration{Duration: time.Minute}},
 			PerUser: &RateLimitBucket{
 				MaxTokens:    2,
 				RefillPeriod: metav1.Duration{Duration: time.Minute},
@@ -142,7 +135,7 @@ func TestVirtualMCPServerSpecRateLimitingJSONRoundtrip(t *testing.T) {
 			Tools: []ToolRateLimitConfig{
 				{
 					Name: "backend_a_echo",
-					Global: &RateLimitBucket{
+					Shared: &RateLimitBucket{
 						MaxTokens:    5,
 						RefillPeriod: metav1.Duration{Duration: 30 * time.Second},
 					},
@@ -155,7 +148,7 @@ func TestVirtualMCPServerSpecRateLimitingJSONRoundtrip(t *testing.T) {
 	require.NoError(t, err)
 	out := string(b)
 	assert.Contains(t, out, `"rateLimiting"`)
-	assert.Contains(t, out, `"global"`)
+	assert.Contains(t, out, `"shared"`)
 	assert.Contains(t, out, `"perUser"`)
 	assert.Contains(t, out, `"backend_a_echo"`)
 	assert.NotContains(t, out, `"config":{"rateLimiting"`)
diff --git a/cmd/thv-operator/api/v1beta1/zz_generated.deepcopy.go b/cmd/thv-operator/api/v1beta1/zz_generated.deepcopy.go
index fd3706419d..6a58686950 100644
--- a/cmd/thv-operator/api/v1beta1/zz_generated.deepcopy.go
+++ b/cmd/thv-operator/api/v1beta1/zz_generated.deepcopy.go
@@ -23,7 +23,7 @@ package v1beta1
 import (
 	corev1 "k8s.io/api/core/v1"
 	apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
-	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 )
 
@@ -2210,11 +2210,6 @@ func (in *RateLimitBucket) DeepCopy() *RateLimitBucket {
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *RateLimitConfig) DeepCopyInto(out *RateLimitConfig) {
 	*out = *in
-	if in.Global != nil {
-		in, out := &in.Global, &out.Global
-		*out = new(RateLimitBucket)
-		**out = **in
-	}
 	if in.Shared != nil {
 		in, out := &in.Shared, &out.Shared
 		*out = new(RateLimitBucket)
@@ -2669,11 +2664,6 @@ func (in *ToolOverride) DeepCopy() *ToolOverride {
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *ToolRateLimitConfig) DeepCopyInto(out *ToolRateLimitConfig) {
 	*out = *in
-	if in.Global != nil {
-		in, out := &in.Global, &out.Global
-		*out = new(RateLimitBucket)
-		**out = **in
-	}
 	if in.Shared != nil {
 		in, out := &in.Shared, &out.Shared
 		*out = new(RateLimitBucket)
diff --git a/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig.go b/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig.go
index b4c4956371..7a37984b56 100644
--- a/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig.go
+++ b/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig.go
@@ -82,9 +82,14 @@ func (r *VirtualMCPServerReconciler) ensureVmcpConfigConfigMap(
 	}
 
 	// Marshal the serializable Config to YAML for storage in ConfigMap.
+	// RateLimiting is a top-level VirtualMCPServer spec field, not part of
+	// spec.config, so add it only when writing the runtime ConfigMap.
 	// Note: gopkg.in/yaml.v3 produces deterministic output by sorting map keys alphabetically.
 	// This ensures stable checksums for triggering pod rollouts only when content actually changes.
-	vmcpConfigYAML, err := yaml.Marshal(config)
+	vmcpConfigYAML, err := yaml.Marshal(vmcpConfigMapConfig{
+		Config:       *config,
+		RateLimiting: vmcp.Spec.RateLimiting,
+	})
 	if err != nil {
 		return fmt.Errorf("failed to marshal vmcp config: %w", err)
 	}
@@ -130,6 +135,12 @@ func (r *VirtualMCPServerReconciler) ensureVmcpConfigConfigMap(
 	return nil
 }
 
+type vmcpConfigMapConfig struct {
+	vmcpconfig.Config `yaml:",inline"`
+
+	RateLimiting *mcpv1beta1.RateLimitConfig `yaml:"rateLimiting,omitempty"`
+}
+
 // populateOptimizerEmbeddingService wires the EmbeddingServer URL into the optimizer
 // config and emits warnings for non-recommended configurations.
 //
diff --git a/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig_test.go b/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig_test.go
index e671e9083b..5e7c095cc5 100644
--- a/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig_test.go
+++ b/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig_test.go
@@ -471,7 +471,7 @@ func TestEnsureVmcpConfigConfigMap(t *testing.T) {
 				Tools: []mcpv1beta1.ToolRateLimitConfig{
 					{
 						Name: "backend_a_echo",
-						Global: &mcpv1beta1.RateLimitBucket{
+						Shared: &mcpv1beta1.RateLimitBucket{
 							MaxTokens:    5,
 							RefillPeriod: metav1.Duration{Duration: time.Minute},
 						},
@@ -527,23 +527,16 @@ func TestEnsureVmcpConfigConfigMap(t *testing.T) {
 	assert.Contains(t, cm.Data, "config.yaml")
 	assert.NotEmpty(t, cm.Annotations["toolhive.stacklok.dev/content-checksum"])
 
-	var cfg vmcpconfig.Config
+	var cfg vmcpConfigMapConfig
 	require.NoError(t, yaml.Unmarshal([]byte(cm.Data["config.yaml"]), &cfg))
 	require.NotNil(t, cfg.RateLimiting, "runtime config must include spec.rateLimiting")
 
-	rateLimiting := cfg.RateLimiting.Get()
-	perUser, ok := rateLimiting["perUser"].(map[string]any)
-	require.True(t, ok)
-	assert.EqualValues(t, 2, perUser["maxTokens"])
-	tools, ok := rateLimiting["tools"].([]any)
-	require.True(t, ok)
-	require.Len(t, tools, 1)
-	tool, ok := tools[0].(map[string]any)
-	require.True(t, ok)
-	assert.Equal(t, "backend_a_echo", tool["name"])
-	global, ok := tool["global"].(map[string]any)
-	require.True(t, ok)
-	assert.EqualValues(t, 5, global["maxTokens"])
+	require.NotNil(t, cfg.RateLimiting.PerUser)
+	assert.EqualValues(t, 2, cfg.RateLimiting.PerUser.MaxTokens)
+	require.Len(t, cfg.RateLimiting.Tools, 1)
+	assert.Equal(t, "backend_a_echo", cfg.RateLimiting.Tools[0].Name)
+	require.NotNil(t, cfg.RateLimiting.Tools[0].Shared)
+	assert.EqualValues(t, 5, cfg.RateLimiting.Tools[0].Shared.MaxTokens)
 }
 
 // TestSetAuthConfigConditions tests that auth config conditions reflect the current state
diff --git a/cmd/thv-operator/pkg/vmcpconfig/converter.go b/cmd/thv-operator/pkg/vmcpconfig/converter.go
index 1b0a5ae140..cd31af6d69 100644
--- a/cmd/thv-operator/pkg/vmcpconfig/converter.go
+++ b/cmd/thv-operator/pkg/vmcpconfig/converter.go
@@ -6,7 +6,6 @@ package vmcpconfig
 
 import (
 	"context"
-	"encoding/json"
 	"fmt"
 
 	"github.com/go-logr/logr"
@@ -20,7 +19,6 @@ import (
 	"github.com/stacklok/toolhive/cmd/thv-operator/pkg/oidc"
 	"github.com/stacklok/toolhive/cmd/thv-operator/pkg/spectoconfig"
 	"github.com/stacklok/toolhive/pkg/authserver"
-	thvjson "github.com/stacklok/toolhive/pkg/json"
 	"github.com/stacklok/toolhive/pkg/telemetry"
 	"github.com/stacklok/toolhive/pkg/vmcp/auth/converters"
 	authtypes "github.com/stacklok/toolhive/pkg/vmcp/auth/types"
@@ -142,13 +140,6 @@ func (c *Converter) Convert(
 	}
 
 	config.SessionStorage = convertSessionStorage(vmcp)
-	if vmcp.Spec.RateLimiting != nil {
-		rateLimiting, err := convertRateLimiting(vmcp.Spec.RateLimiting)
-		if err != nil {
-			return nil, nil, fmt.Errorf("failed to convert rate limiting: %w", err)
-		}
-		config.RateLimiting = rateLimiting
-	}
 
 	// Apply operational defaults (fills missing values)
 	config.EnsureOperationalDefaults()
@@ -166,22 +157,6 @@ func (c *Converter) Convert(
 	return config, authServerRC, nil
 }
 
-func convertRateLimiting(rateLimiting *mcpv1beta1.RateLimitConfig) (*thvjson.Map, error) {
-	if rateLimiting == nil {
-		return nil, nil
-	}
-	raw, err := json.Marshal(rateLimiting)
-	if err != nil {
-		return nil, err
-	}
-	var value map[string]any
-	if err := json.Unmarshal(raw, &value); err != nil {
-		return nil, err
-	}
-	converted := thvjson.NewMap(value)
-	return &converted, nil
-}
-
 // convertIncomingAuth converts IncomingAuthConfig from CRD to vmcp config.
 func (c *Converter) convertIncomingAuth(
 	ctx context.Context,
diff --git a/cmd/thv-operator/pkg/vmcpconfig/converter_test.go b/cmd/thv-operator/pkg/vmcpconfig/converter_test.go
index 6e195e9b9e..cee72256af 100644
--- a/cmd/thv-operator/pkg/vmcpconfig/converter_test.go
+++ b/cmd/thv-operator/pkg/vmcpconfig/converter_test.go
@@ -97,97 +97,6 @@ func newTestConverterWithObjects(t *testing.T, resolver oidc.Resolver, objects .
 	return converter
 }
 
-func TestConverter_RateLimitingFromTopLevelSpec(t *testing.T) {
-	t.Parallel()
-
-	vmcp := &mcpv1beta1.VirtualMCPServer{
-		ObjectMeta: metav1.ObjectMeta{Name: "test-vmcp", Namespace: "default"},
-		Spec: mcpv1beta1.VirtualMCPServerSpec{
-			GroupRef:     &mcpv1beta1.MCPGroupRef{Name: "test-group"},
-			IncomingAuth: &mcpv1beta1.IncomingAuthConfig{Type: "anonymous"},
-			RateLimiting: &mcpv1beta1.RateLimitConfig{
-				Global: &mcpv1beta1.RateLimitBucket{
-					MaxTokens:    10,
-					RefillPeriod: metav1.Duration{Duration: time.Minute},
-				},
-				Tools: []mcpv1beta1.ToolRateLimitConfig{
-					{
-						Name: "backend_a_echo",
-						Global: &mcpv1beta1.RateLimitBucket{
-							MaxTokens:    1,
-							RefillPeriod: metav1.Duration{Duration: time.Minute},
-						},
-					},
-				},
-			},
-		},
-	}
-
-	converter := newTestConverter(t, newNoOpMockResolver(t))
-	cfg, _, err := converter.Convert(context.Background(), vmcp, nil)
-	require.NoError(t, err)
-	require.NotNil(t, cfg.RateLimiting)
-
-	rateLimiting := cfg.RateLimiting.Get()
-	global, ok := rateLimiting["global"].(map[string]any)
-	require.True(t, ok)
-	assert.EqualValues(t, 10, global["maxTokens"])
-	tools, ok := rateLimiting["tools"].([]any)
-	require.True(t, ok)
-	require.Len(t, tools, 1)
-	tool, ok := tools[0].(map[string]any)
-	require.True(t, ok)
-	assert.Equal(t, "backend_a_echo", tool["name"])
-}
-
-func TestConverter_RateLimitingPreservesSharedAliasAndCopiesInput(t *testing.T) {
-	t.Parallel()
-
-	vmcp := &mcpv1beta1.VirtualMCPServer{
-		ObjectMeta: metav1.ObjectMeta{Name: "test-vmcp", Namespace: "default"},
-		Spec: mcpv1beta1.VirtualMCPServerSpec{
-			GroupRef:     &mcpv1beta1.MCPGroupRef{Name: "test-group"},
-			IncomingAuth: &mcpv1beta1.IncomingAuthConfig{Type: "anonymous"},
-			RateLimiting: &mcpv1beta1.RateLimitConfig{
-				Shared: &mcpv1beta1.RateLimitBucket{
-					MaxTokens:    10,
-					RefillPeriod: metav1.Duration{Duration: time.Minute},
-				},
-				Tools: []mcpv1beta1.ToolRateLimitConfig{
-					{
-						Name: "backend_a_echo",
-						Shared: &mcpv1beta1.RateLimitBucket{
-							MaxTokens:    1,
-							RefillPeriod: metav1.Duration{Duration: time.Second},
-						},
-					},
-				},
-			},
-		},
-	}
-
-	converter := newTestConverter(t, newNoOpMockResolver(t))
-	cfg, _, err := converter.Convert(context.Background(), vmcp, nil)
-	require.NoError(t, err)
-	require.NotNil(t, cfg.RateLimiting)
-
-	vmcp.Spec.RateLimiting.Shared.MaxTokens = 99
-	vmcp.Spec.RateLimiting.Tools[0].Shared.MaxTokens = 88
-
-	rateLimiting := cfg.RateLimiting.Get()
-	shared, ok := rateLimiting["shared"].(map[string]any)
-	require.True(t, ok)
-	assert.EqualValues(t, 10, shared["maxTokens"])
-	tools, ok := rateLimiting["tools"].([]any)
-	require.True(t, ok)
-	require.Len(t, tools, 1)
-	tool, ok := tools[0].(map[string]any)
-	require.True(t, ok)
-	toolShared, ok := tool["shared"].(map[string]any)
-	require.True(t, ok)
-	assert.EqualValues(t, 1, toolShared["maxTokens"])
-}
-
 func TestConverter_OIDCResolution(t *testing.T) {
 	t.Parallel()
 
diff --git a/cmd/thv-operator/test-integration/virtualmcp/virtualmcpserver_sessionstorage_cel_test.go b/cmd/thv-operator/test-integration/virtualmcp/virtualmcpserver_sessionstorage_cel_test.go
index dc00607240..6a74efd2aa 100644
--- a/cmd/thv-operator/test-integration/virtualmcp/virtualmcpserver_sessionstorage_cel_test.go
+++ b/cmd/thv-operator/test-integration/virtualmcp/virtualmcpserver_sessionstorage_cel_test.go
@@ -113,7 +113,7 @@ var _ = Describe("CEL Validation for SessionStorageConfig on VirtualMCPServer",
 			It("should reject rate limiting without redis session storage", func() {
 				vmcp := newVirtualMCPServerWithSessionStorage("vmcp-rl-no-redis", nil)
 				vmcp.Spec.RateLimiting = &mcpv1beta1.RateLimitConfig{
-					Global: &mcpv1beta1.RateLimitBucket{
+					Shared: &mcpv1beta1.RateLimitBucket{
 						MaxTokens:    1,
 						RefillPeriod: metav1.Duration{Duration: time.Minute},
 					},
diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpservers.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpservers.yaml
index 33aa0febec..44eb6f7a02 100644
--- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpservers.yaml
+++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_mcpservers.yaml
@@ -320,28 +320,6 @@ spec:
                   RateLimiting defines rate limiting configuration for the MCP server.
                   Requires Redis session storage to be configured for distributed rate limiting.
                 properties:
-                  global:
-                    description: Global is a token bucket shared across all users
-                      for the entire server.
-                    properties:
-                      maxTokens:
-                        description: |-
-                          MaxTokens is the maximum number of tokens (bucket capacity).
-                          This is also the burst size: the maximum number of requests that can be served
-                          instantaneously before the bucket is depleted.
-                        format: int32
-                        minimum: 1
-                        type: integer
-                      refillPeriod:
-                        description: |-
-                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
-                          The effective refill rate is maxTokens / refillPeriod tokens per second.
-                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
-                        type: string
-                    required:
-                    - maxTokens
-                    - refillPeriod
-                    type: object
                   perUser:
                     description: |-
                       PerUser is a token bucket applied independently to each authenticated user
@@ -368,8 +346,8 @@ spec:
                     - refillPeriod
                     type: object
                   shared:
-                    description: Shared is a deprecated alias for Global. Use global
-                      instead.
+                    description: Shared is a token bucket shared across all users
+                      for the entire server.
                     properties:
                       maxTokens:
                         description: |-
@@ -397,29 +375,8 @@ spec:
                     items:
                       description: |-
                         ToolRateLimitConfig defines rate limits for a specific tool.
-                        At least one of global, shared, or perUser must be configured.
+                        At least one of shared or perUser must be configured.
                       properties:
-                        global:
-                          description: Global token bucket for this specific tool.
-                          properties:
-                            maxTokens:
-                              description: |-
-                                MaxTokens is the maximum number of tokens (bucket capacity).
-                                This is also the burst size: the maximum number of requests that can be served
-                                instantaneously before the bucket is depleted.
-                              format: int32
-                              minimum: 1
-                              type: integer
-                            refillPeriod:
-                              description: |-
-                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
-                                The effective refill rate is maxTokens / refillPeriod tokens per second.
-                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
-                              type: string
-                          required:
-                          - maxTokens
-                          - refillPeriod
-                          type: object
                         name:
                           description: Name is the MCP tool name this limit applies
                             to.
@@ -448,8 +405,7 @@ spec:
                           - refillPeriod
                           type: object
                         shared:
-                          description: Shared is a deprecated alias for Global. Use
-                            global instead.
+                          description: Shared token bucket for this specific tool.
                           properties:
                             maxTokens:
                               description: |-
@@ -473,19 +429,17 @@ spec:
                       - name
                       type: object
                       x-kubernetes-validations:
-                      - message: at least one of global, shared, or perUser must be
-                          configured
-                        rule: has(self.global) || has(self.shared) || has(self.perUser)
+                      - message: at least one of shared or perUser must be configured
+                        rule: has(self.shared) || has(self.perUser)
                     type: array
                     x-kubernetes-list-map-keys:
                     - name
                     x-kubernetes-list-type: map
                 type: object
                 x-kubernetes-validations:
-                - message: at least one of global, shared, perUser, or tools must
-                    be configured
-                  rule: has(self.global) || has(self.shared) || has(self.perUser)
-                    || (has(self.tools) && size(self.tools) > 0)
+                - message: at least one of shared, perUser, or tools must be configured
+                  rule: has(self.shared) || has(self.perUser) || (has(self.tools)
+                    && size(self.tools) > 0)
               replicas:
                 description: |-
                   Replicas is the desired number of proxy runner (thv run) pod replicas.
@@ -1215,28 +1169,6 @@ spec:
                   RateLimiting defines rate limiting configuration for the MCP server.
                   Requires Redis session storage to be configured for distributed rate limiting.
                 properties:
-                  global:
-                    description: Global is a token bucket shared across all users
-                      for the entire server.
-                    properties:
-                      maxTokens:
-                        description: |-
-                          MaxTokens is the maximum number of tokens (bucket capacity).
-                          This is also the burst size: the maximum number of requests that can be served
-                          instantaneously before the bucket is depleted.
-                        format: int32
-                        minimum: 1
-                        type: integer
-                      refillPeriod:
-                        description: |-
-                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
-                          The effective refill rate is maxTokens / refillPeriod tokens per second.
-                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
-                        type: string
-                    required:
-                    - maxTokens
-                    - refillPeriod
-                    type: object
                   perUser:
                     description: |-
                       PerUser is a token bucket applied independently to each authenticated user
@@ -1263,8 +1195,8 @@ spec:
                     - refillPeriod
                     type: object
                   shared:
-                    description: Shared is a deprecated alias for Global. Use global
-                      instead.
+                    description: Shared is a token bucket shared across all users
+                      for the entire server.
                     properties:
                       maxTokens:
                         description: |-
@@ -1292,29 +1224,8 @@ spec:
                     items:
                       description: |-
                         ToolRateLimitConfig defines rate limits for a specific tool.
-                        At least one of global, shared, or perUser must be configured.
+                        At least one of shared or perUser must be configured.
                       properties:
-                        global:
-                          description: Global token bucket for this specific tool.
-                          properties:
-                            maxTokens:
-                              description: |-
-                                MaxTokens is the maximum number of tokens (bucket capacity).
-                                This is also the burst size: the maximum number of requests that can be served
-                                instantaneously before the bucket is depleted.
-                              format: int32
-                              minimum: 1
-                              type: integer
-                            refillPeriod:
-                              description: |-
-                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
-                                The effective refill rate is maxTokens / refillPeriod tokens per second.
-                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
-                              type: string
-                          required:
-                          - maxTokens
-                          - refillPeriod
-                          type: object
                         name:
                           description: Name is the MCP tool name this limit applies
                             to.
@@ -1343,8 +1254,7 @@ spec:
                           - refillPeriod
                           type: object
                         shared:
-                          description: Shared is a deprecated alias for Global. Use
-                            global instead.
+                          description: Shared token bucket for this specific tool.
                           properties:
                             maxTokens:
                               description: |-
@@ -1368,19 +1278,17 @@ spec:
                       - name
                       type: object
                       x-kubernetes-validations:
-                      - message: at least one of global, shared, or perUser must be
-                          configured
-                        rule: has(self.global) || has(self.shared) || has(self.perUser)
+                      - message: at least one of shared or perUser must be configured
+                        rule: has(self.shared) || has(self.perUser)
                     type: array
                     x-kubernetes-list-map-keys:
                     - name
                     x-kubernetes-list-type: map
                 type: object
                 x-kubernetes-validations:
-                - message: at least one of global, shared, perUser, or tools must
-                    be configured
-                  rule: has(self.global) || has(self.shared) || has(self.perUser)
-                    || (has(self.tools) && size(self.tools) > 0)
+                - message: at least one of shared, perUser, or tools must be configured
+                  rule: has(self.shared) || has(self.perUser) || (has(self.tools)
+                    && size(self.tools) > 0)
               replicas:
                 description: |-
                   Replicas is the desired number of proxy runner (thv run) pod replicas.
diff --git a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml
index bbc8df3509..913eb6ca9e 100644
--- a/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml
+++ b/deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml
@@ -2247,28 +2247,6 @@ spec:
                   RateLimiting defines rate limiting configuration for the Virtual MCP server.
                   Requires Redis session storage to be configured for distributed rate limiting.
                 properties:
-                  global:
-                    description: Global is a token bucket shared across all users
-                      for the entire server.
-                    properties:
-                      maxTokens:
-                        description: |-
-                          MaxTokens is the maximum number of tokens (bucket capacity).
-                          This is also the burst size: the maximum number of requests that can be served
-                          instantaneously before the bucket is depleted.
-                        format: int32
-                        minimum: 1
-                        type: integer
-                      refillPeriod:
-                        description: |-
-                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
-                          The effective refill rate is maxTokens / refillPeriod tokens per second.
-                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
-                        type: string
-                    required:
-                    - maxTokens
-                    - refillPeriod
-                    type: object
                   perUser:
                     description: |-
                       PerUser is a token bucket applied independently to each authenticated user
@@ -2295,8 +2273,8 @@ spec:
                     - refillPeriod
                     type: object
                   shared:
-                    description: Shared is a deprecated alias for Global. Use global
-                      instead.
+                    description: Shared is a token bucket shared across all users
+                      for the entire server.
                     properties:
                       maxTokens:
                         description: |-
@@ -2324,29 +2302,8 @@ spec:
                     items:
                       description: |-
                         ToolRateLimitConfig defines rate limits for a specific tool.
-                        At least one of global, shared, or perUser must be configured.
+                        At least one of shared or perUser must be configured.
                       properties:
-                        global:
-                          description: Global token bucket for this specific tool.
-                          properties:
-                            maxTokens:
-                              description: |-
-                                MaxTokens is the maximum number of tokens (bucket capacity).
-                                This is also the burst size: the maximum number of requests that can be served
-                                instantaneously before the bucket is depleted.
-                              format: int32
-                              minimum: 1
-                              type: integer
-                            refillPeriod:
-                              description: |-
-                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
-                                The effective refill rate is maxTokens / refillPeriod tokens per second.
-                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
-                              type: string
-                          required:
-                          - maxTokens
-                          - refillPeriod
-                          type: object
                         name:
                           description: Name is the MCP tool name this limit applies
                             to.
@@ -2375,8 +2332,7 @@ spec:
                           - refillPeriod
                           type: object
                         shared:
-                          description: Shared is a deprecated alias for Global. Use
-                            global instead.
+                          description: Shared token bucket for this specific tool.
                           properties:
                             maxTokens:
                               description: |-
@@ -2400,19 +2356,17 @@ spec:
                       - name
                       type: object
                       x-kubernetes-validations:
-                      - message: at least one of global, shared, or perUser must be
-                          configured
-                        rule: has(self.global) || has(self.shared) || has(self.perUser)
+                      - message: at least one of shared or perUser must be configured
+                        rule: has(self.shared) || has(self.perUser)
                     type: array
                     x-kubernetes-list-map-keys:
                     - name
                     x-kubernetes-list-type: map
                 type: object
                 x-kubernetes-validations:
-                - message: at least one of global, shared, perUser, or tools must
-                    be configured
-                  rule: has(self.global) || has(self.shared) || has(self.perUser)
-                    || (has(self.tools) && size(self.tools) > 0)
+                - message: at least one of shared, perUser, or tools must be configured
+                  rule: has(self.shared) || has(self.perUser) || (has(self.tools)
+                    && size(self.tools) > 0)
               replicas:
                 description: |-
                   Replicas is the desired number of vMCP pod replicas.
@@ -4925,28 +4879,6 @@ spec:
                   RateLimiting defines rate limiting configuration for the Virtual MCP server.
                   Requires Redis session storage to be configured for distributed rate limiting.
                 properties:
-                  global:
-                    description: Global is a token bucket shared across all users
-                      for the entire server.
-                    properties:
-                      maxTokens:
-                        description: |-
-                          MaxTokens is the maximum number of tokens (bucket capacity).
-                          This is also the burst size: the maximum number of requests that can be served
-                          instantaneously before the bucket is depleted.
-                        format: int32
-                        minimum: 1
-                        type: integer
-                      refillPeriod:
-                        description: |-
-                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
-                          The effective refill rate is maxTokens / refillPeriod tokens per second.
-                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
-                        type: string
-                    required:
-                    - maxTokens
-                    - refillPeriod
-                    type: object
                   perUser:
                     description: |-
                       PerUser is a token bucket applied independently to each authenticated user
@@ -4973,8 +4905,8 @@ spec:
                     - refillPeriod
                     type: object
                   shared:
-                    description: Shared is a deprecated alias for Global. Use global
-                      instead.
+                    description: Shared is a token bucket shared across all users
+                      for the entire server.
                     properties:
                       maxTokens:
                         description: |-
@@ -5002,29 +4934,8 @@ spec:
                     items:
                       description: |-
                         ToolRateLimitConfig defines rate limits for a specific tool.
-                        At least one of global, shared, or perUser must be configured.
+                        At least one of shared or perUser must be configured.
                       properties:
-                        global:
-                          description: Global token bucket for this specific tool.
-                          properties:
-                            maxTokens:
-                              description: |-
-                                MaxTokens is the maximum number of tokens (bucket capacity).
-                                This is also the burst size: the maximum number of requests that can be served
-                                instantaneously before the bucket is depleted.
-                              format: int32
-                              minimum: 1
-                              type: integer
-                            refillPeriod:
-                              description: |-
-                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
-                                The effective refill rate is maxTokens / refillPeriod tokens per second.
-                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
-                              type: string
-                          required:
-                          - maxTokens
-                          - refillPeriod
-                          type: object
                         name:
                           description: Name is the MCP tool name this limit applies
                             to.
@@ -5053,8 +4964,7 @@ spec:
                           - refillPeriod
                           type: object
                         shared:
-                          description: Shared is a deprecated alias for Global. Use
-                            global instead.
+                          description: Shared token bucket for this specific tool.
                           properties:
                             maxTokens:
                               description: |-
@@ -5078,19 +4988,17 @@ spec:
                       - name
                       type: object
                       x-kubernetes-validations:
-                      - message: at least one of global, shared, or perUser must be
-                          configured
-                        rule: has(self.global) || has(self.shared) || has(self.perUser)
+                      - message: at least one of shared or perUser must be configured
+                        rule: has(self.shared) || has(self.perUser)
                     type: array
                     x-kubernetes-list-map-keys:
                     - name
                     x-kubernetes-list-type: map
                 type: object
                 x-kubernetes-validations:
-                - message: at least one of global, shared, perUser, or tools must
-                    be configured
-                  rule: has(self.global) || has(self.shared) || has(self.perUser)
-                    || (has(self.tools) && size(self.tools) > 0)
+                - message: at least one of shared, perUser, or tools must be configured
+                  rule: has(self.shared) || has(self.perUser) || (has(self.tools)
+                    && size(self.tools) > 0)
               replicas:
                 description: |-
                   Replicas is the desired number of vMCP pod replicas.
diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpservers.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpservers.yaml
index 8f07967a66..a248508d41 100644
--- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpservers.yaml
+++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_mcpservers.yaml
@@ -323,28 +323,6 @@ spec:
                   RateLimiting defines rate limiting configuration for the MCP server.
                   Requires Redis session storage to be configured for distributed rate limiting.
                 properties:
-                  global:
-                    description: Global is a token bucket shared across all users
-                      for the entire server.
-                    properties:
-                      maxTokens:
-                        description: |-
-                          MaxTokens is the maximum number of tokens (bucket capacity).
-                          This is also the burst size: the maximum number of requests that can be served
-                          instantaneously before the bucket is depleted.
-                        format: int32
-                        minimum: 1
-                        type: integer
-                      refillPeriod:
-                        description: |-
-                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
-                          The effective refill rate is maxTokens / refillPeriod tokens per second.
-                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
-                        type: string
-                    required:
-                    - maxTokens
-                    - refillPeriod
-                    type: object
                   perUser:
                     description: |-
                       PerUser is a token bucket applied independently to each authenticated user
@@ -371,8 +349,8 @@ spec:
                     - refillPeriod
                     type: object
                   shared:
-                    description: Shared is a deprecated alias for Global. Use global
-                      instead.
+                    description: Shared is a token bucket shared across all users
+                      for the entire server.
                     properties:
                       maxTokens:
                         description: |-
@@ -400,29 +378,8 @@ spec:
                     items:
                       description: |-
                         ToolRateLimitConfig defines rate limits for a specific tool.
-                        At least one of global, shared, or perUser must be configured.
+                        At least one of shared or perUser must be configured.
                       properties:
-                        global:
-                          description: Global token bucket for this specific tool.
-                          properties:
-                            maxTokens:
-                              description: |-
-                                MaxTokens is the maximum number of tokens (bucket capacity).
-                                This is also the burst size: the maximum number of requests that can be served
-                                instantaneously before the bucket is depleted.
-                              format: int32
-                              minimum: 1
-                              type: integer
-                            refillPeriod:
-                              description: |-
-                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
-                                The effective refill rate is maxTokens / refillPeriod tokens per second.
-                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
-                              type: string
-                          required:
-                          - maxTokens
-                          - refillPeriod
-                          type: object
                         name:
                           description: Name is the MCP tool name this limit applies
                             to.
@@ -451,8 +408,7 @@ spec:
                           - refillPeriod
                           type: object
                         shared:
-                          description: Shared is a deprecated alias for Global. Use
-                            global instead.
+                          description: Shared token bucket for this specific tool.
                           properties:
                             maxTokens:
                               description: |-
@@ -476,19 +432,17 @@ spec:
                       - name
                       type: object
                       x-kubernetes-validations:
-                      - message: at least one of global, shared, or perUser must be
-                          configured
-                        rule: has(self.global) || has(self.shared) || has(self.perUser)
+                      - message: at least one of shared or perUser must be configured
+                        rule: has(self.shared) || has(self.perUser)
                     type: array
                     x-kubernetes-list-map-keys:
                     - name
                     x-kubernetes-list-type: map
                 type: object
                 x-kubernetes-validations:
-                - message: at least one of global, shared, perUser, or tools must
-                    be configured
-                  rule: has(self.global) || has(self.shared) || has(self.perUser)
-                    || (has(self.tools) && size(self.tools) > 0)
+                - message: at least one of shared, perUser, or tools must be configured
+                  rule: has(self.shared) || has(self.perUser) || (has(self.tools)
+                    && size(self.tools) > 0)
               replicas:
                 description: |-
                   Replicas is the desired number of proxy runner (thv run) pod replicas.
@@ -1218,28 +1172,6 @@ spec:
                   RateLimiting defines rate limiting configuration for the MCP server.
                   Requires Redis session storage to be configured for distributed rate limiting.
                 properties:
-                  global:
-                    description: Global is a token bucket shared across all users
-                      for the entire server.
-                    properties:
-                      maxTokens:
-                        description: |-
-                          MaxTokens is the maximum number of tokens (bucket capacity).
-                          This is also the burst size: the maximum number of requests that can be served
-                          instantaneously before the bucket is depleted.
-                        format: int32
-                        minimum: 1
-                        type: integer
-                      refillPeriod:
-                        description: |-
-                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
-                          The effective refill rate is maxTokens / refillPeriod tokens per second.
-                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
-                        type: string
-                    required:
-                    - maxTokens
-                    - refillPeriod
-                    type: object
                   perUser:
                     description: |-
                       PerUser is a token bucket applied independently to each authenticated user
@@ -1266,8 +1198,8 @@ spec:
                     - refillPeriod
                     type: object
                   shared:
-                    description: Shared is a deprecated alias for Global. Use global
-                      instead.
+                    description: Shared is a token bucket shared across all users
+                      for the entire server.
                     properties:
                       maxTokens:
                         description: |-
@@ -1295,29 +1227,8 @@ spec:
                     items:
                       description: |-
                         ToolRateLimitConfig defines rate limits for a specific tool.
-                        At least one of global, shared, or perUser must be configured.
+                        At least one of shared or perUser must be configured.
                       properties:
-                        global:
-                          description: Global token bucket for this specific tool.
-                          properties:
-                            maxTokens:
-                              description: |-
-                                MaxTokens is the maximum number of tokens (bucket capacity).
-                                This is also the burst size: the maximum number of requests that can be served
-                                instantaneously before the bucket is depleted.
-                              format: int32
-                              minimum: 1
-                              type: integer
-                            refillPeriod:
-                              description: |-
-                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
-                                The effective refill rate is maxTokens / refillPeriod tokens per second.
-                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
-                              type: string
-                          required:
-                          - maxTokens
-                          - refillPeriod
-                          type: object
                         name:
                           description: Name is the MCP tool name this limit applies
                             to.
@@ -1346,8 +1257,7 @@ spec:
                           - refillPeriod
                           type: object
                         shared:
-                          description: Shared is a deprecated alias for Global. Use
-                            global instead.
+                          description: Shared token bucket for this specific tool.
                           properties:
                             maxTokens:
                               description: |-
@@ -1371,19 +1281,17 @@ spec:
                       - name
                       type: object
                       x-kubernetes-validations:
-                      - message: at least one of global, shared, or perUser must be
-                          configured
-                        rule: has(self.global) || has(self.shared) || has(self.perUser)
+                      - message: at least one of shared or perUser must be configured
+                        rule: has(self.shared) || has(self.perUser)
                     type: array
                     x-kubernetes-list-map-keys:
                     - name
                     x-kubernetes-list-type: map
                 type: object
                 x-kubernetes-validations:
-                - message: at least one of global, shared, perUser, or tools must
-                    be configured
-                  rule: has(self.global) || has(self.shared) || has(self.perUser)
-                    || (has(self.tools) && size(self.tools) > 0)
+                - message: at least one of shared, perUser, or tools must be configured
+                  rule: has(self.shared) || has(self.perUser) || (has(self.tools)
+                    && size(self.tools) > 0)
               replicas:
                 description: |-
                   Replicas is the desired number of proxy runner (thv run) pod replicas.
diff --git a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml
index cc804a054c..1429da0fbe 100644
--- a/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml
+++ b/deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml
@@ -2250,28 +2250,6 @@ spec:
                   RateLimiting defines rate limiting configuration for the Virtual MCP server.
                   Requires Redis session storage to be configured for distributed rate limiting.
                 properties:
-                  global:
-                    description: Global is a token bucket shared across all users
-                      for the entire server.
-                    properties:
-                      maxTokens:
-                        description: |-
-                          MaxTokens is the maximum number of tokens (bucket capacity).
-                          This is also the burst size: the maximum number of requests that can be served
-                          instantaneously before the bucket is depleted.
-                        format: int32
-                        minimum: 1
-                        type: integer
-                      refillPeriod:
-                        description: |-
-                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
-                          The effective refill rate is maxTokens / refillPeriod tokens per second.
-                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
-                        type: string
-                    required:
-                    - maxTokens
-                    - refillPeriod
-                    type: object
                   perUser:
                     description: |-
                       PerUser is a token bucket applied independently to each authenticated user
@@ -2298,8 +2276,8 @@ spec:
                     - refillPeriod
                     type: object
                   shared:
-                    description: Shared is a deprecated alias for Global. Use global
-                      instead.
+                    description: Shared is a token bucket shared across all users
+                      for the entire server.
                     properties:
                       maxTokens:
                         description: |-
@@ -2327,29 +2305,8 @@ spec:
                     items:
                       description: |-
                         ToolRateLimitConfig defines rate limits for a specific tool.
-                        At least one of global, shared, or perUser must be configured.
+                        At least one of shared or perUser must be configured.
                       properties:
-                        global:
-                          description: Global token bucket for this specific tool.
-                          properties:
-                            maxTokens:
-                              description: |-
-                                MaxTokens is the maximum number of tokens (bucket capacity).
-                                This is also the burst size: the maximum number of requests that can be served
-                                instantaneously before the bucket is depleted.
-                              format: int32
-                              minimum: 1
-                              type: integer
-                            refillPeriod:
-                              description: |-
-                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
-                                The effective refill rate is maxTokens / refillPeriod tokens per second.
-                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
-                              type: string
-                          required:
-                          - maxTokens
-                          - refillPeriod
-                          type: object
                         name:
                           description: Name is the MCP tool name this limit applies
                             to.
@@ -2378,8 +2335,7 @@ spec:
                           - refillPeriod
                           type: object
                         shared:
-                          description: Shared is a deprecated alias for Global. Use
-                            global instead.
+                          description: Shared token bucket for this specific tool.
                           properties:
                             maxTokens:
                               description: |-
@@ -2403,19 +2359,17 @@ spec:
                       - name
                       type: object
                       x-kubernetes-validations:
-                      - message: at least one of global, shared, or perUser must be
-                          configured
-                        rule: has(self.global) || has(self.shared) || has(self.perUser)
+                      - message: at least one of shared or perUser must be configured
+                        rule: has(self.shared) || has(self.perUser)
                     type: array
                     x-kubernetes-list-map-keys:
                     - name
                     x-kubernetes-list-type: map
                 type: object
                 x-kubernetes-validations:
-                - message: at least one of global, shared, perUser, or tools must
-                    be configured
-                  rule: has(self.global) || has(self.shared) || has(self.perUser)
-                    || (has(self.tools) && size(self.tools) > 0)
+                - message: at least one of shared, perUser, or tools must be configured
+                  rule: has(self.shared) || has(self.perUser) || (has(self.tools)
+                    && size(self.tools) > 0)
               replicas:
                 description: |-
                   Replicas is the desired number of vMCP pod replicas.
@@ -4928,28 +4882,6 @@ spec:
                   RateLimiting defines rate limiting configuration for the Virtual MCP server.
                   Requires Redis session storage to be configured for distributed rate limiting.
                 properties:
-                  global:
-                    description: Global is a token bucket shared across all users
-                      for the entire server.
-                    properties:
-                      maxTokens:
-                        description: |-
-                          MaxTokens is the maximum number of tokens (bucket capacity).
-                          This is also the burst size: the maximum number of requests that can be served
-                          instantaneously before the bucket is depleted.
-                        format: int32
-                        minimum: 1
-                        type: integer
-                      refillPeriod:
-                        description: |-
-                          RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
-                          The effective refill rate is maxTokens / refillPeriod tokens per second.
-                          Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
-                        type: string
-                    required:
-                    - maxTokens
-                    - refillPeriod
-                    type: object
                   perUser:
                     description: |-
                       PerUser is a token bucket applied independently to each authenticated user
@@ -4976,8 +4908,8 @@ spec:
                     - refillPeriod
                     type: object
                   shared:
-                    description: Shared is a deprecated alias for Global. Use global
-                      instead.
+                    description: Shared is a token bucket shared across all users
+                      for the entire server.
                     properties:
                       maxTokens:
                         description: |-
@@ -5005,29 +4937,8 @@ spec:
                     items:
                       description: |-
                         ToolRateLimitConfig defines rate limits for a specific tool.
-                        At least one of global, shared, or perUser must be configured.
+                        At least one of shared or perUser must be configured.
                       properties:
-                        global:
-                          description: Global token bucket for this specific tool.
-                          properties:
-                            maxTokens:
-                              description: |-
-                                MaxTokens is the maximum number of tokens (bucket capacity).
-                                This is also the burst size: the maximum number of requests that can be served
-                                instantaneously before the bucket is depleted.
-                              format: int32
-                              minimum: 1
-                              type: integer
-                            refillPeriod:
-                              description: |-
-                                RefillPeriod is the duration to fully refill the bucket from zero to maxTokens.
-                                The effective refill rate is maxTokens / refillPeriod tokens per second.
-                                Format: Go duration string (e.g., "1m0s", "30s", "1h0m0s").
-                              type: string
-                          required:
-                          - maxTokens
-                          - refillPeriod
-                          type: object
                         name:
                           description: Name is the MCP tool name this limit applies
                             to.
@@ -5056,8 +4967,7 @@ spec:
                           - refillPeriod
                           type: object
                         shared:
-                          description: Shared is a deprecated alias for Global. Use
-                            global instead.
+                          description: Shared token bucket for this specific tool.
                           properties:
                             maxTokens:
                               description: |-
@@ -5081,19 +4991,17 @@ spec:
                       - name
                       type: object
                       x-kubernetes-validations:
-                      - message: at least one of global, shared, or perUser must be
-                          configured
-                        rule: has(self.global) || has(self.shared) || has(self.perUser)
+                      - message: at least one of shared or perUser must be configured
+                        rule: has(self.shared) || has(self.perUser)
                     type: array
                     x-kubernetes-list-map-keys:
                     - name
                     x-kubernetes-list-type: map
                 type: object
                 x-kubernetes-validations:
-                - message: at least one of global, shared, perUser, or tools must
-                    be configured
-                  rule: has(self.global) || has(self.shared) || has(self.perUser)
-                    || (has(self.tools) && size(self.tools) > 0)
+                - message: at least one of shared, perUser, or tools must be configured
+                  rule: has(self.shared) || has(self.perUser) || (has(self.tools)
+                    && size(self.tools) > 0)
               replicas:
                 description: |-
                   Replicas is the desired number of vMCP pod replicas.
diff --git a/docs/operator/crd-api.md b/docs/operator/crd-api.md
index 2b66ac030d..9edb64f5ba 100644
--- a/docs/operator/crd-api.md
+++ b/docs/operator/crd-api.md
@@ -2687,7 +2687,7 @@ _Appears in:_
 
 
 RateLimitBucket defines a token bucket configuration with a maximum capacity
-and a refill period. Used by both shared (global) and per-user rate limits.
+and a refill period. Used by both shared and per-user rate limits.
 
 
 
@@ -2716,8 +2716,7 @@ _Appears in:_
 
 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
-| `global` _[api.v1beta1.RateLimitBucket](#apiv1beta1ratelimitbucket)_ | Global is a token bucket shared across all users for the entire server. |  | Optional: \{\} <br /> |
-| `shared` _[api.v1beta1.RateLimitBucket](#apiv1beta1ratelimitbucket)_ | Shared is a deprecated alias for Global. Use global instead. |  | Optional: \{\} <br /> |
+| `shared` _[api.v1beta1.RateLimitBucket](#apiv1beta1ratelimitbucket)_ | Shared is a token bucket shared across all users for the entire server. |  | Optional: \{\} <br /> |
 | `perUser` _[api.v1beta1.RateLimitBucket](#apiv1beta1ratelimitbucket)_ | PerUser is a token bucket applied independently to each authenticated user<br />at the server level. Requires authentication to be enabled.<br />Each unique userID creates Redis keys that expire after 2x refillPeriod.<br />Memory formula: unique_users_per_TTL_window * (1 + num_tools_with_per_user_limits) keys. |  | Optional: \{\} <br /> |
 | `tools` _[api.v1beta1.ToolRateLimitConfig](#apiv1beta1toolratelimitconfig) array_ | Tools defines per-tool rate limit overrides.<br />Each entry applies additional rate limits to calls targeting a specific tool name.<br />A request must pass both the server-level limit and the per-tool limit. |  | Optional: \{\} <br /> |
 
@@ -3132,7 +3131,7 @@ _Appears in:_
 
 
 ToolRateLimitConfig defines rate limits for a specific tool.
-At least one of global, shared, or perUser must be configured.
+At least one of shared or perUser must be configured.
 
 
 
@@ -3142,8 +3141,7 @@ _Appears in:_
 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
 | `name` _string_ | Name is the MCP tool name this limit applies to. |  | MinLength: 1 <br />Required: \{\} <br /> |
-| `global` _[api.v1beta1.RateLimitBucket](#apiv1beta1ratelimitbucket)_ | Global token bucket for this specific tool. |  | Optional: \{\} <br /> |
-| `shared` _[api.v1beta1.RateLimitBucket](#apiv1beta1ratelimitbucket)_ | Shared is a deprecated alias for Global. Use global instead. |  | Optional: \{\} <br /> |
+| `shared` _[api.v1beta1.RateLimitBucket](#apiv1beta1ratelimitbucket)_ | Shared token bucket for this specific tool. |  | Optional: \{\} <br /> |
 | `perUser` _[api.v1beta1.RateLimitBucket](#apiv1beta1ratelimitbucket)_ | PerUser token bucket configuration for this tool. |  | Optional: \{\} <br /> |
 
 
diff --git a/docs/server/docs.go b/docs/server/docs.go
index bad149a9d7..a108f6087f 100644
--- a/docs/server/docs.go
+++ b/docs/server/docs.go
@@ -60,9 +60,6 @@ const docTemplate = `{
             "github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.RateLimitConfig": {
                 "description": "RateLimitConfig contains the CRD rate limiting configuration.\nWhen set, rate limiting middleware is added to the proxy middleware chain.",
                 "properties": {
-                    "global": {
-                        "$ref": "#/components/schemas/github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.RateLimitBucket"
-                    },
                     "perUser": {
                         "$ref": "#/components/schemas/github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.RateLimitBucket"
                     },
@@ -82,9 +79,6 @@ const docTemplate = `{
             },
             "github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.ToolRateLimitConfig": {
                 "properties": {
-                    "global": {
-                        "$ref": "#/components/schemas/github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.RateLimitBucket"
-                    },
                     "name": {
                         "description": "Name is the MCP tool name this limit applies to.\n+kubebuilder:validation:Required\n+kubebuilder:validation:MinLength=1",
                         "type": "string"
diff --git a/docs/server/swagger.json b/docs/server/swagger.json
index 6269efbf3d..8a5d8df9b5 100644
--- a/docs/server/swagger.json
+++ b/docs/server/swagger.json
@@ -53,9 +53,6 @@
             "github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.RateLimitConfig": {
                 "description": "RateLimitConfig contains the CRD rate limiting configuration.\nWhen set, rate limiting middleware is added to the proxy middleware chain.",
                 "properties": {
-                    "global": {
-                        "$ref": "#/components/schemas/github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.RateLimitBucket"
-                    },
                     "perUser": {
                         "$ref": "#/components/schemas/github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.RateLimitBucket"
                     },
@@ -75,9 +72,6 @@
             },
             "github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.ToolRateLimitConfig": {
                 "properties": {
-                    "global": {
-                        "$ref": "#/components/schemas/github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.RateLimitBucket"
-                    },
                     "name": {
                         "description": "Name is the MCP tool name this limit applies to.\n+kubebuilder:validation:Required\n+kubebuilder:validation:MinLength=1",
                         "type": "string"
diff --git a/docs/server/swagger.yaml b/docs/server/swagger.yaml
index 2799158f5c..bdc1cac8a0 100644
--- a/docs/server/swagger.yaml
+++ b/docs/server/swagger.yaml
@@ -52,8 +52,6 @@ components:
         RateLimitConfig contains the CRD rate limiting configuration.
         When set, rate limiting middleware is added to the proxy middleware chain.
       properties:
-        global:
-          $ref: '#/components/schemas/github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.RateLimitBucket'
         perUser:
           $ref: '#/components/schemas/github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.RateLimitBucket'
         shared:
@@ -73,8 +71,6 @@ components:
       type: object
     github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.ToolRateLimitConfig:
       properties:
-        global:
-          $ref: '#/components/schemas/github_com_stacklok_toolhive_cmd_thv-operator_api_v1beta1.RateLimitBucket'
         name:
           description: |-
             Name is the MCP tool name this limit applies to.
diff --git a/pkg/ratelimit/internal/bucket/bucket.go b/pkg/ratelimit/internal/bucket/bucket.go
index 28903bcdcd..d68a1709c8 100644
--- a/pkg/ratelimit/internal/bucket/bucket.go
+++ b/pkg/ratelimit/internal/bucket/bucket.go
@@ -90,7 +90,7 @@ type TokenBucket struct {
 }
 
 // New creates a TokenBucket. The Redis key is derived from namespace, server
-// name, and suffix (e.g., "global" or "global:tool:search").
+// name, and suffix (e.g., "shared" or "shared:tool:search").
 func New(namespace, serverName, suffix string, maxTokens int32, refillPeriod time.Duration) *TokenBucket {
 	refillSec := refillPeriod.Seconds()
 	return &TokenBucket{
diff --git a/pkg/ratelimit/limiter.go b/pkg/ratelimit/limiter.go
index 6679d6db4a..3490c1cbaa 100644
--- a/pkg/ratelimit/limiter.go
+++ b/pkg/ratelimit/limiter.go
@@ -46,10 +46,10 @@ func NewLimiter(client redis.Cmdable, namespace, name string, crd *v1beta1.RateL
 
 	l := &limiter{client: client}
 
-	if global, suffix := serverGlobalBucket(crd); global != nil {
-		b, err := newBucket(namespace, name, suffix, global)
+	if crd.Shared != nil {
+		b, err := newBucket(namespace, name, "shared", crd.Shared)
 		if err != nil {
-			return nil, fmt.Errorf("%s bucket: %w", suffix, err)
+			return nil, fmt.Errorf("shared bucket: %w", err)
 		}
 		l.serverBucket = b
 	}
@@ -63,10 +63,10 @@ func NewLimiter(client redis.Cmdable, namespace, name string, crd *v1beta1.RateL
 	}
 
 	for _, t := range crd.Tools {
-		if global, suffix := toolGlobalBucket(&t); global != nil {
-			b, err := newBucket(namespace, name, suffix+":tool:"+t.Name, global)
+		if t.Shared != nil {
+			b, err := newBucket(namespace, name, "shared:tool:"+t.Name, t.Shared)
 			if err != nil {
-				return nil, fmt.Errorf("tool %q %s bucket: %w", t.Name, suffix, err)
+				return nil, fmt.Errorf("tool %q shared bucket: %w", t.Name, err)
 			}
 			if l.toolBuckets == nil {
 				l.toolBuckets = make(map[string]*bucket.TokenBucket)
@@ -88,26 +88,6 @@ func NewLimiter(client redis.Cmdable, namespace, name string, crd *v1beta1.RateL
 	return l, nil
 }
 
-func serverGlobalBucket(crd *v1beta1.RateLimitConfig) (*v1beta1.RateLimitBucket, string) {
-	if crd.Global != nil {
-		return crd.Global, "global"
-	}
-	if crd.Shared != nil {
-		return crd.Shared, "shared"
-	}
-	return nil, ""
-}
-
-func toolGlobalBucket(tool *v1beta1.ToolRateLimitConfig) (*v1beta1.RateLimitBucket, string) {
-	if tool.Global != nil {
-		return tool.Global, "global"
-	}
-	if tool.Shared != nil {
-		return tool.Shared, "shared"
-	}
-	return nil, ""
-}
-
 // bucketSpec holds deferred bucket parameters for per-user buckets that are
 // created on the fly in Allow() because the userID is not known at construction time.
 type bucketSpec struct {
diff --git a/pkg/ratelimit/limiter_test.go b/pkg/ratelimit/limiter_test.go
index aea2b1636a..147e22cd7e 100644
--- a/pkg/ratelimit/limiter_test.go
+++ b/pkg/ratelimit/limiter_test.go
@@ -70,7 +70,7 @@ func TestNewLimiter_ZeroDuration(t *testing.T) {
 	assert.Contains(t, err.Error(), "refillPeriod must be positive")
 }
 
-func TestLimiter_ServerGlobalExhausted(t *testing.T) {
+func TestLimiter_ServerSharedExhausted(t *testing.T) {
 	t.Parallel()
 	client, _ := newTestClient(t)
 	ctx := t.Context()
@@ -93,7 +93,7 @@ func TestLimiter_ServerGlobalExhausted(t *testing.T) {
 	assert.Greater(t, d.RetryAfter, time.Duration(0))
 }
 
-func TestLimiter_SharedAliasUsesLegacyRedisKeys(t *testing.T) {
+func TestLimiter_SharedUsesRedisKeys(t *testing.T) {
 	t.Parallel()
 	client, _ := newTestClient(t)
 	ctx := t.Context()
@@ -114,45 +114,10 @@ func TestLimiter_SharedAliasUsesLegacyRedisKeys(t *testing.T) {
 	require.NoError(t, err)
 	require.True(t, d.Allowed)
 
-	legacyServerKey := "thv:rl:{ns:srv}:shared"
-	legacyToolKey := "thv:rl:{ns:srv}:shared:tool:search"
-	newServerKey := "thv:rl:{ns:srv}:global"
-	newToolKey := "thv:rl:{ns:srv}:global:tool:search"
+	serverKey := "thv:rl:{ns:srv}:shared"
+	toolKey := "thv:rl:{ns:srv}:shared:tool:search"
 
-	exists, err := client.Exists(ctx, legacyServerKey, legacyToolKey).Result()
-	require.NoError(t, err)
-	assert.Equal(t, int64(2), exists)
-
-	exists, err = client.Exists(ctx, newServerKey, newToolKey).Result()
-	require.NoError(t, err)
-	assert.Equal(t, int64(0), exists)
-}
-
-func TestLimiter_GlobalUsesGlobalRedisKeys(t *testing.T) {
-	t.Parallel()
-	client, _ := newTestClient(t)
-	ctx := t.Context()
-
-	crd := &v1beta1.RateLimitConfig{
-		Global: &v1beta1.RateLimitBucket{MaxTokens: 10, RefillPeriod: metav1.Duration{Duration: time.Minute}},
-		Tools: []v1beta1.ToolRateLimitConfig{
-			{
-				Name:   "search",
-				Global: &v1beta1.RateLimitBucket{MaxTokens: 10, RefillPeriod: metav1.Duration{Duration: time.Minute}},
-			},
-		},
-	}
-	l, err := NewLimiter(client, "ns", "srv", crd)
-	require.NoError(t, err)
-
-	d, err := l.Allow(ctx, "search", "")
-	require.NoError(t, err)
-	require.True(t, d.Allowed)
-
-	exists, err := client.Exists(ctx,
-		"thv:rl:{ns:srv}:global",
-		"thv:rl:{ns:srv}:global:tool:search",
-	).Result()
+	exists, err := client.Exists(ctx, serverKey, toolKey).Result()
 	require.NoError(t, err)
 	assert.Equal(t, int64(2), exists)
 }
diff --git a/pkg/vmcp/config/config.go b/pkg/vmcp/config/config.go
index 4d46cf9032..7a2c699290 100644
--- a/pkg/vmcp/config/config.go
+++ b/pkg/vmcp/config/config.go
@@ -173,14 +173,6 @@ type Config struct {
 	// the THV_SESSION_REDIS_PASSWORD environment variable.
 	// +optional
 	SessionStorage *SessionStorageConfig `json:"sessionStorage,omitempty" yaml:"sessionStorage,omitempty"`
-
-	// RateLimiting configures request rate limits for tools/call.
-	// Kubernetes users should set VirtualMCPServer.spec.rateLimiting; the operator
-	// copies that top-level field here for the vMCP runtime.
-	// This remains YAML-only so it can be written to the runtime config ConfigMap
-	// without exposing spec.config.rateLimiting in the Kubernetes API.
-	// +optional
-	RateLimiting *thvjson.Map `json:"-" yaml:"rateLimiting,omitempty"`
 }
 
 // IncomingAuthConfig configures client authentication to the virtual MCP server.
diff --git a/pkg/vmcp/config/config_test.go b/pkg/vmcp/config/config_test.go
index da9e88aeda..5f652ebce0 100644
--- a/pkg/vmcp/config/config_test.go
+++ b/pkg/vmcp/config/config_test.go
@@ -4,7 +4,6 @@
 package config
 
 import (
-	"encoding/json"
 	"fmt"
 	"os"
 	"path/filepath"
@@ -16,36 +15,10 @@ import (
 
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
-	"gopkg.in/yaml.v3"
 
-	thvjson "github.com/stacklok/toolhive/pkg/json"
 	authtypes "github.com/stacklok/toolhive/pkg/vmcp/auth/types"
 )
 
-func TestConfigRateLimitingIsYAMLOnly(t *testing.T) {
-	t.Parallel()
-
-	rateLimiting := thvjson.NewMap(map[string]any{
-		"global": map[string]any{
-			"maxTokens":    10,
-			"refillPeriod": "1m0s",
-		},
-	})
-	cfg := Config{
-		Name:         "test-vmcp",
-		RateLimiting: &rateLimiting,
-	}
-
-	jsonBytes, err := json.Marshal(cfg)
-	require.NoError(t, err)
-	assert.NotContains(t, string(jsonBytes), "rateLimiting")
-
-	yamlBytes, err := yaml.Marshal(cfg)
-	require.NoError(t, err)
-	assert.Contains(t, string(yamlBytes), "rateLimiting:")
-	assert.Contains(t, string(yamlBytes), "maxTokens: 10")
-}
-
 func TestOutgoingAuthConfig_ResolveForBackend(t *testing.T) {
 	t.Parallel()
 
diff --git a/pkg/vmcp/config/zz_generated.deepcopy.go b/pkg/vmcp/config/zz_generated.deepcopy.go
index 018d6cabdd..80861bff11 100644
--- a/pkg/vmcp/config/zz_generated.deepcopy.go
+++ b/pkg/vmcp/config/zz_generated.deepcopy.go
@@ -204,10 +204,6 @@ func (in *Config) DeepCopyInto(out *Config) {
 		*out = new(SessionStorageConfig)
 		**out = **in
 	}
-	if in.RateLimiting != nil {
-		in, out := &in.RateLimiting, &out.RateLimiting
-		*out = (*in).DeepCopy()
-	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Config.

From dc566e1c6b1d1859e47fea2acfcad95d1002de1f Mon Sep 17 00:00:00 2001
From: Sanskarzz <sanskar.gur@gmail.com>
Date: Sat, 2 May 2026 21:28:39 +0530
Subject: [PATCH 4/4] removed wrapper and reverted path marshall directly

Signed-off-by: Sanskarzz <sanskar.gur@gmail.com>
---
 .../virtualmcpserver_vmcpconfig.go            | 13 +-------
 .../virtualmcpserver_vmcpconfig_test.go       | 31 ++-----------------
 2 files changed, 4 insertions(+), 40 deletions(-)

diff --git a/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig.go b/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig.go
index 7a37984b56..b4c4956371 100644
--- a/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig.go
+++ b/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig.go
@@ -82,14 +82,9 @@ func (r *VirtualMCPServerReconciler) ensureVmcpConfigConfigMap(
 	}
 
 	// Marshal the serializable Config to YAML for storage in ConfigMap.
-	// RateLimiting is a top-level VirtualMCPServer spec field, not part of
-	// spec.config, so add it only when writing the runtime ConfigMap.
 	// Note: gopkg.in/yaml.v3 produces deterministic output by sorting map keys alphabetically.
 	// This ensures stable checksums for triggering pod rollouts only when content actually changes.
-	vmcpConfigYAML, err := yaml.Marshal(vmcpConfigMapConfig{
-		Config:       *config,
-		RateLimiting: vmcp.Spec.RateLimiting,
-	})
+	vmcpConfigYAML, err := yaml.Marshal(config)
 	if err != nil {
 		return fmt.Errorf("failed to marshal vmcp config: %w", err)
 	}
@@ -135,12 +130,6 @@ func (r *VirtualMCPServerReconciler) ensureVmcpConfigConfigMap(
 	return nil
 }
 
-type vmcpConfigMapConfig struct {
-	vmcpconfig.Config `yaml:",inline"`
-
-	RateLimiting *mcpv1beta1.RateLimitConfig `yaml:"rateLimiting,omitempty"`
-}
-
 // populateOptimizerEmbeddingService wires the EmbeddingServer URL into the optimizer
 // config and emits warnings for non-recommended configurations.
 //
diff --git a/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig_test.go b/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig_test.go
index 5e7c095cc5..2da5a24a45 100644
--- a/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig_test.go
+++ b/cmd/thv-operator/controllers/virtualmcpserver_vmcpconfig_test.go
@@ -459,25 +459,6 @@ func TestEnsureVmcpConfigConfigMap(t *testing.T) {
 		},
 		Spec: mcpv1beta1.VirtualMCPServerSpec{
 			GroupRef: &mcpv1beta1.MCPGroupRef{Name: "test-group"},
-			SessionStorage: &mcpv1beta1.SessionStorageConfig{
-				Provider: mcpv1beta1.SessionStorageProviderRedis,
-				Address:  "redis.default.svc.cluster.local:6379",
-			},
-			RateLimiting: &mcpv1beta1.RateLimitConfig{
-				PerUser: &mcpv1beta1.RateLimitBucket{
-					MaxTokens:    2,
-					RefillPeriod: metav1.Duration{Duration: time.Minute},
-				},
-				Tools: []mcpv1beta1.ToolRateLimitConfig{
-					{
-						Name: "backend_a_echo",
-						Shared: &mcpv1beta1.RateLimitBucket{
-							MaxTokens:    5,
-							RefillPeriod: metav1.Duration{Duration: time.Minute},
-						},
-					},
-				},
-			},
 		},
 	}
 
@@ -527,16 +508,10 @@ func TestEnsureVmcpConfigConfigMap(t *testing.T) {
 	assert.Contains(t, cm.Data, "config.yaml")
 	assert.NotEmpty(t, cm.Annotations["toolhive.stacklok.dev/content-checksum"])
 
-	var cfg vmcpConfigMapConfig
+	var cfg vmcpconfig.Config
 	require.NoError(t, yaml.Unmarshal([]byte(cm.Data["config.yaml"]), &cfg))
-	require.NotNil(t, cfg.RateLimiting, "runtime config must include spec.rateLimiting")
-
-	require.NotNil(t, cfg.RateLimiting.PerUser)
-	assert.EqualValues(t, 2, cfg.RateLimiting.PerUser.MaxTokens)
-	require.Len(t, cfg.RateLimiting.Tools, 1)
-	assert.Equal(t, "backend_a_echo", cfg.RateLimiting.Tools[0].Name)
-	require.NotNil(t, cfg.RateLimiting.Tools[0].Shared)
-	assert.EqualValues(t, 5, cfg.RateLimiting.Tools[0].Shared.MaxTokens)
+	assert.Equal(t, "test-vmcp", cfg.Name)
+	assert.Equal(t, "test-group", cfg.Group)
 }
 
 // TestSetAuthConfigConditions tests that auth config conditions reflect the current state