diff --git a/controllers/actions.github.com/constants.go b/controllers/actions.github.com/constants.go index ad1841fca2..6b4f76909e 100644 --- a/controllers/actions.github.com/constants.go +++ b/controllers/actions.github.com/constants.go @@ -75,8 +75,9 @@ const DefaultScaleSetListenerLogFormat = string(logging.LogFormatText) // ownerKey is field selector matching the owner name of a particular resource const resourceOwnerKey = ".metadata.controller" -// EphemeralRunner pod creation failure reasons +// EphemeralRunner failure reasons const ( - ReasonTooManyPodFailures = "TooManyPodFailures" - ReasonInvalidPodFailure = "InvalidPod" + ReasonTooManyPodFailures = "TooManyPodFailures" + ReasonInvalidPodFailure = "InvalidPod" + ReasonRegistrationInvalidated = "RegistrationInvalidated" ) diff --git a/controllers/actions.github.com/ephemeralrunner_controller.go b/controllers/actions.github.com/ephemeralrunner_controller.go index 4d0d978f8a..4282b7416f 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller.go +++ b/controllers/actions.github.com/ephemeralrunner_controller.go @@ -23,6 +23,7 @@ import ( "net/http" "strconv" "strings" + "sync" "time" "github.com/actions/actions-runner-controller/apis/actions.github.com/v1alpha1" @@ -44,12 +45,22 @@ const ( ephemeralRunnerActionsFinalizerName = "ephemeralrunner.actions.github.com/runner-registration-finalizer" ) +const defaultHealthCheckInterval = 5 * time.Minute + // EphemeralRunnerReconciler reconciles a EphemeralRunner object type EphemeralRunnerReconciler struct { client.Client Log logr.Logger Scheme *runtime.Scheme ResourceBuilder + + // HealthCheckInterval controls how often the GitHub registration + // health check runs per runner. Defaults to defaultHealthCheckInterval. + HealthCheckInterval time.Duration + + // healthCheckLastRun tracks the last time each runner's registration + // was checked, keyed by "namespace/name". + healthCheckLastRun sync.Map } // precompute backoff durations for failed ephemeral runners @@ -82,6 +93,9 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ ephemeralRunner := new(v1alpha1.EphemeralRunner) if err := r.Get(ctx, req.NamespacedName, ephemeralRunner); err != nil { + if client.IgnoreNotFound(err) == nil { + r.healthCheckLastRun.Delete(req.Namespace + "/" + req.Name) + } return ctrl.Result{}, client.IgnoreNotFound(err) } @@ -356,6 +370,22 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ log.Info("Failed to update ephemeral runner status. Requeue to not miss this event") return ctrl.Result{}, err } + + // Only check registration when the container is actually running + // (not in Waiting states like ImagePullBackOff or CrashLoopBackOff) + if cs.State.Running != nil { + healthy, err := r.checkRunnerRegistration(ctx, ephemeralRunner, log) + if err != nil { + return ctrl.Result{}, err + } + if !healthy { + if err := r.markAsFailed(ctx, ephemeralRunner, "Runner registration no longer exists on GitHub", ReasonRegistrationInvalidated, log); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, nil + } + } + return ctrl.Result{}, nil case cs.State.Terminated.ExitCode != 0: // failed @@ -804,6 +834,60 @@ func (r *EphemeralRunnerReconciler) updateRunStatusFromPod(ctx context.Context, return nil } +func (r *EphemeralRunnerReconciler) healthCheckIntervalOrDefault() time.Duration { + if r.HealthCheckInterval > 0 { + return r.HealthCheckInterval + } + return defaultHealthCheckInterval +} + +// checkRunnerRegistration verifies the runner's GitHub-side registration is still valid. +// Returns true if the runner is healthy or we can't determine status (API errors). +// Returns false if the registration is confirmed gone (404). +// Calls are throttled to at most once per HealthCheckInterval per runner. +func (r *EphemeralRunnerReconciler) checkRunnerRegistration(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) (bool, error) { + // Only check runners that have been registered (have a RunnerId) + if ephemeralRunner.Status.RunnerId == 0 { + return true, nil + } + + // Throttle: skip if we checked this runner recently + key := ephemeralRunner.Namespace + "/" + ephemeralRunner.Name + if lastRun, ok := r.healthCheckLastRun.Load(key); ok { + if time.Since(lastRun.(time.Time)) < r.healthCheckIntervalOrDefault() { + return true, nil + } + } + + actionsClient, err := r.GetActionsService(ctx, ephemeralRunner) + if err != nil { + log.Error(err, "Failed to get actions client for health check, skipping") + return true, nil + } + + _, err = actionsClient.GetRunner(ctx, int64(ephemeralRunner.Status.RunnerId)) + r.healthCheckLastRun.Store(key, time.Now()) + if err == nil { + return true, nil + } + + var actionsErr *actions.ActionsError + if errors.As(err, &actionsErr) && actionsErr.StatusCode == 404 { + log.Info("Runner registration not found on GitHub, marking as failed", + "runnerId", ephemeralRunner.Status.RunnerId, + "runnerName", ephemeralRunner.Status.RunnerName, + ) + return false, nil + } + + // For non-404 errors (transient API issues), don't take action + log.Info("Health check API call failed, will retry after health check interval", + "runnerId", ephemeralRunner.Status.RunnerId, + "error", err.Error(), + ) + return true, nil +} + func (r *EphemeralRunnerReconciler) deleteRunnerFromService(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) error { client, err := r.GetActionsService(ctx, ephemeralRunner) if err != nil { @@ -813,6 +897,11 @@ func (r *EphemeralRunnerReconciler) deleteRunnerFromService(ctx context.Context, log.Info("Removing runner from the service", "runnerId", ephemeralRunner.Status.RunnerId) err = client.RemoveRunner(ctx, int64(ephemeralRunner.Status.RunnerId)) if err != nil { + var actionsErr *actions.ActionsError + if errors.As(err, &actionsErr) && actionsErr.StatusCode == 404 { + log.Info("Runner already removed from service", "runnerId", ephemeralRunner.Status.RunnerId) + return nil + } return fmt.Errorf("failed to remove runner from the service: %w", err) } diff --git a/controllers/actions.github.com/ephemeralrunner_controller_test.go b/controllers/actions.github.com/ephemeralrunner_controller_test.go index 5ee0df0328..bc1db8bde7 100644 --- a/controllers/actions.github.com/ephemeralrunner_controller_test.go +++ b/controllers/actions.github.com/ephemeralrunner_controller_test.go @@ -1422,4 +1422,218 @@ var _ = Describe("EphemeralRunner", func() { ).Should(BeTrue(), "failed to contact server") }) }) + + Describe("Stale runner health check", func() { + var ctx context.Context + var mgr ctrl.Manager + var autoscalingNS *corev1.Namespace + var configSecret *corev1.Secret + var controller *EphemeralRunnerReconciler + + Context("when GitHub registration is invalidated (404)", func() { + var ephemeralRunner *v1alpha1.EphemeralRunner + + BeforeEach(func() { + ctx = context.Background() + autoscalingNS, mgr = createNamespace(GinkgoT(), k8sClient) + configSecret = createDefaultSecret(GinkgoT(), k8sClient, autoscalingNS.Name) + + // GetRunner returns 404 — simulates GitHub forgetting the registration + // RemoveRunner also returns 404 — the runner is already gone + fakeClient := fake.NewFakeClient( + fake.WithGetRunner(nil, &actions.ActionsError{ + StatusCode: 404, + Err: &actions.ActionsExceptionError{ + ExceptionName: "AgentNotFoundException", + Message: "runner not found", + }, + }), + fake.WithRemoveRunnerError(&actions.ActionsError{ + StatusCode: 404, + Err: &actions.ActionsExceptionError{ + ExceptionName: "AgentNotFoundException", + Message: "runner not found", + }, + }), + ) + + controller = &EphemeralRunnerReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Log: logf.Log, + HealthCheckInterval: 1 * time.Second, + ResourceBuilder: ResourceBuilder{ + SecretResolver: &SecretResolver{ + k8sClient: mgr.GetClient(), + multiClient: fake.NewMultiClient(fake.WithDefaultClient(fakeClient, nil)), + }, + }, + } + + err := controller.SetupWithManager(mgr) + Expect(err).To(BeNil(), "failed to setup controller") + + ephemeralRunner = newExampleRunner("stale-runner", autoscalingNS.Name, configSecret.Name) + err = k8sClient.Create(ctx, ephemeralRunner) + Expect(err).To(BeNil(), "failed to create ephemeral runner") + + startManagers(GinkgoT(), mgr) + }) + + It("should mark the runner as failed when GetRunner returns 404", func() { + // Wait for the controller to set up the runner (finalizers, secret, pod, status) + Eventually( + func() (int, error) { + updated := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) + if err != nil { + return 0, err + } + return updated.Status.RunnerId, nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).ShouldNot(Equal(0), "runner should have a RunnerId set") + + // Wait for the pod to exist, then simulate a running container + pod := new(corev1.Pod) + Eventually( + func() (bool, error) { + if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil { + return false, err + } + return true, nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(BeEquivalentTo(true), "pod should exist") + + // Set pod to running with a running container status + pod.Status.Phase = corev1.PodRunning + pod.Status.ContainerStatuses = []corev1.ContainerStatus{ + { + Name: v1alpha1.EphemeralRunnerContainerName, + State: corev1.ContainerState{ + Running: &corev1.ContainerStateRunning{ + StartedAt: metav1.Now(), + }, + }, + }, + } + err := k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "failed to update pod status to running") + + // Now the health check should detect the stale registration and mark it failed + Eventually( + func() (corev1.PodPhase, error) { + updated := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) + if err != nil { + return "", err + } + return updated.Status.Phase, nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(Equal(corev1.PodFailed), "runner with invalidated registration should be marked failed") + }) + }) + + Context("when GetRunner returns a transient error (500)", func() { + var ephemeralRunner *v1alpha1.EphemeralRunner + + BeforeEach(func() { + ctx = context.Background() + autoscalingNS, mgr = createNamespace(GinkgoT(), k8sClient) + configSecret = createDefaultSecret(GinkgoT(), k8sClient, autoscalingNS.Name) + + // GetRunner returns 500 — transient error, should NOT kill the runner + fakeClient := fake.NewFakeClient( + fake.WithGetRunner(nil, &actions.ActionsError{ + StatusCode: 500, + Err: fmt.Errorf("internal server error"), + }), + ) + + controller = &EphemeralRunnerReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Log: logf.Log, + HealthCheckInterval: 1 * time.Second, + ResourceBuilder: ResourceBuilder{ + SecretResolver: &SecretResolver{ + k8sClient: mgr.GetClient(), + multiClient: fake.NewMultiClient(fake.WithDefaultClient(fakeClient, nil)), + }, + }, + } + + err := controller.SetupWithManager(mgr) + Expect(err).To(BeNil(), "failed to setup controller") + + ephemeralRunner = newExampleRunner("transient-error-runner", autoscalingNS.Name, configSecret.Name) + err = k8sClient.Create(ctx, ephemeralRunner) + Expect(err).To(BeNil(), "failed to create ephemeral runner") + + startManagers(GinkgoT(), mgr) + }) + + It("should NOT mark the runner as failed on transient errors", func() { + // Wait for runner to get a RunnerId + Eventually( + func() (int, error) { + updated := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) + if err != nil { + return 0, err + } + return updated.Status.RunnerId, nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).ShouldNot(Equal(0), "runner should have a RunnerId set") + + // Wait for pod and set it to running (same as the 404 test) + pod := new(corev1.Pod) + Eventually( + func() (bool, error) { + if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil { + return false, err + } + return true, nil + }, + ephemeralRunnerTimeout, + ephemeralRunnerInterval, + ).Should(BeEquivalentTo(true), "pod should exist") + + pod.Status.Phase = corev1.PodRunning + pod.Status.ContainerStatuses = []corev1.ContainerStatus{ + { + Name: v1alpha1.EphemeralRunnerContainerName, + State: corev1.ContainerState{ + Running: &corev1.ContainerStateRunning{ + StartedAt: metav1.Now(), + }, + }, + }, + } + err := k8sClient.Status().Update(ctx, pod) + Expect(err).To(BeNil(), "failed to update pod status to running") + + // Give the controller time to process — it should NOT mark as failed + Consistently( + func() (corev1.PodPhase, error) { + updated := new(v1alpha1.EphemeralRunner) + err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated) + if err != nil { + return "", err + } + return updated.Status.Phase, nil + }, + time.Second*3, + ephemeralRunnerInterval, + ).ShouldNot(Equal(corev1.PodFailed), "runner should NOT be marked failed on transient errors") + }) + }) + }) }) diff --git a/github/actions/fake/client.go b/github/actions/fake/client.go index a108b902ac..9df026e5c1 100644 --- a/github/actions/fake/client.go +++ b/github/actions/fake/client.go @@ -45,6 +45,12 @@ func WithUpdateRunnerScaleSet(scaleSet *actions.RunnerScaleSet, err error) Optio } } +func WithRemoveRunnerError(err error) Option { + return func(f *FakeClient) { + f.removeRunnerResult.err = err + } +} + var defaultRunnerScaleSet = &actions.RunnerScaleSet{ Id: 1, Name: "testset",