Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions controllers/actions.github.com/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,9 @@ const DefaultScaleSetListenerLogFormat = string(logging.LogFormatText)
// ownerKey is field selector matching the owner name of a particular resource
const resourceOwnerKey = ".metadata.controller"

// EphemeralRunner pod creation failure reasons
// EphemeralRunner failure reasons
const (
ReasonTooManyPodFailures = "TooManyPodFailures"
ReasonInvalidPodFailure = "InvalidPod"
ReasonTooManyPodFailures = "TooManyPodFailures"
ReasonInvalidPodFailure = "InvalidPod"
ReasonRegistrationInvalidated = "RegistrationInvalidated"
)
89 changes: 89 additions & 0 deletions controllers/actions.github.com/ephemeralrunner_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"net/http"
"strconv"
"strings"
"sync"
"time"

"github.com/actions/actions-runner-controller/apis/actions.github.com/v1alpha1"
Expand All @@ -44,12 +45,22 @@ const (
ephemeralRunnerActionsFinalizerName = "ephemeralrunner.actions.github.com/runner-registration-finalizer"
)

const defaultHealthCheckInterval = 5 * time.Minute

// EphemeralRunnerReconciler reconciles a EphemeralRunner object
type EphemeralRunnerReconciler struct {
client.Client
Log logr.Logger
Scheme *runtime.Scheme
ResourceBuilder

// HealthCheckInterval controls how often the GitHub registration
// health check runs per runner. Defaults to defaultHealthCheckInterval.
HealthCheckInterval time.Duration

// healthCheckLastRun tracks the last time each runner's registration
// was checked, keyed by "namespace/name".
healthCheckLastRun sync.Map
}

// precompute backoff durations for failed ephemeral runners
Expand Down Expand Up @@ -82,6 +93,9 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ

ephemeralRunner := new(v1alpha1.EphemeralRunner)
if err := r.Get(ctx, req.NamespacedName, ephemeralRunner); err != nil {
if client.IgnoreNotFound(err) == nil {
r.healthCheckLastRun.Delete(req.Namespace + "/" + req.Name)
}
return ctrl.Result{}, client.IgnoreNotFound(err)
}

Expand Down Expand Up @@ -356,6 +370,22 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
log.Info("Failed to update ephemeral runner status. Requeue to not miss this event")
return ctrl.Result{}, err
}

// Only check registration when the container is actually running
// (not in Waiting states like ImagePullBackOff or CrashLoopBackOff)
if cs.State.Running != nil {
healthy, err := r.checkRunnerRegistration(ctx, ephemeralRunner, log)
if err != nil {
return ctrl.Result{}, err
}
if !healthy {
if err := r.markAsFailed(ctx, ephemeralRunner, "Runner registration no longer exists on GitHub", ReasonRegistrationInvalidated, log); err != nil {
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}
}

return ctrl.Result{}, nil

case cs.State.Terminated.ExitCode != 0: // failed
Expand Down Expand Up @@ -804,6 +834,60 @@ func (r *EphemeralRunnerReconciler) updateRunStatusFromPod(ctx context.Context,
return nil
}

func (r *EphemeralRunnerReconciler) healthCheckIntervalOrDefault() time.Duration {
if r.HealthCheckInterval > 0 {
return r.HealthCheckInterval
}
return defaultHealthCheckInterval
}

// checkRunnerRegistration verifies the runner's GitHub-side registration is still valid.
// Returns true if the runner is healthy or we can't determine status (API errors).
// Returns false if the registration is confirmed gone (404).
// Calls are throttled to at most once per HealthCheckInterval per runner.
func (r *EphemeralRunnerReconciler) checkRunnerRegistration(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) (bool, error) {
// Only check runners that have been registered (have a RunnerId)
if ephemeralRunner.Status.RunnerId == 0 {
return true, nil
}

// Throttle: skip if we checked this runner recently
key := ephemeralRunner.Namespace + "/" + ephemeralRunner.Name
if lastRun, ok := r.healthCheckLastRun.Load(key); ok {
if time.Since(lastRun.(time.Time)) < r.healthCheckIntervalOrDefault() {
return true, nil
}
}

actionsClient, err := r.GetActionsService(ctx, ephemeralRunner)
if err != nil {
log.Error(err, "Failed to get actions client for health check, skipping")
return true, nil
}

_, err = actionsClient.GetRunner(ctx, int64(ephemeralRunner.Status.RunnerId))
r.healthCheckLastRun.Store(key, time.Now())
if err == nil {
return true, nil
}

var actionsErr *actions.ActionsError
if errors.As(err, &actionsErr) && actionsErr.StatusCode == 404 {
log.Info("Runner registration not found on GitHub, marking as failed",
"runnerId", ephemeralRunner.Status.RunnerId,
"runnerName", ephemeralRunner.Status.RunnerName,
)
return false, nil
}

// For non-404 errors (transient API issues), don't take action
log.Info("Health check API call failed, will retry after health check interval",
"runnerId", ephemeralRunner.Status.RunnerId,
"error", err.Error(),
)
return true, nil
}

func (r *EphemeralRunnerReconciler) deleteRunnerFromService(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) error {
client, err := r.GetActionsService(ctx, ephemeralRunner)
if err != nil {
Expand All @@ -813,6 +897,11 @@ func (r *EphemeralRunnerReconciler) deleteRunnerFromService(ctx context.Context,
log.Info("Removing runner from the service", "runnerId", ephemeralRunner.Status.RunnerId)
err = client.RemoveRunner(ctx, int64(ephemeralRunner.Status.RunnerId))
if err != nil {
var actionsErr *actions.ActionsError
if errors.As(err, &actionsErr) && actionsErr.StatusCode == 404 {
log.Info("Runner already removed from service", "runnerId", ephemeralRunner.Status.RunnerId)
return nil
}
return fmt.Errorf("failed to remove runner from the service: %w", err)
}

Expand Down
214 changes: 214 additions & 0 deletions controllers/actions.github.com/ephemeralrunner_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1422,4 +1422,218 @@ var _ = Describe("EphemeralRunner", func() {
).Should(BeTrue(), "failed to contact server")
})
})

Describe("Stale runner health check", func() {
var ctx context.Context
var mgr ctrl.Manager
var autoscalingNS *corev1.Namespace
var configSecret *corev1.Secret
var controller *EphemeralRunnerReconciler

Context("when GitHub registration is invalidated (404)", func() {
var ephemeralRunner *v1alpha1.EphemeralRunner

BeforeEach(func() {
ctx = context.Background()
autoscalingNS, mgr = createNamespace(GinkgoT(), k8sClient)
configSecret = createDefaultSecret(GinkgoT(), k8sClient, autoscalingNS.Name)

// GetRunner returns 404 — simulates GitHub forgetting the registration
// RemoveRunner also returns 404 — the runner is already gone
fakeClient := fake.NewFakeClient(
fake.WithGetRunner(nil, &actions.ActionsError{
StatusCode: 404,
Err: &actions.ActionsExceptionError{
ExceptionName: "AgentNotFoundException",
Message: "runner not found",
},
}),
fake.WithRemoveRunnerError(&actions.ActionsError{
StatusCode: 404,
Err: &actions.ActionsExceptionError{
ExceptionName: "AgentNotFoundException",
Message: "runner not found",
},
}),
)

controller = &EphemeralRunnerReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: logf.Log,
HealthCheckInterval: 1 * time.Second,
ResourceBuilder: ResourceBuilder{
SecretResolver: &SecretResolver{
k8sClient: mgr.GetClient(),
multiClient: fake.NewMultiClient(fake.WithDefaultClient(fakeClient, nil)),
},
},
}

err := controller.SetupWithManager(mgr)
Expect(err).To(BeNil(), "failed to setup controller")

ephemeralRunner = newExampleRunner("stale-runner", autoscalingNS.Name, configSecret.Name)
err = k8sClient.Create(ctx, ephemeralRunner)
Expect(err).To(BeNil(), "failed to create ephemeral runner")

startManagers(GinkgoT(), mgr)
})

It("should mark the runner as failed when GetRunner returns 404", func() {
// Wait for the controller to set up the runner (finalizers, secret, pod, status)
Eventually(
func() (int, error) {
updated := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
if err != nil {
return 0, err
}
return updated.Status.RunnerId, nil
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).ShouldNot(Equal(0), "runner should have a RunnerId set")

// Wait for the pod to exist, then simulate a running container
pod := new(corev1.Pod)
Eventually(
func() (bool, error) {
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
return false, err
}
return true, nil
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(BeEquivalentTo(true), "pod should exist")

// Set pod to running with a running container status
pod.Status.Phase = corev1.PodRunning
pod.Status.ContainerStatuses = []corev1.ContainerStatus{
{
Name: v1alpha1.EphemeralRunnerContainerName,
State: corev1.ContainerState{
Running: &corev1.ContainerStateRunning{
StartedAt: metav1.Now(),
},
},
},
}
err := k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "failed to update pod status to running")

// Now the health check should detect the stale registration and mark it failed
Eventually(
func() (corev1.PodPhase, error) {
updated := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
if err != nil {
return "", err
}
return updated.Status.Phase, nil
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(Equal(corev1.PodFailed), "runner with invalidated registration should be marked failed")
})
})

Context("when GetRunner returns a transient error (500)", func() {
var ephemeralRunner *v1alpha1.EphemeralRunner

BeforeEach(func() {
ctx = context.Background()
autoscalingNS, mgr = createNamespace(GinkgoT(), k8sClient)
configSecret = createDefaultSecret(GinkgoT(), k8sClient, autoscalingNS.Name)

// GetRunner returns 500 — transient error, should NOT kill the runner
fakeClient := fake.NewFakeClient(
fake.WithGetRunner(nil, &actions.ActionsError{
StatusCode: 500,
Err: fmt.Errorf("internal server error"),
}),
)

controller = &EphemeralRunnerReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: logf.Log,
HealthCheckInterval: 1 * time.Second,
ResourceBuilder: ResourceBuilder{
SecretResolver: &SecretResolver{
k8sClient: mgr.GetClient(),
multiClient: fake.NewMultiClient(fake.WithDefaultClient(fakeClient, nil)),
},
},
}

err := controller.SetupWithManager(mgr)
Expect(err).To(BeNil(), "failed to setup controller")

ephemeralRunner = newExampleRunner("transient-error-runner", autoscalingNS.Name, configSecret.Name)
err = k8sClient.Create(ctx, ephemeralRunner)
Expect(err).To(BeNil(), "failed to create ephemeral runner")

startManagers(GinkgoT(), mgr)
})

It("should NOT mark the runner as failed on transient errors", func() {
// Wait for runner to get a RunnerId
Eventually(
func() (int, error) {
updated := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
if err != nil {
return 0, err
}
return updated.Status.RunnerId, nil
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).ShouldNot(Equal(0), "runner should have a RunnerId set")

// Wait for pod and set it to running (same as the 404 test)
pod := new(corev1.Pod)
Eventually(
func() (bool, error) {
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
return false, err
}
return true, nil
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(BeEquivalentTo(true), "pod should exist")

pod.Status.Phase = corev1.PodRunning
pod.Status.ContainerStatuses = []corev1.ContainerStatus{
{
Name: v1alpha1.EphemeralRunnerContainerName,
State: corev1.ContainerState{
Running: &corev1.ContainerStateRunning{
StartedAt: metav1.Now(),
},
},
},
}
err := k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "failed to update pod status to running")

// Give the controller time to process — it should NOT mark as failed
Consistently(
func() (corev1.PodPhase, error) {
updated := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
if err != nil {
return "", err
}
return updated.Status.Phase, nil
},
time.Second*3,
ephemeralRunnerInterval,
).ShouldNot(Equal(corev1.PodFailed), "runner should NOT be marked failed on transient errors")
})
})
})
})
6 changes: 6 additions & 0 deletions github/actions/fake/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,12 @@ func WithUpdateRunnerScaleSet(scaleSet *actions.RunnerScaleSet, err error) Optio
}
}

func WithRemoveRunnerError(err error) Option {
return func(f *FakeClient) {
f.removeRunnerResult.err = err
}
}

var defaultRunnerScaleSet = &actions.RunnerScaleSet{
Id: 1,
Name: "testset",
Expand Down