diff --git a/Sources/ContainerCommands/Builder/BuilderStart.swift b/Sources/ContainerCommands/Builder/BuilderStart.swift index 7baa82c7a..cea64fdb6 100644 --- a/Sources/ContainerCommands/Builder/BuilderStart.swift +++ b/Sources/ContainerCommands/Builder/BuilderStart.swift @@ -173,7 +173,7 @@ extension Application { // If they changed, stop and delete the existing builder try await client.stop(id: existingContainer.id) try await client.delete(id: existingContainer.id) - case .stopped: + case .stopped, .bootstrapped: // If the builder is stopped and matches our requirements, start it // Otherwise, delete it and create a new one guard imageChanged || cpuChanged || memChanged || envChanged || dnsChanged else { @@ -186,7 +186,7 @@ extension Application { .invalidState, message: "builder is stopping, please wait until it is fully stopped before proceeding" ) - case .unknown: + case .unknown, .restarting: break } } diff --git a/Sources/ContainerCommands/Container/ContainerCreate.swift b/Sources/ContainerCommands/Container/ContainerCreate.swift index ac26d205a..b48642c65 100644 --- a/Sources/ContainerCommands/Container/ContainerCreate.swift +++ b/Sources/ContainerCommands/Container/ContainerCreate.swift @@ -82,7 +82,7 @@ extension Application { log: log ) - let options = ContainerCreateOptions(autoRemove: managementFlags.remove) + let options = ContainerCreateOptions(autoRemove: managementFlags.remove, restartPolicy: managementFlags.restart) let client = ContainerClient() try await client.create(configuration: ck.0, options: options, kernel: ck.1, initImage: ck.2) diff --git a/Sources/ContainerCommands/Container/ContainerList.swift b/Sources/ContainerCommands/Container/ContainerList.swift index 53b110bfa..c64b14355 100644 --- a/Sources/ContainerCommands/Container/ContainerList.swift +++ b/Sources/ContainerCommands/Container/ContainerList.swift @@ -99,7 +99,7 @@ extension ContainerSnapshot { } struct PrintableContainer: Codable { - let status: RuntimeStatus + let status: 
ContainerStatus let configuration: ContainerConfiguration let networks: [Attachment] let startedDate: Date? diff --git a/Sources/ContainerCommands/Container/ContainerRun.swift b/Sources/ContainerCommands/Container/ContainerRun.swift index c83fbf790..872d39de1 100644 --- a/Sources/ContainerCommands/Container/ContainerRun.swift +++ b/Sources/ContainerCommands/Container/ContainerRun.swift @@ -109,7 +109,7 @@ extension Application { progress.set(description: "Starting container") - let options = ContainerCreateOptions(autoRemove: managementFlags.remove) + let options = ContainerCreateOptions(autoRemove: managementFlags.remove, restartPolicy: managementFlags.restart) try await client.create( configuration: ck.0, options: options, diff --git a/Sources/ContainerResource/Container/ContainerCreateOptions.swift b/Sources/ContainerResource/Container/ContainerCreateOptions.swift index dd9da217a..af6edf24b 100644 --- a/Sources/ContainerResource/Container/ContainerCreateOptions.swift +++ b/Sources/ContainerResource/Container/ContainerCreateOptions.swift @@ -14,13 +14,32 @@ // limitations under the License. 
//===----------------------------------------------------------------------===// +public enum RestartPolicy: String, Sendable, Codable { + case no + case onFailure + case always +} + public struct ContainerCreateOptions: Codable, Sendable { public let autoRemove: Bool + public let restartPolicy: RestartPolicy - public init(autoRemove: Bool) { + public init(autoRemove: Bool, restartPolicy: RestartPolicy) { self.autoRemove = autoRemove + self.restartPolicy = restartPolicy + } + + public static let `default` = ContainerCreateOptions(autoRemove: false, restartPolicy: .no) + + enum CodingKeys: String, CodingKey { + case autoRemove + case restartPolicy } - public static let `default` = ContainerCreateOptions(autoRemove: false) + public init(from decoder: Decoder) throws { + let container = try decoder.container(keyedBy: CodingKeys.self) + autoRemove = try container.decode(Bool.self, forKey: .autoRemove) + restartPolicy = try container.decodeIfPresent(RestartPolicy.self, forKey: .restartPolicy) ?? .no + } } diff --git a/Sources/ContainerResource/Container/ContainerListFilters.swift b/Sources/ContainerResource/Container/ContainerListFilters.swift index 038b76d7d..ab2273f3f 100644 --- a/Sources/ContainerResource/Container/ContainerListFilters.swift +++ b/Sources/ContainerResource/Container/ContainerListFilters.swift @@ -21,7 +21,7 @@ public struct ContainerListFilters: Sendable, Codable { /// Filter by container IDs. If non-empty, only containers with matching IDs are returned. public var ids: [String] /// Filter by container status. - public var status: RuntimeStatus? + public var status: ContainerStatus? /// Filter by labels. All specified labels must match. public var labels: [String: String] @@ -30,7 +30,7 @@ public struct ContainerListFilters: Sendable, Codable { public init( ids: [String] = [], - status: RuntimeStatus? = nil, + status: ContainerStatus? 
= nil, labels: [String: String] = [:] ) { self.ids = ids diff --git a/Sources/ContainerResource/Container/ContainerSnapshot.swift b/Sources/ContainerResource/Container/ContainerSnapshot.swift index bae992423..11efc6c04 100644 --- a/Sources/ContainerResource/Container/ContainerSnapshot.swift +++ b/Sources/ContainerResource/Container/ContainerSnapshot.swift @@ -34,7 +34,7 @@ public struct ContainerSnapshot: Codable, Sendable { } /// The runtime status of the container. - public var status: RuntimeStatus + public var status: ContainerStatus /// Network interfaces attached to the sandbox that are provided to the container. public var networks: [Attachment] /// When the container was started. @@ -42,7 +42,7 @@ public struct ContainerSnapshot: Codable, Sendable { public init( configuration: ContainerConfiguration, - status: RuntimeStatus, + status: ContainerStatus, networks: [Attachment], startedDate: Date? = nil ) { diff --git a/Sources/ContainerResource/Container/ContainerStatus.swift b/Sources/ContainerResource/Container/ContainerStatus.swift new file mode 100644 index 000000000..9b9423068 --- /dev/null +++ b/Sources/ContainerResource/Container/ContainerStatus.swift @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the container project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//===----------------------------------------------------------------------===//
+
+import Foundation
+
+/// Runtime status for a container.
+public enum ContainerStatus: String, CaseIterable, Sendable, Codable {
+    /// The container is in an unknown status.
+    case unknown
+    /// The container is currently stopped.
+    case stopped
+    /// The container is waiting to be restarted.
+    case restarting
+    /// The container is currently bootstrapped.
+    case bootstrapped
+    /// The container is currently running.
+    case running
+    /// The container is currently stopping.
+    case stopping
+}
diff --git a/Sources/ContainerResource/Container/RuntimeStatus.swift b/Sources/ContainerResource/Container/SandboxStatus.swift
similarity index 94%
rename from Sources/ContainerResource/Container/RuntimeStatus.swift
rename to Sources/ContainerResource/Container/SandboxStatus.swift
index 88900735f..97dd27d70 100644
--- a/Sources/ContainerResource/Container/RuntimeStatus.swift
+++ b/Sources/ContainerResource/Container/SandboxStatus.swift
@@ -17,7 +17,7 @@
 import Foundation
 
 /// Runtime status for a sandbox or container.
-public enum RuntimeStatus: String, CaseIterable, Sendable, Codable {
+public enum SandboxStatus: String, CaseIterable, Sendable, Codable {
     /// The object is in an unknown status.
     case unknown
     /// The object is currently stopped.
diff --git a/Sources/Helpers/APIServer/APIServer+Start.swift b/Sources/Helpers/APIServer/APIServer+Start.swift
index b4df3b09f..45af7f81e 100644
--- a/Sources/Helpers/APIServer/APIServer+Start.swift
+++ b/Sources/Helpers/APIServer/APIServer+Start.swift
@@ -61,7 +61,7 @@ extension APIServer {
             var routes = [XPCRoute: XPCServer.RouteHandler]()
             let pluginLoader = try initializePluginLoader(log: log)
             try await initializePlugins(pluginLoader: pluginLoader, log: log, routes: &routes)
-            let containersService = try initializeContainersService(
+            let containersService = try await initializeContainersService(
                 pluginLoader: pluginLoader,
                 log: log,
                 routes: &routes
@@ -261,7 +261,7 @@ extension APIServer {
         routes[XPCRoute.getDefaultKernel] = harness.getDefaultKernel
     }
 
-    private func initializeContainersService(pluginLoader: PluginLoader, log: Logger, routes: inout [XPCRoute: XPCServer.RouteHandler]) throws -> ContainersService {
+    private func initializeContainersService(pluginLoader: PluginLoader, log: Logger, routes: inout [XPCRoute: XPCServer.RouteHandler]) async throws -> ContainersService {
         log.info("initializing containers service")
 
         let service = try ContainersService(
@@ -288,6 +288,9 @@ extension APIServer {
         routes[XPCRoute.containerDiskUsage] = harness.diskUsage
         routes[XPCRoute.containerExport] = harness.export
 
+        // Await directly: an unused `async let` would discard any error thrown while starting the scheduler.
+        try await service.runRestartScheduler()
+
         return service
     }
 
diff --git a/Sources/Services/ContainerAPIService/Client/Flags.swift b/Sources/Services/ContainerAPIService/Client/Flags.swift
index 88de209f9..018d2a846 100644
--- a/Sources/Services/ContainerAPIService/Client/Flags.swift
+++ b/Sources/Services/ContainerAPIService/Client/Flags.swift
@@ -15,9 +15,12 @@
 //===----------------------------------------------------------------------===//
 
 import ArgumentParser
+import ContainerResource
 import ContainerizationError
 import Foundation
 
+extension RestartPolicy: ExpressibleByArgument {}
+
 public struct Flags {
     public struct Logging: ParsableArguments {
public init() {}
@@ -304,6 +307,9 @@ public struct Flags {
         @Flag(name: [.customLong("rm"), .long], help: "Remove the container after it stops")
         public var remove = false
 
+        @Option(name: .long, help: "Restart policy when the container exits")
+        public var restart: RestartPolicy = .no
+
         @Flag(name: .long, help: "Enable Rosetta in the container")
         public var rosetta = false
 
diff --git a/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift b/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift
index 3c8247770..004ac51d2 100644
--- a/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift
+++ b/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift
@@ -31,11 +31,24 @@ import Logging
 import SystemPackage
 
 public actor ContainersService {
+    struct StoppedState {
+        var startError: Error?
+        var exitStatus: ExitStatus?
+    }
+
     struct ContainerState {
+        private static let initialBackOff = Duration.milliseconds(100)
+        private static let maxBackOff = Duration.seconds(10)
+        public static let stabilityCall = Duration.seconds(10)
+
         var snapshot: ContainerSnapshot
         var client: SandboxClient?
         var allocatedAttachments: [AllocatedAttachment]
 
+        var stoppedState: StoppedState?
+        var manualStopped: Bool = false
+        var backOff: Duration?
+
         func getClient() throws -> SandboxClient {
             guard let client else {
                 var message = "no sandbox client exists"
@@ -46,11 +59,32 @@ public actor ContainersService {
             }
             return client
         }
+
+        mutating func setStartError(error: Error) {
+            stoppedState = StoppedState(startError: error)
+            backOff = nil
+        }
+
+        mutating func setExitStatus(exitStatus: ExitStatus?, restartPolicy: RestartPolicy) {
+            stoppedState = StoppedState(exitStatus: exitStatus)
+            switch restartPolicy {
+            case .onFailure where !manualStopped && (exitStatus?.exitCode ?? 0) != 0,
+                .always where !manualStopped:
+                backOff = backOff.map { min($0 * 2, Self.maxBackOff) } ?? Self.initialBackOff
+            default:
+                backOff = nil
+            }
+        }
     }
 
     private static let machServicePrefix = "com.apple.container"
     private static let launchdDomainString = try! ServiceManager.getDomainString()
 
+    private let exitQueue: AsyncStream<String>
+    private let exitQueueContinuation: AsyncStream<String>.Continuation
+    private var restartScheduler: Task<Void, Never>?
+    private var stabilityMonitor: ExitMonitor?
+
     private let log: Logger
     private let debugHelpers: Bool
     private let containerRoot: URL
@@ -79,7 +113,13 @@ public actor ContainersService {
         self.log = log
         self.debugHelpers = debugHelpers
         self.runtimePlugins = pluginLoader.findPlugins().filter { $0.hasType(.runtime) }
+
+        (self.exitQueue, self.exitQueueContinuation) = AsyncStream.makeStream(of: String.self)
         self.containers = try Self.loadAtBoot(root: containerRoot, loader: pluginLoader, log: log)
+
+        for id in self.containers.keys {
+            self.exitQueueContinuation.yield(id)
+        }
     }
 
     public func setNetworksService(_ service: NetworksService) async {
@@ -108,7 +148,7 @@ public actor ContainersService {
                         networks: [],
                         startedDate: nil
                     ),
-                    allocatedAttachments: []
+                    allocatedAttachments: [],
                 )
                 results[config.id] = state
                 guard runtimePlugins.first(where: { $0.name == config.runtimeHandler }) != nil else {
@@ -130,6 +170,84 @@ public actor ContainersService {
         return results
     }
 
+    /// Starts the task that consumes queued container IDs and restarts those whose restart policy calls for it.
+    ///
+    /// - Throws: `ContainerizationError` with `.invalidState` if the scheduler is already running.
+    public func runRestartScheduler() throws {
+        log.debug("ContainersService: enter", metadata: ["func": "\(#function)"])
+        defer {
+            log.debug(
+                "ContainersService: exit",
+                metadata: ["func": "\(#function)"]
+            )
+        }
+
+        guard restartScheduler == nil else {
+            throw ContainerizationError(.invalidState, message: "already running restart scheduler")
+        }
+
+        stabilityMonitor = ExitMonitor(log: log)
+        restartScheduler = Task {
+            for await id in self.exitQueue {
+                Task {
+                    do {
+                        await stabilityMonitor?.stopTracking(id: id)
+
+                        let state = try self._getContainerState(id: id)
+                        let options = try self.getContainerCreationOptions(id: id)
+                        guard options.autoRemove == false else {
+                            return
+                        }
+
+                        let startFailed = state.stoppedState?.startError != nil
+                        let exitedWithError = (state.stoppedState?.exitStatus?.exitCode ?? 0) != 0
+                        let manualStopped = state.manualStopped
+
+                        switch options.restartPolicy {
+                        case .onFailure where !startFailed && !manualStopped && exitedWithError:
+                            break
+                        case .always where !startFailed && !manualStopped:
+                            break
+                        default:
+                            return
+                        }
+
+                        try await restart(id: id)
+                        if let backOff = state.backOff {
+                            try await Task.sleep(for: backOff)
+                        }
+
+                        guard (try self._getContainerState(id: id)).snapshot.status == .restarting else {
+                            return
+                        }
+
+                        try await stabilityMonitor?.registerProcess(
+                            id: id,
+                            onExit: { id, code in
+                                guard code.exitCode == 0 else {
+                                    return
+                                }
+                                try? await self.resetBackOff(id: id)
+                            }
+                        )
+                        try await stabilityMonitor?.track(id: id) {
+                            try await Task.sleep(for: ContainerState.stabilityCall)
+                            return ExitStatus(exitCode: 0)
+                        }
+
+                        try await bootstrap(id: id, stdio: [FileHandle?](repeating: nil, count: 3))
+                        try await startProcess(id: id, processID: id)
+                    } catch {
+                        // Log before cleanup so that a failing kill cannot hide the original restart error.
+                        log.error("failed to restart container", metadata: ["id": "\(id)", "error": "\(error)"])
+                        // Best effort: kill marks the container as manually stopped, so the scheduler will not retry it.
+                        try? await kill(id: id, processID: id, signal: Int64(SIGKILL))
+                    }
+                }
+            }
+        }
+    }
+
     /// List containers matching the given filters.
public func list(filters: ContainerListFilters = .all) async throws -> [ContainerSnapshot] { log.debug( @@ -370,7 +488,10 @@ public actor ContainersService { networks: [], startedDate: nil ) - await self.setContainerState(configuration.id, ContainerState(snapshot: snapshot, allocatedAttachments: []), context: context) + await self.setContainerState( + configuration.id, + ContainerState(snapshot: snapshot, allocatedAttachments: []), + context: context) } catch { throw error } @@ -462,6 +583,7 @@ public actor ContainersService { state.client = sandboxClient state.allocatedAttachments = allocatedAttachments + state.snapshot.status = .bootstrapped await self.setContainerState(id, state, context: context) } catch { for allocatedAttach in allocatedAttachments { @@ -485,6 +607,10 @@ public actor ContainersService { await self.exitMonitor.stopTracking(id: id) try? ServiceManager.deregister(fullServiceLabel: label) + + state.setStartError(error: error) + await self.setContainerState(id, state, context: context) + throw error } } @@ -587,6 +713,10 @@ public actor ContainersService { } catch { await self.exitMonitor.stopTracking(id: id) try? 
await client.stop(options: ContainerStopOptions.default) + + state.setStartError(error: error) + await self.setContainerState(id, state, context: context) + throw error } } @@ -614,9 +744,15 @@ public actor ContainersService { ) } - let state = try self._getContainerState(id: id) - let client = try state.getClient() - try await client.kill(processID, signal: signal) + try await self.lock.withLock(logMetadata: ["acquirer": "\(#function)", "id": "\(id)"]) { context in + var state = try await self.getContainerState(id: id, context: context) + + state.manualStopped = true + await self.setContainerState(id, state, context: context) + + let client = try state.getClient() + try await client.kill(processID, signal: signal) + } } /// Stop all containers inside the sandbox, aborting any processes currently @@ -639,24 +775,30 @@ public actor ContainersService { ) } - let state = try self._getContainerState(id: id) + try await self.lock.withLock(logMetadata: ["acquirer": "\(#function)", "id": "\(id)"]) { context in + var state = try await self.getContainerState(id: id, context: context) - // Stop should be idempotent. - let client: SandboxClient - do { - client = try state.getClient() - } catch { - return - } + state.manualStopped = true + await self.setContainerState(id, state, context: context) - do { - try await client.stop(options: options) - } catch let err as ContainerizationError { - if err.code != .interrupted { - throw err + // Stop should be idempotent. 
+ let client: SandboxClient + do { + client = try state.getClient() + } catch { + return + } + + do { + try await client.stop(options: options) + } catch let err as ContainerizationError { + if err.code != .interrupted { + throw err + } } + + try await self.handleContainerExit(id: id, code: nil, context: context) } - try await handleContainerExit(id: id) } public func dial(id: String, port: UInt32) async throws -> FileHandle { @@ -989,15 +1131,43 @@ public actor ContainersService { } } + let options = try getContainerCreationOptions(id: id) + state.snapshot.status = .stopped state.snapshot.networks = [] state.client = nil state.allocatedAttachments = [] + state.setExitStatus(exitStatus: code, restartPolicy: options.restartPolicy) await self.setContainerState(id, state, context: context) - let options = try getContainerCreationOptions(id: id) if options.autoRemove { try await self.cleanUp(id: id, context: context) + } else { + exitQueueContinuation.yield(id) + } + } + + private func restart(id: String) async throws { + try await self.lock.withLock { context in + var state = try await self.getContainerState(id: id, context: context) + guard state.snapshot.status == .stopped else { + throw ContainerizationError(.invalidState, message: "container not stopped: '\(id)'") + } + + state.snapshot.status = .restarting + await self.setContainerState(id, state, context: context) + } + } + + private func resetBackOff(id: String) async throws { + try await self.lock.withLock { context in + var state = try await self.getContainerState(id: id, context: context) + guard state.snapshot.status == .running else { + return + } + + state.backOff = nil + await self.setContainerState(id, state, context: context) } } diff --git a/Sources/Services/ContainerSandboxService/Client/SandboxSnapshot.swift b/Sources/Services/ContainerSandboxService/Client/SandboxSnapshot.swift index 1ba312b8c..9e1841421 100644 --- a/Sources/Services/ContainerSandboxService/Client/SandboxSnapshot.swift +++ 
b/Sources/Services/ContainerSandboxService/Client/SandboxSnapshot.swift @@ -19,14 +19,14 @@ import ContainerResource /// A snapshot of a sandbox and its resources. public struct SandboxSnapshot: Codable, Sendable { /// The runtime status of the sandbox. - public var status: RuntimeStatus + public var status: SandboxStatus /// Network attachments for the sandbox. public var networks: [Attachment] /// Containers placed in the sandbox. public var containers: [ContainerSnapshot] public init( - status: RuntimeStatus, + status: SandboxStatus, networks: [Attachment], containers: [ContainerSnapshot] ) { diff --git a/Sources/Services/ContainerSandboxService/Server/SandboxService.swift b/Sources/Services/ContainerSandboxService/Server/SandboxService.swift index db8acd03c..ed3037a07 100644 --- a/Sources/Services/ContainerSandboxService/Server/SandboxService.swift +++ b/Sources/Services/ContainerSandboxService/Server/SandboxService.swift @@ -428,7 +428,7 @@ public actor SandboxService { self.log.debug("enter", metadata: ["func": "\(#function)"]) defer { self.log.debug("exit", metadata: ["func": "\(#function)"]) } - var status: RuntimeStatus = .unknown + var status: SandboxStatus = .unknown var networks: [Attachment] = [] var cs: ContainerSnapshot? @@ -444,7 +444,7 @@ public actor SandboxService { networks = ctr.attachments cs = ContainerSnapshot( configuration: ctr.config, - status: RuntimeStatus.running, + status: .running, networks: networks ) } diff --git a/Tests/CLITests/Subcommands/Run/TestCLIRunRestart.swift b/Tests/CLITests/Subcommands/Run/TestCLIRunRestart.swift new file mode 100644 index 000000000..d88a83637 --- /dev/null +++ b/Tests/CLITests/Subcommands/Run/TestCLIRunRestart.swift @@ -0,0 +1,201 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the container project authors. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import ContainerResource +import ContainerizationError +import Foundation +import Testing + +class TestCLIRunRestart: CLITest { + func getTestName() -> String { + Test.current!.name.trimmingCharacters(in: ["(", ")"]).lowercased() + } + + // Run a container that exits with the given code on first run, then sleeps forever on restart. + // This allows waitForContainerRunning to reliably catch the restarted container. + private func runWithRestartOnce(name: String, policy: RestartPolicy, exitCode: Int) throws { + try doLongRun( + name: name, + args: ["--restart", policy.rawValue], + containerArgs: ["sh", "-c", "if [ ! -f /tmp/restarted ]; then touch /tmp/restarted; exit \(exitCode); else sleep infinity; fi"], + autoRemove: false + ) + } + + @Test func testRestartNo() async throws { + let name = getTestName() + defer { try? 
doRemove(name: name, force: true) } + + try runWithRestartOnce(name: name, policy: .no, exitCode: 0) + + // Give a moment for any (unexpected) restart to occur + try await Task.sleep(for: .seconds(3)) + + let status = try getContainerStatus(name) + #expect(status == "stopped", "expected container with restart policy 'no' to remain stopped, got '\(status)'") + } + + @Test func testRestartOnFailure() async throws { + let failing = "\(getTestName())-exit-fail" + + // Non-zero exit: should restart + try runWithRestartOnce(name: failing, policy: .onFailure, exitCode: 1) + defer { try? doRemove(name: failing, force: true) } + + // Give time for container to restart + try await Task.sleep(for: .seconds(3)) + + try waitForContainerRunning(failing) + var status = try getContainerStatus(failing) + #expect(status == "running", "expected container with 'onFailure' policy to restart after non-zero exit, got '\(status)'") + + try doKill(name: failing) + try await Task.sleep(for: .seconds(3)) + + status = try getContainerStatus(failing) + #expect(status == "stopped", "expected container with 'onFailure' policy to not restart after manual stop, got '\(status)'") + try doRemove(name: failing, force: true) + + let succeeding = "\(getTestName())-exit-succeed" + + try runWithRestartOnce(name: succeeding, policy: .onFailure, exitCode: 0) + defer { try? doRemove(name: succeeding, force: true) } + + try await Task.sleep(for: .seconds(3)) + status = try getContainerStatus(succeeding) + #expect(status == "stopped", "expected container with 'onFailure' policy to not restart after zero exit, got '\(status)'") + } + + @Test func testRestartAlways() async throws { + let name = getTestName() + + try runWithRestartOnce(name: name, policy: .always, exitCode: 0) + defer { try? 
doRemove(name: name, force: true) } + + // Give time for container to restart + try await Task.sleep(for: .seconds(3)) + + try waitForContainerRunning(name) + var status = try getContainerStatus(name) + #expect(status == "running", "expected container with 'always' policy to restart after zero exit, got '\(status)'") + + try doKill(name: name) + try await Task.sleep(for: .seconds(3)) + + status = try getContainerStatus(name) + #expect(status == "stopped", "expected container with 'always' policy to not restart after manual stop, got '\(status)'") + } + + @Test func testRestartMultiple() async throws { + // Multiple containers restarting must not block each other + let name1 = "\(getTestName())1" + let name2 = "\(getTestName())2" + + try runWithRestartOnce(name: name1, policy: .always, exitCode: 1) + try runWithRestartOnce(name: name2, policy: .always, exitCode: 1) + defer { + try? doStop(name: name1) + try? doStop(name: name2) + try? doRemove(name: name1, force: true) + try? doRemove(name: name2, force: true) + } + + // Both should restart independently without blocking each other + try waitForContainerRunning(name1) + try waitForContainerRunning(name2) + + let status1 = try getContainerStatus(name1) + let status2 = try getContainerStatus(name2) + #expect(status1 == "running", "expected container1 to be running, got '\(status1)'") + #expect(status2 == "running", "expected container2 to be running, got '\(status2)'") + } + + @Test func testBackOff() async throws { + let name = getTestName() + + try doLongRun( + name: name, + args: ["--restart", RestartPolicy.always.rawValue], + containerArgs: ["sh", "-c", "sleep 1; exit 1;"], + autoRemove: false + ) + defer { + try? doStop(name: name) + try? doRemove(name: name, force: true) + } + + // Poll until running (first restart) + var samples: [Bool] = [] + for _ in 0..<30 { + if (try? 
getContainerStatus(name)) == "running" { + samples.append(true) + } else { + samples.append(false) + } + try await Task.sleep(for: .milliseconds(500)) + } + #expect(samples.contains(true), "container did not restart for the first time") + + // If backOff is doubling correctly, the gap between restarts grows each cycle. + // With 500ms polling and 1s sleep-before-exit per cycle, the backOff doubling + // (100ms -> 200ms -> 400ms -> 800ms -> ...) means consecutive false runs grow longer. + // The maximum run of consecutive false samples must be at least 4. + var maxConsecutiveFalse = 0 + var currentRun = 0 + for sample in samples { + if !sample { + currentRun += 1 + maxConsecutiveFalse = max(maxConsecutiveFalse, currentRun) + } else { + currentRun = 0 + } + } + #expect(maxConsecutiveFalse >= 4, "expected backOff to cause at least 4 consecutive stopped samples, got \(maxConsecutiveFalse)") + } + + @Test func testStabilityCall() async throws { + let name = getTestName() + + // Each run increments a counter by appending a line to /tmp/count. + // On run 8, backOff has accumulated to 10s (capped); the container then sleeps 12s + // to stay alive past the stabilityCall window, which resets backOff to nil. + // + // BackOff sequence: 100ms, 200ms, 400ms, 800ms, 1.6s, 3.2s, 6.4s, 10s (cap) + // Total expected wait: (1+0.1)+(1+0.2)+(1+0.4)+(1+0.8)+(1+1.6)+(1+3.2)+(1+6.4)+(1+10)+12+0.5 ≈ 43.2s + try doLongRun( + name: name, + args: ["--restart", RestartPolicy.always.rawValue], + containerArgs: [ + "sh", "-c", + "echo x >> /tmp/count; n=$(wc -l < /tmp/count); if [ \"$n\" -ge 8 ]; then sleep 12; fi; exit 1", + ], + autoRemove: false + ) + defer { + try? doStop(name: name) + try? doRemove(name: name, force: true) + } + + // Wait for all 8 cycles + stability window + a small buffer. 
+ // (1+0.1)+(1+0.2)+(1+0.4)+(1+0.8)+(1+1.6)+(1+3.2)+(1+6.4)+(1+10)+12+0.5 ≈ 43.2s + try await Task.sleep(for: .seconds(45)) + + // At this point run 8 has slept 12s, triggering stabilityCall and resetting backOff. + // The container should be running (restarted quickly after backOff reset). + let status = try getContainerStatus(name) + #expect(status == "running", "expected container to be running after stabilityCall reset backOff, got '\(status)'") + } +} diff --git a/Tests/CLITests/Utilities/CLITest.swift b/Tests/CLITests/Utilities/CLITest.swift index 91435003a..a34a063d7 100644 --- a/Tests/CLITests/Utilities/CLITest.swift +++ b/Tests/CLITests/Utilities/CLITest.swift @@ -312,6 +312,18 @@ class CLITest { } } + func doKill(name: String, signal: String = "SIGKILL") throws { + let (_, _, error, status) = try run(arguments: [ + "kill", + "-s", + signal, + name, + ]) + if status != 0 { + throw CLIError.executionFailed("command failed: \(error)") + } + } + func doCreate( name: String, image: String? = nil,