diff --git a/build/RunTestsOnHelix.cmd b/build/RunTestsOnHelix.cmd index 569607f04333..9e29fa829a20 100644 --- a/build/RunTestsOnHelix.cmd +++ b/build/RunTestsOnHelix.cmd @@ -9,6 +9,10 @@ set DOTNET_ROOT=%HELIX_CORRELATION_PAYLOAD%\d set PATH=%DOTNET_ROOT%;%PATH% set TestFullMSBuild=%1 +REM Set DOTNET_HOST_PATH so MSBuild task hosts can locate the dotnet executable. +REM Without this, tasks from NuGet packages that use TaskHostFactory fail with MSB4216. +set DOTNET_HOST_PATH=%DOTNET_ROOT%\dotnet.exe + REM Ensure Visual Studio instances allow preview SDKs PowerShell -ExecutionPolicy ByPass -NoProfile -File "%HELIX_CORRELATION_PAYLOAD%\t\eng\enable-preview-sdks.ps1" @@ -35,14 +39,16 @@ dotnet new --debug:ephemeral-hive dotnet nuget list source --configfile %TestExecutionDirectory%\nuget.config if exist %TestExecutionDirectory%\Testpackages dotnet nuget add source %TestExecutionDirectory%\Testpackages --name testpackages --configfile %TestExecutionDirectory%\nuget.config -dotnet nuget remove source dotnet6-transport --configfile %TestExecutionDirectory%\nuget.config -dotnet nuget remove source dotnet6-internal-transport --configfile %TestExecutionDirectory%\nuget.config -dotnet nuget remove source dotnet7-transport --configfile %TestExecutionDirectory%\nuget.config -dotnet nuget remove source dotnet7-internal-transport --configfile %TestExecutionDirectory%\nuget.config -dotnet nuget remove source richnav --configfile %TestExecutionDirectory%\nuget.config -dotnet nuget remove source vs-impl --configfile %TestExecutionDirectory%\nuget.config -dotnet nuget remove source dotnet-libraries-transport --configfile %TestExecutionDirectory%\nuget.config -dotnet nuget remove source dotnet-tools-transport --configfile %TestExecutionDirectory%\nuget.config -dotnet nuget remove source dotnet-libraries --configfile %TestExecutionDirectory%\nuget.config -dotnet nuget remove source dotnet-eng --configfile %TestExecutionDirectory%\nuget.config +REM Remove feeds not needed for tests. Errors from non-existent sources +REM (e.g. internal-transport feeds only present in internal builds) are ignored. +dotnet nuget remove source dotnet6-transport --configfile %TestExecutionDirectory%\nuget.config 2>nul +dotnet nuget remove source dotnet6-internal-transport --configfile %TestExecutionDirectory%\nuget.config 2>nul +dotnet nuget remove source dotnet7-transport --configfile %TestExecutionDirectory%\nuget.config 2>nul +dotnet nuget remove source dotnet7-internal-transport --configfile %TestExecutionDirectory%\nuget.config 2>nul +dotnet nuget remove source richnav --configfile %TestExecutionDirectory%\nuget.config 2>nul +dotnet nuget remove source vs-impl --configfile %TestExecutionDirectory%\nuget.config 2>nul +dotnet nuget remove source dotnet-libraries-transport --configfile %TestExecutionDirectory%\nuget.config 2>nul +dotnet nuget remove source dotnet-tools-transport --configfile %TestExecutionDirectory%\nuget.config 2>nul +dotnet nuget remove source dotnet-libraries --configfile %TestExecutionDirectory%\nuget.config 2>nul +dotnet nuget remove source dotnet-eng --configfile %TestExecutionDirectory%\nuget.config 2>nul dotnet nuget list source --configfile %TestExecutionDirectory%\nuget.config diff --git a/build/RunTestsOnHelix.sh b/build/RunTestsOnHelix.sh index 887748f918b4..973eab473c0e 100644 --- a/build/RunTestsOnHelix.sh +++ b/build/RunTestsOnHelix.sh @@ -9,6 +9,12 @@ export MicrosoftNETBuildExtensionsTargets=$HELIX_CORRELATION_PAYLOAD/ex/msbuildE export DOTNET_ROOT=$HELIX_CORRELATION_PAYLOAD/d export PATH=$DOTNET_ROOT:$PATH +# Set DOTNET_HOST_PATH so MSBuild task hosts can locate the dotnet executable. +# Without this, tasks from NuGet packages that use TaskHostFactory (e.g. ComputeWasmBuildAssets +# from WebAssembly SDK, ComputeManagedAssemblies from ILLink) fail with MSB4216 on macOS +# because the task host process cannot find the dotnet host to launch. +export DOTNET_HOST_PATH=$DOTNET_ROOT/dotnet + export TestExecutionDirectory=$(realpath "$(mktemp -d "${TMPDIR:-/tmp}"/dotnetSdkTests.XXXXXXXX)") export DOTNET_CLI_HOME=$TestExecutionDirectory/.dotnet cp -a $HELIX_CORRELATION_PAYLOAD/t/TestExecutionDirectoryFiles/. $TestExecutionDirectory/ @@ -22,15 +28,17 @@ dotnet new --debug:ephemeral-hive dotnet nuget list source --configfile $TestExecutionDirectory/NuGet.config dotnet nuget add source $TestExecutionDirectory/Testpackages --configfile $TestExecutionDirectory/NuGet.config -#Remove feeds not needed for tests -dotnet nuget remove source dotnet6-transport --configfile $TestExecutionDirectory/NuGet.config -dotnet nuget remove source dotnet6-internal-transport --configfile $TestExecutionDirectory/NuGet.config -dotnet nuget remove source dotnet7-transport --configfile $TestExecutionDirectory/NuGet.config -dotnet nuget remove source dotnet7-internal-transport --configfile $TestExecutionDirectory/NuGet.config -dotnet nuget remove source richnav --configfile $TestExecutionDirectory/NuGet.config -dotnet nuget remove source vs-impl --configfile $TestExecutionDirectory/NuGet.config -dotnet nuget remove source dotnet-libraries-transport --configfile $TestExecutionDirectory/NuGet.config -dotnet nuget remove source dotnet-tools-transport --configfile $TestExecutionDirectory/NuGet.config -dotnet nuget remove source dotnet-libraries --configfile $TestExecutionDirectory/NuGet.config -dotnet nuget remove source dotnet-eng --configfile $TestExecutionDirectory/NuGet.config +# Remove feeds not needed for tests. Use || true to avoid errors when a source +# doesn't exist (e.g. internal-transport feeds are only present in internal builds). +dotnet nuget remove source dotnet6-transport --configfile $TestExecutionDirectory/NuGet.config || true +dotnet nuget remove source dotnet6-internal-transport --configfile $TestExecutionDirectory/NuGet.config || true +dotnet nuget remove source dotnet7-transport --configfile $TestExecutionDirectory/NuGet.config || true +dotnet nuget remove source dotnet7-internal-transport --configfile $TestExecutionDirectory/NuGet.config || true +dotnet nuget remove source richnav --configfile $TestExecutionDirectory/NuGet.config || true +dotnet nuget remove source vs-impl --configfile $TestExecutionDirectory/NuGet.config || true +dotnet nuget remove source dotnet-libraries-transport --configfile $TestExecutionDirectory/NuGet.config || true +dotnet nuget remove source dotnet-tools-transport --configfile $TestExecutionDirectory/NuGet.config || true +dotnet nuget remove source dotnet-libraries --configfile $TestExecutionDirectory/NuGet.config || true +dotnet nuget remove source dotnet-eng --configfile $TestExecutionDirectory/NuGet.config || true dotnet nuget list source --configfile $TestExecutionDirectory/NuGet.config + diff --git a/src/BlazorWasmSdk/Tasks/GZipCompress.cs b/src/BlazorWasmSdk/Tasks/GZipCompress.cs index 96481d04a91b..b5edfd894c14 100644 --- a/src/BlazorWasmSdk/Tasks/GZipCompress.cs +++ b/src/BlazorWasmSdk/Tasks/GZipCompress.cs @@ -20,6 +20,10 @@ public class GZipCompress : Task [Required] public string OutputDirectory { get; set; } + // Retry count for transient file I/O errors (e.g., antivirus locks on CI machines). + private const int MaxRetries = 3; + private const int RetryDelayMs = 200; + public override bool Execute() { CompressedFiles = new ITaskItem[FilesToCompress.Length]; @@ -56,18 +60,31 @@ public override bool Execute() Log.LogMessage(MessageImportance.Low, "Compressing '{0}' because file is newer than '{1}'.", inputFullPath, outputRelativePath); } - try + // Retry on IOException to handle transient file locks from antivirus, file + // indexing, or parallel MSBuild nodes on CI machines (see dotnet/sdk#53424). + for (int attempt = 1; attempt <= MaxRetries; attempt++) { - using var sourceStream = File.OpenRead(file.ItemSpec); - using var fileStream = File.Create(outputRelativePath); - using var stream = new GZipStream(fileStream, CompressionLevel.Optimal); + try + { + using var sourceStream = File.OpenRead(file.ItemSpec); + using var fileStream = File.Create(outputRelativePath); + using var stream = new GZipStream(fileStream, CompressionLevel.Optimal); - sourceStream.CopyTo(stream); - } - catch (Exception e) - { - Log.LogErrorFromException(e); - return; + sourceStream.CopyTo(stream); + return; // Success + } + catch (IOException) when (attempt < MaxRetries) + { + Log.LogMessage(MessageImportance.Low, + "Retrying compression of '{0}' (attempt {1}/{2}) due to transient I/O error.", + file.ItemSpec, attempt, MaxRetries); + Thread.Sleep(RetryDelayMs * attempt); + } + catch (Exception e) + { + Log.LogErrorFromException(e); + return; + } } }); diff --git a/src/Dotnet.Watch/Watch/Aspire/AspireServiceFactory.cs b/src/Dotnet.Watch/Watch/Aspire/AspireServiceFactory.cs index 10d46c189096..52efde284c2f 100644 --- a/src/Dotnet.Watch/Watch/Aspire/AspireServiceFactory.cs +++ b/src/Dotnet.Watch/Watch/Aspire/AspireServiceFactory.cs @@ -76,6 +76,16 @@ public async ValueTask DisposeAsync() _isDisposed = true; // wait for all in-flight process initialization to complete: + // If no session initialization is in-flight (_pendingSessionInitializationCount == 0), + // the semaphore will never be released by StartProjectAsync's finally block. + // Release it here to prevent a deadlock. Protect against the race where + // StartProjectAsync's finally block releases concurrently. + if (Volatile.Read(ref _pendingSessionInitializationCount) == 0) + { + try { _postDisposalSessionInitializationCompleted.Release(); } + catch (SemaphoreFullException) { } + } + await _postDisposalSessionInitializationCompleted.WaitAsync(CancellationToken.None); // terminate all active sessions: @@ -174,7 +184,10 @@ public async ValueTask StartProjectAsync(string dcpId, string sessionId, Project { if (Interlocked.Decrement(ref _pendingSessionInitializationCount) == 0 && _isDisposed) { - _postDisposalSessionInitializationCompleted.Release(); + // Guard against double-release: DisposeAsync may have already released + // the semaphore if it observed count==0 before we decremented. + try { _postDisposalSessionInitializationCompleted.Release(); } + catch (SemaphoreFullException) { } } } diff --git a/test/Microsoft.DotNet.HotReload.Test.Utilities/AwaitableProcess.cs b/test/Microsoft.DotNet.HotReload.Test.Utilities/AwaitableProcess.cs index 9f38f3af4545..96137a1bce6d 100644 --- a/test/Microsoft.DotNet.HotReload.Test.Utilities/AwaitableProcess.cs +++ b/test/Microsoft.DotNet.HotReload.Test.Utilities/AwaitableProcess.cs @@ -10,9 +10,19 @@ namespace Microsoft.DotNet.Watch.UnitTests { internal sealed class AwaitableProcess : IAsyncDisposable { - // cancel just before we hit timeout used on CI (XUnitWorkItemTimeout value in sdk\test\UnitTests.proj) + // Maximum time to wait for a single line of output from the process. + // On CI (Helix), cap at 5 minutes. The HELIX_WORK_ITEM_TIMEOUT is the total budget + // for ALL tests in the work item (~2h), which is far too long for a single + // wait-for-output operation. If a process produces no output for 5 minutes, + // it's deadlocked (e.g., dotnet-watch shutdown race in AspireServiceFactory). + // Capping here turns a 2-hour partition-blocking hang into a 5-minute clean failure. + private static readonly TimeSpan s_maxPerOperationTimeout = TimeSpan.FromMinutes(5); + private static readonly TimeSpan s_timeout = Environment.GetEnvironmentVariable("HELIX_WORK_ITEM_TIMEOUT") is { } value - ? TimeSpan.Parse(value).Subtract(TimeSpan.FromSeconds(10)) : TimeSpan.FromMinutes(10); + ? Min(TimeSpan.Parse(value).Subtract(TimeSpan.FromSeconds(10)), s_maxPerOperationTimeout) + : TimeSpan.FromMinutes(10); + + private static TimeSpan Min(TimeSpan a, TimeSpan b) => a < b ? a : b; private readonly List _lines = []; @@ -226,6 +236,17 @@ public async ValueTask DisposeAsync() { } + // Close stdin before killing. This unblocks PhysicalConsole.ListenToStandardInputAsync() + // which reads from stdin with CancellationToken.None and no timeout. + // Without this, the stdin reader can keep the process alive after Kill() on some platforms. + try + { + Process.StandardInput.Close(); + } + catch + { + } + try { Process.Kill(entireProcessTree: true); @@ -234,8 +255,17 @@ public async ValueTask DisposeAsync() { } - // ensure process has exited - await _processExitAwaiter; + // Wait for process exit with a timeout to prevent hanging the test if Kill() fails. + // The WaitForProcessExitAsync loop checks HasExited every 1 second, so 30s is generous. + using var exitTimeout = new CancellationTokenSource(TimeSpan.FromSeconds(30)); + try + { + await _processExitAwaiter.WaitAsync(exitTimeout.Token); + } + catch (OperationCanceledException) + { + Logger.Log($"Process {Id} did not exit within 30 seconds after Kill()"); + } Process.Dispose(); diff --git a/test/Microsoft.DotNet.HotReload.Test.Utilities/WatchableApp.cs b/test/Microsoft.DotNet.HotReload.Test.Utilities/WatchableApp.cs index c2fe8b02ae11..e6adaf5c7b4b 100644 --- a/test/Microsoft.DotNet.HotReload.Test.Utilities/WatchableApp.cs +++ b/test/Microsoft.DotNet.HotReload.Test.Utilities/WatchableApp.cs @@ -204,12 +204,16 @@ public ProcessStartInfo GetProcessStartInfo(string workingDirectory, string test info.Environment.Add("Microsoft_CodeAnalysis_EditAndContinue_LogDir", testOutputPath); info.Environment.Add("DOTNET_CLI_CONTEXT_VERBOSE", "trace"); - // suppress all timeouts: - info.Environment.Add("DCP_IDE_REQUEST_TIMEOUT_SECONDS", "100000"); - info.Environment.Add("DCP_IDE_NOTIFICATION_TIMEOUT_SECONDS", "100000"); - info.Environment.Add("DCP_IDE_NOTIFICATION_KEEPALIVE_SECONDS", "100000"); + // Use generous but bounded timeouts for DCP operations in CI. + // Previous values of 100,000 seconds (~27 hours) effectively disabled timeouts, + // causing tests to hang for the full Helix work item duration (~2 hours) when + // a DCP operation deadlocked. 300 seconds (5 minutes) per operation is generous + // for slow CI machines while ensuring natural failure recovery. + info.Environment.Add("DCP_IDE_REQUEST_TIMEOUT_SECONDS", "300"); + info.Environment.Add("DCP_IDE_NOTIFICATION_TIMEOUT_SECONDS", "300"); + info.Environment.Add("DCP_IDE_NOTIFICATION_KEEPALIVE_SECONDS", "300"); info.Environment.Add("ASPIRE_ALLOW_UNSECURED_TRANSPORT", "1"); - info.Environment.Add("ASPIRE_WATCH_PIPE_CONNECTION_TIMEOUT_SECONDS", "100000"); + info.Environment.Add("ASPIRE_WATCH_PIPE_CONNECTION_TIMEOUT_SECONDS", "300"); // override defaults: foreach (var (name, value) in EnvironmentVariables) diff --git a/test/Microsoft.NET.Sdk.Razor.Tool.Tests/DefaultRequestDispatcherTest.cs b/test/Microsoft.NET.Sdk.Razor.Tool.Tests/DefaultRequestDispatcherTest.cs index 8f3289b9110b..ef2a49f63577 100644 --- a/test/Microsoft.NET.Sdk.Razor.Tool.Tests/DefaultRequestDispatcherTest.cs +++ b/test/Microsoft.NET.Sdk.Razor.Tool.Tests/DefaultRequestDispatcherTest.cs @@ -358,7 +358,7 @@ public async Task Dispatcher_ProcessSimultaneousConnections_HitsKeepAliveTimeout return connectionTask; } - readySource.SetResult(true); + readySource.TrySetResult(true); return new TaskCompletionSource().Task; }); @@ -382,11 +382,18 @@ public async Task Dispatcher_ProcessSimultaneousConnections_HitsKeepAliveTimeout } }; var keepAlive = TimeSpan.FromSeconds(1); - var dispatcherTask = Task.Run(() => + + // Use Task.Factory.StartNew with LongRunning to run the dispatcher on a dedicated + // OS thread instead of a thread pool thread. The dispatcher's Run() method uses + // blocking Task.WaitAny() which permanently blocks its thread. On Helix CI agents + // running many tests in parallel, blocking a thread pool thread contributes to pool + // starvation, which prevents Task.Delay timer callbacks from firing, causing the + // keep-alive timeout to never complete and the test to hang indefinitely. + var dispatcherTask = Task.Factory.StartNew(() => { var dispatcher = new DefaultRequestDispatcher(connectionHost.Object, compilerHost, CancellationToken.None, eventBus, keepAlive); dispatcher.Run(); - }); + }, CancellationToken.None, TaskCreationOptions.LongRunning, TaskScheduler.Default); // Wait for all connections to be created. await readySource.Task; @@ -402,7 +409,10 @@ public async Task Dispatcher_ProcessSimultaneousConnections_HitsKeepAliveTimeout // Act // Now dispatcher should be in an idle state with no active connections. - await dispatcherTask; + // Use WaitAsync as a safety net: if the keep-alive timeout still can't fire + // (e.g. extreme thread pool starvation), fail the test after 60s instead of + // hanging for 60+ minutes and blocking the entire CI job. + await dispatcherTask.WaitAsync(TimeSpan.FromSeconds(60)); // Assert Assert.False(eventBus.HasDetectedBadConnection); diff --git a/test/TestAssets/Directory.Build.targets b/test/TestAssets/Directory.Build.targets index cecd12d3d0c8..2c8eb1f15b8f 100644 --- a/test/TestAssets/Directory.Build.targets +++ b/test/TestAssets/Directory.Build.targets @@ -1,4 +1,21 @@ + + + + + + +