From d54d09c6d8ca0b2409b5ea76f3ccca95174e5feb Mon Sep 17 00:00:00 2001
From: Lili Deng <lildeng@microsoft.com>
Date: Mon, 27 Apr 2026 09:46:44 +0800
Subject: [PATCH 1/2] azure: surface per-resource errors on truncated
 DeploymentFailed

---
 lisa/sut_orchestrator/azure/platform_.py | 58 ++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/lisa/sut_orchestrator/azure/platform_.py b/lisa/sut_orchestrator/azure/platform_.py
index 03030e27b0..a1dca37807 100644
--- a/lisa/sut_orchestrator/azure/platform_.py
+++ b/lisa/sut_orchestrator/azure/platform_.py
@@ -1870,6 +1870,30 @@ def _deploy(
             assert e.error, f"HttpResponseError: {e}"
 
             error_message = "\n".join(self._parse_detail_errors(e.error))
+            # When Azure returns a generic deployment failure message without
+            # actionable per-resource details (e.g. truncated aggregated error,
+            # ResourceDeploymentFailure with only a tracking id, or any other
+            # *DeploymentFailed* code with no nested details), fall back to
+            # listing deployment operations to surface the real sub-resource
+            # errors.
+            top_code = getattr(e.error, "code", "") or ""
+            if (
+                "aggregated deployment error is too large" in error_message
+                or "ResourceDeploymentFailure" in top_code
+                or (
+                    "DeploymentFailed" in top_code
+                    and not getattr(e.error, "details", None)
+                )
+            ):
+                op_errors = self._collect_deployment_operation_errors(
+                    resource_group_name, log
+                )
+                if op_errors:
+                    log.error(
+                        "deployment failed sub-resource errors:\n"
+                        + "\n".join(op_errors)
+                    )
+                    error_message = error_message + "\n" + "\n".join(op_errors)
             if (
                 self._azure_runbook.ignore_provisioning_error
                 and "OSProvisioningTimedOut: OS Provisioning for VM" in error_message
@@ -1931,6 +1955,40 @@ def _parse_detail_errors(self, error: Any) -> List[str]:
                 errors = [f"{error.code}: {error.message}"]
         return errors
 
+    def _collect_deployment_operation_errors(
+        self, resource_group_name: str, log: Logger
+    ) -> List[str]:
+        """Fetch per-resource errors from a failed ARM deployment.
+
+        Used when the top-level HttpResponseError only carries the aggregated
+        "deployment error is too large" message and no nested details.
+        """
+        errors: List[str] = []
+        try:
+            operations = self._rm_client.deployment_operations.list(
+                resource_group_name=resource_group_name,
+                deployment_name=AZURE_DEPLOYMENT_NAME,
+            )
+            for op in operations:
+                props = getattr(op, "properties", None)
+                if not props or props.provisioning_state != "Failed":
+                    continue
+                target = getattr(props, "target_resource", None)
+                resource_type = getattr(target, "resource_type", "") if target else ""
+                resource_name = getattr(target, "resource_name", "") if target else ""
+                status = getattr(props, "status_message", None)
+                inner = getattr(status, "error", None) if status else None
+                if inner is not None:
+                    errors.extend(
+                        f"{resource_type}/{resource_name}: {msg}"
+                        for msg in self._parse_detail_errors(inner)
+                    )
+                else:
+                    errors.append(f"{resource_type}/{resource_name}: {status}")
+        except Exception as ex:
+            log.debug(f"failed to list deployment operations: {ex}")
+        return errors
+
     # the VM may not be queried after deployed. use retry to mitigate it.
     @retry(exceptions=LisaException, tries=150, delay=2)  # type: ignore
     def _load_vms(

From 48a3d30bd46a0870bf395b4d32fcf333d06e50b6 Mon Sep 17 00:00:00 2001
From: Lili Deng <lildeng@microsoft.com>
Date: Mon, 27 Apr 2026 10:55:54 +0800
Subject: [PATCH 2/2] azure: narrow exception handling in deployment operations
 helper

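Catch the exceptions this helper can reasonably hit: AzureError (the
base class of HttpResponseError and ResourceNotFoundError) plus
ValueError, TypeError and AttributeError raised by unexpected response
shapes. Log them at debug with exc_info=True so the traceback is
preserved instead of being silently swallowed. The previous blanket
`except Exception` is removed, so any other exception now propagates
to the caller instead of being hidden.

Also query deployment operations only when the top-level error carries
no nested details (the truncated "aggregated deployment error is too
large" case is always queried), take global_credential_access_lock
around the ARM call, and compare provisioning_state case-insensitively.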

Addresses review comment on PR #4438.
---
 lisa/sut_orchestrator/azure/platform_.py | 66 ++++++++++++++++++------
 1 file changed, 50 insertions(+), 16 deletions(-)

diff --git a/lisa/sut_orchestrator/azure/platform_.py b/lisa/sut_orchestrator/azure/platform_.py
index a1dca37807..4fedfbce38 100644
--- a/lisa/sut_orchestrator/azure/platform_.py
+++ b/lisa/sut_orchestrator/azure/platform_.py
@@ -32,7 +32,7 @@
 )
 
 import requests
-from azure.core.exceptions import HttpResponseError, ResourceNotFoundError
+from azure.core.exceptions import AzureError, HttpResponseError, ResourceNotFoundError
 from azure.identity import DefaultAzureCredential
 from azure.mgmt.compute.models import (
     CommunityGalleryImage,
@@ -1877,12 +1877,12 @@ def _deploy(
             # listing deployment operations to surface the real sub-resource
             # errors.
             top_code = getattr(e.error, "code", "") or ""
-            if (
-                "aggregated deployment error is too large" in error_message
-                or "ResourceDeploymentFailure" in top_code
-                or (
-                    "DeploymentFailed" in top_code
-                    and not getattr(e.error, "details", None)
+            has_details = bool(getattr(e.error, "details", None))
+            if "aggregated deployment error is too large" in error_message or (
+                not has_details
+                and (
+                    "ResourceDeploymentFailure" in top_code
+                    or "DeploymentFailed" in top_code
                 )
             ):
                 op_errors = self._collect_deployment_operation_errors(
@@ -1960,18 +1960,43 @@ def _collect_deployment_operation_errors(
     ) -> List[str]:
         """Fetch per-resource errors from a failed ARM deployment.
 
-        Used when the top-level HttpResponseError only carries the aggregated
-        "deployment error is too large" message and no nested details.
+        Used as a fallback when the top-level HttpResponseError does not
+        already carry actionable per-resource details. Callers in ``_deploy``
+        invoke this helper for any of the following cases:
+
+        * The aggregated "deployment error is too large" message, where ARM
+          truncates the nested error tree.
+        * ``ResourceDeploymentFailure`` errors with no nested
+          ``error.details`` (e.g. transient internal server errors that
+          carry only a tracking id).
+        * ``DeploymentFailed`` errors that arrive without any nested
+          ``details`` populated.
+
+        When the top-level error already carries actionable nested
+        ``details``, callers skip this helper to avoid an extra ARM call
+        and duplicated/noisy output.
+
+        In each of the fallback cases listed above, listing the deployment
+        operations is the only way to surface the per-resource failures.
         """
         errors: List[str] = []
         try:
-            operations = self._rm_client.deployment_operations.list(
-                resource_group_name=resource_group_name,
-                deployment_name=AZURE_DEPLOYMENT_NAME,
-            )
+            # Azure SDK calls share auth state via files on disk; serialize
+            # access to avoid intermittent failures during parallel runs.
+            # See common.py global_credential_access_lock for context.
+            with global_credential_access_lock:
+                operations = list(
+                    self._rm_client.deployment_operations.list(
+                        resource_group_name=resource_group_name,
+                        deployment_name=AZURE_DEPLOYMENT_NAME,
+                    )
+                )
             for op in operations:
                 props = getattr(op, "properties", None)
-                if not props or props.provisioning_state != "Failed":
+                if not props:
+                    continue
+                provisioning_state = getattr(props, "provisioning_state", None) or ""
+                if provisioning_state.lower() != "failed":
                     continue
                 target = getattr(props, "target_resource", None)
                 resource_type = getattr(target, "resource_type", "") if target else ""
@@ -1985,8 +2010,17 @@ def _collect_deployment_operation_errors(
                     )
                 else:
                     errors.append(f"{resource_type}/{resource_name}: {status}")
-        except Exception as ex:
-            log.debug(f"failed to list deployment operations: {ex}")
+        except (AzureError, ValueError, TypeError, AttributeError) as ex:
+            # Best effort: failures expected here must not mask the original
+            # deployment error. That means Azure SDK errors (AzureError covers
+            # HttpResponseError / ResourceNotFoundError) and response-shape
+            # mismatches (ValueError, TypeError, AttributeError). Anything
+            # else is deliberately left to propagate so it stays visible.
+            # Log with traceback so failures here are still debuggable.
+            log.debug(
+                f"failed to collect deployment operation errors: {ex}",
+                exc_info=True,
+            )
         return errors
 
     # the VM may not be queried after deployed. use retry to mitigate it.