From d54d09c6d8ca0b2409b5ea76f3ccca95174e5feb Mon Sep 17 00:00:00 2001 From: Lili Deng Date: Mon, 27 Apr 2026 09:46:44 +0800 Subject: [PATCH 1/2] azure: surface per-resource errors on truncated DeploymentFailed --- lisa/sut_orchestrator/azure/platform_.py | 58 ++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/lisa/sut_orchestrator/azure/platform_.py b/lisa/sut_orchestrator/azure/platform_.py index 03030e27b0..a1dca37807 100644 --- a/lisa/sut_orchestrator/azure/platform_.py +++ b/lisa/sut_orchestrator/azure/platform_.py @@ -1870,6 +1870,30 @@ def _deploy( assert e.error, f"HttpResponseError: {e}" error_message = "\n".join(self._parse_detail_errors(e.error)) + # When Azure returns a generic deployment failure message without + # actionable per-resource details (e.g. truncated aggregated error, + # ResourceDeploymentFailure with only a tracking id, or any other + # *DeploymentFailed* code with no nested details), fall back to + # listing deployment operations to surface the real sub-resource + # errors. + top_code = getattr(e.error, "code", "") or "" + if ( + "aggregated deployment error is too large" in error_message + or "ResourceDeploymentFailure" in top_code + or ( + "DeploymentFailed" in top_code + and not getattr(e.error, "details", None) + ) + ): + op_errors = self._collect_deployment_operation_errors( + resource_group_name, log + ) + if op_errors: + log.error( + "deployment failed sub-resource errors:\n" + + "\n".join(op_errors) + ) + error_message = error_message + "\n" + "\n".join(op_errors) if ( self._azure_runbook.ignore_provisioning_error and "OSProvisioningTimedOut: OS Provisioning for VM" in error_message @@ -1931,6 +1955,40 @@ def _parse_detail_errors(self, error: Any) -> List[str]: errors = [f"{error.code}: {error.message}"] return errors + def _collect_deployment_operation_errors( + self, resource_group_name: str, log: Logger + ) -> List[str]: + """Fetch per-resource errors from a failed ARM deployment. 
+ + Used when the top-level HttpResponseError only carries the aggregated + "deployment error is too large" message and no nested details. + """ + errors: List[str] = [] + try: + operations = self._rm_client.deployment_operations.list( + resource_group_name=resource_group_name, + deployment_name=AZURE_DEPLOYMENT_NAME, + ) + for op in operations: + props = getattr(op, "properties", None) + if not props or props.provisioning_state != "Failed": + continue + target = getattr(props, "target_resource", None) + resource_type = getattr(target, "resource_type", "") if target else "" + resource_name = getattr(target, "resource_name", "") if target else "" + status = getattr(props, "status_message", None) + inner = getattr(status, "error", None) if status else None + if inner is not None: + errors.extend( + f"{resource_type}/{resource_name}: {msg}" + for msg in self._parse_detail_errors(inner) + ) + else: + errors.append(f"{resource_type}/{resource_name}: {status}") + except Exception as ex: + log.debug(f"failed to list deployment operations: {ex}") + return errors + # the VM may not be queried after deployed. use retry to mitigate it. @retry(exceptions=LisaException, tries=150, delay=2) # type: ignore def _load_vms( From 48a3d30bd46a0870bf395b4d32fcf333d06e50b6 Mon Sep 17 00:00:00 2001 From: Lili Deng Date: Mon, 27 Apr 2026 10:55:54 +0800 Subject: [PATCH 2/2] azure: narrow exception handling in deployment operations helper Replace the blanket "except Exception" in the deployment operations helper with an explicit set: AzureError (which covers HttpResponseError and ResourceNotFoundError) plus ValueError, TypeError and AttributeError for unexpected response shapes. These are logged at debug with exc_info=True so the traceback is preserved instead of being silently swallowed. There is no broad fallback any more: exceptions outside this set now propagate to the caller so programming errors remain visible. Also in this change: only fall back to listing deployment operations for ResourceDeploymentFailure/DeploymentFailed when the top-level error carries no nested details (the "aggregated deployment error is too large" case always falls back), hold global_credential_access_lock while listing the operations, and compare provisioning_state case-insensitively. Addresses review comment on PR #4438. 
--- lisa/sut_orchestrator/azure/platform_.py | 66 ++++++++++++++++++------ 1 file changed, 50 insertions(+), 16 deletions(-) diff --git a/lisa/sut_orchestrator/azure/platform_.py b/lisa/sut_orchestrator/azure/platform_.py index a1dca37807..4fedfbce38 100644 --- a/lisa/sut_orchestrator/azure/platform_.py +++ b/lisa/sut_orchestrator/azure/platform_.py @@ -32,7 +32,7 @@ ) import requests -from azure.core.exceptions import HttpResponseError, ResourceNotFoundError +from azure.core.exceptions import AzureError, HttpResponseError, ResourceNotFoundError from azure.identity import DefaultAzureCredential from azure.mgmt.compute.models import ( CommunityGalleryImage, @@ -1877,12 +1877,12 @@ def _deploy( # listing deployment operations to surface the real sub-resource # errors. top_code = getattr(e.error, "code", "") or "" - if ( - "aggregated deployment error is too large" in error_message - or "ResourceDeploymentFailure" in top_code - or ( - "DeploymentFailed" in top_code - and not getattr(e.error, "details", None) + has_details = bool(getattr(e.error, "details", None)) + if "aggregated deployment error is too large" in error_message or ( + not has_details + and ( + "ResourceDeploymentFailure" in top_code + or "DeploymentFailed" in top_code ) ): op_errors = self._collect_deployment_operation_errors( @@ -1960,18 +1960,43 @@ def _collect_deployment_operation_errors( ) -> List[str]: """Fetch per-resource errors from a failed ARM deployment. - Used when the top-level HttpResponseError only carries the aggregated - "deployment error is too large" message and no nested details. + Used as a fallback when the top-level HttpResponseError does not + already carry actionable per-resource details. Callers in ``_deploy`` + invoke this helper for any of the following cases: + + * The aggregated "deployment error is too large" message, where ARM + truncates the nested error tree. + * ``ResourceDeploymentFailure`` errors with no nested + ``error.details`` (e.g. 
transient internal server errors that + carry only a tracking id). + * ``DeploymentFailed`` errors that arrive without any nested + ``details`` populated. + + When the top-level error already carries actionable nested + ``details``, callers skip this helper to avoid an extra ARM call + and duplicated/noisy output. + + In all of these cases, listing the deployment operations is the + only way to surface the underlying per-resource failure messages. """ errors: List[str] = [] try: - operations = self._rm_client.deployment_operations.list( - resource_group_name=resource_group_name, - deployment_name=AZURE_DEPLOYMENT_NAME, - ) + # Azure SDK calls share auth state via files on disk; serialize + # access to avoid intermittent failures during parallel runs. + # See common.py global_credential_access_lock for context. + with global_credential_access_lock: + operations = list( + self._rm_client.deployment_operations.list( + resource_group_name=resource_group_name, + deployment_name=AZURE_DEPLOYMENT_NAME, + ) + ) for op in operations: props = getattr(op, "properties", None) - if not props or props.provisioning_state != "Failed": + if not props: + continue + provisioning_state = getattr(props, "provisioning_state", None) or "" + if provisioning_state.lower() != "failed": continue target = getattr(props, "target_resource", None) resource_type = getattr(target, "resource_type", "") if target else "" @@ -1985,8 +2010,17 @@ def _collect_deployment_operation_errors( ) else: errors.append(f"{resource_type}/{resource_name}: {status}") - except Exception as ex: - log.debug(f"failed to list deployment operations: {ex}") + except (AzureError, ValueError, TypeError, AttributeError) as ex: + # Keep the original error path intact: never let this helper raise. + # Catch Azure SDK errors (AzureError covers HttpResponseError / + # ResourceNotFoundError) plus common parsing/shape mismatches + # (ValueError, TypeError, AttributeError). 
Programming errors + # outside this set will still propagate so they remain visible. + # Log with traceback so failures here are still debuggable. + log.debug( + f"failed to collect deployment operation errors: {ex}", + exc_info=True, + ) return errors # the VM may not be queried after deployed. use retry to mitigate it.