diff --git a/lisa/sut_orchestrator/azure/platform_.py b/lisa/sut_orchestrator/azure/platform_.py index 03030e27b0..4fedfbce38 100644 --- a/lisa/sut_orchestrator/azure/platform_.py +++ b/lisa/sut_orchestrator/azure/platform_.py @@ -32,7 +32,7 @@ ) import requests -from azure.core.exceptions import HttpResponseError, ResourceNotFoundError +from azure.core.exceptions import AzureError, HttpResponseError, ResourceNotFoundError from azure.identity import DefaultAzureCredential from azure.mgmt.compute.models import ( CommunityGalleryImage, @@ -1870,6 +1870,30 @@ def _deploy( assert e.error, f"HttpResponseError: {e}" error_message = "\n".join(self._parse_detail_errors(e.error)) + # When Azure returns a generic deployment failure message without + # actionable per-resource details (e.g. truncated aggregated error, + # ResourceDeploymentFailure with only a tracking id, or any other + # *DeploymentFailed* code with no nested details), fall back to + # listing deployment operations to surface the real sub-resource + # errors. + top_code = getattr(e.error, "code", "") or "" + has_details = bool(getattr(e.error, "details", None)) + if "aggregated deployment error is too large" in error_message or ( + not has_details + and ( + "ResourceDeploymentFailure" in top_code + or "DeploymentFailed" in top_code + ) + ): + op_errors = self._collect_deployment_operation_errors( + resource_group_name, log + ) + if op_errors: + log.error( + "deployment failed sub-resource errors:\n" + + "\n".join(op_errors) + ) + error_message = error_message + "\n" + "\n".join(op_errors) if ( self._azure_runbook.ignore_provisioning_error and "OSProvisioningTimedOut: OS Provisioning for VM" in error_message @@ -1931,6 +1955,74 @@ def _parse_detail_errors(self, error: Any) -> List[str]: errors = [f"{error.code}: {error.message}"] return errors + def _collect_deployment_operation_errors( + self, resource_group_name: str, log: Logger + ) -> List[str]: + """Fetch per-resource errors from a failed ARM deployment. + + Used as a fallback when the top-level HttpResponseError does not + already carry actionable per-resource details. Callers in ``_deploy`` + invoke this helper for any of the following cases: + + * The aggregated "deployment error is too large" message, where ARM + truncates the nested error tree. + * ``ResourceDeploymentFailure`` errors with no nested + ``error.details`` (e.g. transient internal server errors that + carry only a tracking id). + * ``DeploymentFailed`` errors that arrive without any nested + ``details`` populated. + + When the top-level error already carries actionable nested + ``details``, callers skip this helper to avoid an extra ARM call + and duplicated/noisy output. + + In all of these cases, listing the deployment operations is the + only way to surface the underlying per-resource failure messages. + """ + errors: List[str] = [] + try: + # Azure SDK calls share auth state via files on disk; serialize + # access to avoid intermittent failures during parallel runs. + # See common.py global_credential_access_lock for context. + with global_credential_access_lock: + operations = list( + self._rm_client.deployment_operations.list( + resource_group_name=resource_group_name, + deployment_name=AZURE_DEPLOYMENT_NAME, + ) + ) + for op in operations: + props = getattr(op, "properties", None) + if not props: + continue + provisioning_state = getattr(props, "provisioning_state", None) or "" + if provisioning_state.lower() != "failed": + continue + target = getattr(props, "target_resource", None) + resource_type = getattr(target, "resource_type", "") if target else "" + resource_name = getattr(target, "resource_name", "") if target else "" + status = getattr(props, "status_message", None) + inner = getattr(status, "error", None) if status else None + if inner is not None: + errors.extend( + f"{resource_type}/{resource_name}: {msg}" + for msg in self._parse_detail_errors(inner) + ) + else: + errors.append(f"{resource_type}/{resource_name}: {status}") + except (AzureError, ValueError, TypeError, AttributeError) as ex: + # Keep the original error path intact: never let this helper raise. + # Catch Azure SDK errors (AzureError covers HttpResponseError / + # ResourceNotFoundError) plus common parsing/shape mismatches + # (ValueError, TypeError, AttributeError). Programming errors + # outside this set will still propagate so they remain visible. + # Log with traceback so failures here are still debuggable. + log.debug( + f"failed to collect deployment operation errors: {ex}", + exc_info=True, + ) + return errors + # the VM may not be queried after deployed. use retry to mitigate it. @retry(exceptions=LisaException, tries=150, delay=2) # type: ignore def _load_vms(