Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 93 additions & 1 deletion lisa/sut_orchestrator/azure/platform_.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
)

import requests
from azure.core.exceptions import HttpResponseError, ResourceNotFoundError
from azure.core.exceptions import AzureError, HttpResponseError, ResourceNotFoundError
from azure.identity import DefaultAzureCredential
from azure.mgmt.compute.models import (
CommunityGalleryImage,
Expand Down Expand Up @@ -1870,6 +1870,30 @@ def _deploy(
assert e.error, f"HttpResponseError: {e}"

error_message = "\n".join(self._parse_detail_errors(e.error))
# When Azure returns a generic deployment failure message without
# actionable per-resource details (e.g. truncated aggregated error,
# ResourceDeploymentFailure with only a tracking id, or any other
# *DeploymentFailed* code with no nested details), fall back to
# listing deployment operations to surface the real sub-resource
# errors.
top_code = getattr(e.error, "code", "") or ""
has_details = bool(getattr(e.error, "details", None))
if "aggregated deployment error is too large" in error_message or (
not has_details
and (
"ResourceDeploymentFailure" in top_code
or "DeploymentFailed" in top_code
)
):
op_errors = self._collect_deployment_operation_errors(
resource_group_name, log
)
if op_errors:
log.error(
"deployment failed sub-resource errors:\n"
+ "\n".join(op_errors)
)
error_message = error_message + "\n" + "\n".join(op_errors)
if (
self._azure_runbook.ignore_provisioning_error
and "OSProvisioningTimedOut: OS Provisioning for VM" in error_message
Expand Down Expand Up @@ -1931,6 +1955,74 @@ def _parse_detail_errors(self, error: Any) -> List[str]:
errors = [f"{error.code}: {error.message}"]
return errors

def _collect_deployment_operation_errors(
self, resource_group_name: str, log: Logger
) -> List[str]:
"""Fetch per-resource errors from a failed ARM deployment.

Used as a fallback when the top-level HttpResponseError does not
already carry actionable per-resource details. Callers in ``_deploy``
invoke this helper for any of the following cases:
Comment thread
LiliDeng marked this conversation as resolved.

* The aggregated "deployment error is too large" message, where ARM
truncates the nested error tree.
* ``ResourceDeploymentFailure`` errors with no nested
``error.details`` (e.g. transient internal server errors that
carry only a tracking id).
* ``DeploymentFailed`` errors that arrive without any nested
``details`` populated.

When the top-level error already carries actionable nested
``details``, callers skip this helper to avoid an extra ARM call
and duplicated/noisy output.

In all of these cases, listing the deployment operations is the
only way to surface the underlying per-resource failure messages.
"""
errors: List[str] = []
try:
# Azure SDK calls share auth state via files on disk; serialize
# access to avoid intermittent failures during parallel runs.
# See common.py global_credential_access_lock for context.
with global_credential_access_lock:
operations = list(
self._rm_client.deployment_operations.list(
resource_group_name=resource_group_name,
deployment_name=AZURE_DEPLOYMENT_NAME,
)
)
for op in operations:
props = getattr(op, "properties", None)
if not props:
continue
provisioning_state = getattr(props, "provisioning_state", None) or ""
if provisioning_state.lower() != "failed":
continue
target = getattr(props, "target_resource", None)
resource_type = getattr(target, "resource_type", "") if target else ""
resource_name = getattr(target, "resource_name", "") if target else ""
status = getattr(props, "status_message", None)
inner = getattr(status, "error", None) if status else None
if inner is not None:
errors.extend(
f"{resource_type}/{resource_name}: {msg}"
for msg in self._parse_detail_errors(inner)
)
else:
errors.append(f"{resource_type}/{resource_name}: {status}")
Comment thread
LiliDeng marked this conversation as resolved.
except (AzureError, ValueError, TypeError, AttributeError) as ex:
# Keep the original error path intact: never let this helper raise.
# Catch Azure SDK errors (AzureError covers HttpResponseError /
# ResourceNotFoundError) plus common parsing/shape mismatches
# (ValueError, TypeError, AttributeError). Programming errors
# outside this set will still propagate so they remain visible.
# Log with traceback so failures here are still debuggable.
log.debug(
f"failed to collect deployment operation errors: {ex}",
exc_info=True,
)
return errors

# the VM may not be queried after deployed. use retry to mitigate it.
@retry(exceptions=LisaException, tries=150, delay=2) # type: ignore
def _load_vms(
Expand Down
Loading