diff --git a/nutanix/assets/monitors/alerts.json b/nutanix/assets/monitors/alerts.json new file mode 100644 index 0000000000000..38303262cdf5a --- /dev/null +++ b/nutanix/assets/monitors/alerts.json @@ -0,0 +1,27 @@ +{ + "version": 2, + "created_at": "2026-04-29", + "last_updated_at": "2026-04-29", + "title": "Nutanix alert is open", + "description": "Notifies when a Nutanix alert is unresolved (open or acknowledged) and auto-resolves when the alert is marked resolved in Prism Central. Each Nutanix alert produces its own monitor case via the `ext_id` grouping facet.", + "definition": { + "name": "{{#is_alert}}OPEN{{/is_alert}}{{#is_recovery}}RESOLVED{{/is_recovery}} [Nutanix {{ntnx_alert_severity.name}}] [Impact: {{ntnx_alert_impact.name}}] [PC: {{ntnx_cluster_name.name}}] - ID: {{ext_id.name}}", + "type": "query alert", + "query": "avg(last_15m):clamp_min(default_zero(avg:nutanix.alert.open{*} by {ext_id,ntnx_alert_impact,ntnx_cluster_name,ntnx_alert_severity,prism_central,ntnx_alert_type}) + default_zero(avg:nutanix.alert.acknowledged{*} by {ext_id,ntnx_alert_impact,ntnx_cluster_name,ntnx_alert_severity,prism_central,ntnx_alert_type}) - default_zero(avg:nutanix.alert.resolved{*} by {ext_id,ntnx_alert_impact,ntnx_cluster_name,ntnx_alert_severity,prism_central,ntnx_alert_type}), 0) > 0", + "message": "@your-team\n\n{{#is_alert}}A Nutanix alert is currently unresolved (open or acknowledged) and requires attention.{{/is_alert}}\n{{#is_recovery}}The Nutanix alert has been resolved.{{/is_recovery}}\n\n**Alert details**\n- **Nutanix Alert ID (extId):** `{{ext_id.name}}`\n- **Severity:** `{{ntnx_alert_severity.name}}`\n- **Impact:** `{{ntnx_alert_impact.name}}`\n- **Type:** `{{ntnx_alert_type.name}}`\n- **Cluster:** `{{ntnx_cluster_name.name}}`", + "tags": ["integration:nutanix"], + "options": { + "thresholds": { + "critical": 0 + }, + "notify_audit": false, + "on_missing_data": "default", + "include_tags": false, + "new_group_delay": 60, + "renotify_interval": 0, + "escalation_message": "", + "notify_by": ["ext_id"] + } + }, + "tags": ["integration:nutanix"] +} diff --git a/nutanix/changelog.d/23489.added b/nutanix/changelog.d/23489.added new file mode 100644 index 0000000000000..d533e6e2cfbb6 --- /dev/null +++ b/nutanix/changelog.d/23489.added @@ -0,0 +1 @@ +Submit `nutanix.alert.open` gauge per Nutanix alert (1 while open, 0 once on resolution) for individual alert lifecycle monitoring. \ No newline at end of file diff --git a/nutanix/changelog.d/23501.fixed b/nutanix/changelog.d/23501.fixed new file mode 100644 index 0000000000000..8b062cdc6b6ec --- /dev/null +++ b/nutanix/changelog.d/23501.fixed @@ -0,0 +1 @@ +Reconcile open alerts against the unresolved-alerts API on every check cycle instead of relying on a persistent cache. Eliminates orphaned `nutanix.alert.open` metric emissions when a resolution is missed (e.g., agent downtime spanning a resolution, or alert deletion). \ No newline at end of file diff --git a/nutanix/changelog.d/23508.added b/nutanix/changelog.d/23508.added new file mode 100644 index 0000000000000..6be67940cc17d --- /dev/null +++ b/nutanix/changelog.d/23508.added @@ -0,0 +1 @@ +Split alert lifecycle gauge into per-state metrics: `nutanix.alert.open` (unacknowledged + unresolved), `nutanix.alert.acknowledged` (acknowledged + unresolved), and `nutanix.alert.resolved` (one-shot on resolution detection). State transitions emit an explicit 0 to the previous state's metric. 
\ No newline at end of file diff --git a/nutanix/changelog.d/23511.added b/nutanix/changelog.d/23511.added new file mode 100644 index 0000000000000..7ea70d9f49877 --- /dev/null +++ b/nutanix/changelog.d/23511.added @@ -0,0 +1 @@ +Add `ntnx_originating_cluster_name`, `ntnx_alert_user_defined`, and `ntnx_alert_service` (when present) tags to alert events and metrics. Drop redundant `ntnx_alert_status` tag from per-state metrics (state is encoded in the metric name). \ No newline at end of file diff --git a/nutanix/datadog_checks/nutanix/__about__.py b/nutanix/datadog_checks/nutanix/__about__.py index 49c38e19af7b7..f95f33cb9bcac 100644 --- a/nutanix/datadog_checks/nutanix/__about__.py +++ b/nutanix/datadog_checks/nutanix/__about__.py @@ -1,4 +1,4 @@ # (C) Datadog, Inc. 2026-present # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) -__version__ = '1.1.0' +__version__ = '1.2.0+dev' diff --git a/nutanix/datadog_checks/nutanix/activity_monitor.py b/nutanix/datadog_checks/nutanix/activity_monitor.py index 3914c44466634..0742f1a8b03ed 100644 --- a/nutanix/datadog_checks/nutanix/activity_monitor.py +++ b/nutanix/datadog_checks/nutanix/activity_monitor.py @@ -17,6 +17,9 @@ from datadog_checks.nutanix.check import NutanixCheck +_SEVERITY_TO_ALERT_TYPE = {"CRITICAL": "error", "WARNING": "warning", "INFO": "info"} + + class _SafeDict(dict): """Dict that returns missing keys as template placeholders for safe string formatting.""" @@ -31,7 +34,6 @@ def __init__(self, check: NutanixCheck): self.last_event_collection_time = self.check.read_persistent_cache("last_event_collection_time") self.last_task_collection_time = self.check.read_persistent_cache("last_task_collection_time") self.last_audit_collection_time = self.check.read_persistent_cache("last_audit_collection_time") - self.last_alert_collection_time = self.check.read_persistent_cache("last_alert_collection_time") # In-memory caches: id -> raw item (reset each check run) self.events: dict[str, dict] = {} self.audits: dict[str, dict] = {} @@ -51,6 +53,9 @@ def __init__(self, check: NutanixCheck): else: self.alerts_v42_supported = None + # extId -> last-seen alert dict; reconciled each cycle against the unresolved-alerts API. 
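+ # Held in memory only; on a cold start (agent restart) the map is re-primed from the unresolved-alerts endpoint, so no persistent cursor is needed.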
+ self._open_alerts: dict[str, dict] = {} + def reset_state(self) -> None: """Reset in-memory caches and counters for a new collection run.""" self.events = {} @@ -191,16 +196,46 @@ def collect_audits(self) -> None: ) def collect_alerts(self) -> None: - self.alerts_count = self._safe_collect( - "alert", - lambda: self._collect( - activity_kind="alert", - list_fn=self._list_alerts, - process_fn=self._process_alert, - time_field="creationTime", - cache_key="last_alert_collection_time", - ), - ) + self.alerts_count = self._safe_collect("alert", self._reconcile_alerts) + + @staticmethod + def _alert_state(alert: dict) -> str: + return "acknowledged" if alert.get("isAcknowledged") else "open" + + def _submit_state_metric(self, alert: dict, state: str, value: int) -> None: + """Submit the nutanix.alert.<state> gauge (the state is encoded in the metric name, not in a tag).""" + self.check.gauge(f"alert.{state}", value, tags=self._build_alert_tags(alert)) + + def _reconcile_alerts(self) -> int: + """Reconcile open alerts against the unresolved-alerts API and emit per-state metrics.""" + alerts = self._list_alerts_unresolved() + alerts = [a for a in alerts if self._should_collect_activity_item(a, "alert")] + api_alerts = {a.get("extId"): a for a in alerts if a.get("extId")} + + count = 0 + # Newly surfaced unresolved alerts: start tracking and emit the alert event. + for ext_id in api_alerts.keys() - self._open_alerts.keys(): + alert = api_alerts[ext_id] + self._open_alerts[ext_id] = alert + self._process_alert(alert) + count += 1 + + # Tracked alerts missing from the API: resolved (or deleted) in Prism Central. + for ext_id in self._open_alerts.keys() - api_alerts.keys(): + cached = self._open_alerts.pop(ext_id) + self._emit_resolution_event(self._get_alert(ext_id) or cached, cached_tags_alert=cached) + count += 1 + + # Alerts present in both: detect open <-> acknowledged flips and zero out the previous state's gauge. + for ext_id in api_alerts.keys() & self._open_alerts.keys(): + old_alert = self._open_alerts[ext_id] + new_alert = api_alerts[ext_id] + old_state = self._alert_state(old_alert) + if old_state != self._alert_state(new_alert): + self._submit_state_metric(old_alert, old_state, 0) + self._open_alerts[ext_id] = new_alert + + # Re-emit a 1 on the current state's gauge for every alert still tracked. + for alert in self._open_alerts.values(): + self._submit_state_metric(alert, self._alert_state(alert), 1) + + return count def _list_activity(self, endpoint: str, time_field: str, start_time_str: str) -> list[dict]: """Fetch activity items from Prism Central.""" @@ -210,41 +245,52 @@ def _list_activity(self, endpoint: str, time_field: str, start_time_str: str) -> } return self.check._get_paginated_request_data(endpoint, params=params) - def _list_alerts(self, start_time_str: str) -> list[dict]: - """Fetch alerts from Prism Central with v4.2/v4.0 fallback.""" + def _list_alerts_unresolved(self) -> list[dict]: + """Fetch all currently-unresolved alerts (the source of truth each cycle).""" params = { - "$filter": f"creationTime gt {start_time_str}", - "$orderBy": "creationTime asc", + "$filter": "isResolved eq false", + "$orderBy": "lastUpdatedTime asc", } + return self._list_alerts_with_fallback(params, fallback_filter=lambda a: not a.get("isResolved")) + def _list_alerts_with_fallback( + self, + params: dict, + fallback_filter: Callable[[dict], bool] | None = None, + ) -> list[dict]: + """Fetch alerts with v4.2/v4.0 fallback.""" if self.alerts_v42_supported is False: self.check.log.debug("[%s] Using alerts API v4.0 (v4.2 not supported)", self._pc_label) - del params["$filter"] - return self.check._get_paginated_request_data("api/monitoring/v4.0/serviceability/alerts", params=params) + v40_params = {"$orderBy": params.get("$orderBy", "lastUpdatedTime asc")} + alerts = self.check._get_paginated_request_data( + "api/monitoring/v4.0/serviceability/alerts", 
params=v40_params + ) + if fallback_filter: + alerts = [a for a in alerts if fallback_filter(a)] + return alerts try: self.check.log.debug("[%s] Attempting to use alerts API v4.2", self._pc_label) result = self.check._get_paginated_request_data("api/monitoring/v4.2/serviceability/alerts", params=params) if self.alerts_v42_supported is None: - self.check.log.debug( - "[%s] Alerts API v4.2 is supported, caching for future use", - self._pc_label, - ) + self.check.log.debug("[%s] Alerts API v4.2 is supported, caching for future use", self._pc_label) self.alerts_v42_supported = True self.check.write_persistent_cache("alerts_v42_supported", "True") return result except HTTPError as e: if e.response is not None and e.response.status_code == 404: self.check.log.debug( - "[%s] Alerts API v4.2 not supported, falling back to v4.0 permanently", - self._pc_label, + "[%s] Alerts API v4.2 not supported, falling back to v4.0 permanently", self._pc_label ) self.alerts_v42_supported = False self.check.write_persistent_cache("alerts_v42_supported", "False") - del params["$filter"] - return self.check._get_paginated_request_data( - "api/monitoring/v4.0/serviceability/alerts", params=params + v40_params = {"$orderBy": params.get("$orderBy", "lastUpdatedTime asc")} + alerts = self.check._get_paginated_request_data( + "api/monitoring/v4.0/serviceability/alerts", params=v40_params ) + if fallback_filter: + alerts = [a for a in alerts if fallback_filter(a)] + return alerts raise def _get_alert(self, alert_ext_id: str) -> dict | None: @@ -415,44 +461,50 @@ def _render_message(self, message: str, parameters: list[dict]) -> str: self.check.log.debug("Failed to render alert message template: %s", e) return message + def _build_alert_tags(self, alert: dict, status: str | None = None) -> list[str]: + """Build the alert tag set; pass status only for events (metrics encode state in the name).""" + tags = self.check.base_tags.copy() + if ext_id := alert.get("extId"): + tags.append(f"ext_id:{ext_id}") + if alert_type := alert.get("alertType"): + tags.append(f"ntnx_alert_type:{alert_type}") + if severity := alert.get("severity"): + tags.append(f"ntnx_alert_severity:{severity}") + if (user_defined := alert.get("isUserDefined")) is not None: + tags.append(f"ntnx_alert_user_defined:{str(user_defined).lower()}") + if service_name := alert.get("serviceName"): + tags.append(f"ntnx_alert_service:{service_name}") + + self._add_cluster_name_tag(tags, alert.get("clusterUUID")) + self._add_cluster_name_tag(tags, alert.get("originatingClusterUUID"), tag_name="ntnx_originating_cluster_name") + + for classification in alert.get("classifications", []) or []: + tags.append(f"ntnx_alert_classification:{classification}") + for impact in alert.get("impactTypes", []) or []: + tags.append(f"ntnx_alert_impact:{impact}") + + self._add_source_entity_tags(tags, alert) + + tags.append("ntnx_type:alert") + if status is not None: + tags.append(f"ntnx_alert_status:{status}") + return tags + def _process_alert(self, alert: dict) -> None: """Process and send a single alert to Datadog.""" + ext_id = alert.get("extId", "") title = alert.get("title", "Nutanix Alert") message = alert.get("message", "") created_time = alert.get("creationTime") - severity = alert.get("severity") - alert_type = alert.get("alertType") + is_acknowledged = alert.get("isAcknowledged", False) - # Render template variables in title and message from parameters if parameters := alert.get("parameters"): title = self._render_message(title, parameters) message = 
self._render_message(message, parameters) - # map severity to alert_type - severity_map = { - "CRITICAL": "error", - "WARNING": "warning", - "INFO": "info", - } - event_alert_type = severity_map.get(severity, "info") - - alert_tags = self.check.base_tags.copy() - if alert_type: - alert_tags.append(f"ntnx_alert_type:{alert_type}") - if severity: - alert_tags.append(f"ntnx_alert_severity:{severity}") - - self._add_cluster_name_tag(alert_tags, alert.get("clusterUUID")) - - for classification in alert.get("classifications", []) or []: - alert_tags.append(f"ntnx_alert_classification:{classification}") - - for impact in alert.get("impactTypes", []) or []: - alert_tags.append(f"ntnx_alert_impact:{impact}") - - self._add_source_entity_tags(alert_tags, alert) - - alert_tags.append("ntnx_type:alert") + # Acknowledged alerts soften to "warning" regardless of severity (operator already triaging). + event_alert_type = "warning" if is_acknowledged else _SEVERITY_TO_ALERT_TYPE.get(alert.get("severity"), "info") + alert_tags = self._build_alert_tags(alert, "acknowledged" if is_acknowledged else "open") self.check.event( { @@ -465,9 +517,50 @@ def _process_alert(self, alert: dict) -> None: "alert_type": event_alert_type, "source_type_name": self.check.__NAMESPACE__, "tags": alert_tags, + "aggregation_key": f"nutanix-alert-{ext_id}", } ) + def _emit_resolution_event(self, alert: dict, cached_tags_alert: dict | None = None) -> None: + """Emit resolution event, close prev-state gauge, and emit nutanix.alert.resolved=1.""" + ext_id = alert.get("extId", "") + title = alert.get("title", "Nutanix Alert") + resolved_time = alert.get("resolvedTime") + resolved_by = alert.get("resolvedByUsername") + is_auto_resolved = alert.get("isAutoResolved", False) + + if parameters := alert.get("parameters"): + title = self._render_message(title, parameters) + + msg_text = "Auto-resolved" if is_auto_resolved else f"Resolved by {resolved_by}" if resolved_by else "Resolved" + + prev_alert = cached_tags_alert or alert + prev_state = self._alert_state(prev_alert) + metric_tags = self._build_alert_tags(prev_alert) + event_tags = [ + *metric_tags, + "ntnx_alert_status:resolved", + f"ntnx_alert_auto_resolved:{str(is_auto_resolved).lower()}", + ] + + self.check.event( + { + "timestamp": self._parse_timestamp(resolved_time) + if resolved_time + else get_timestamp(get_current_datetime()), + "event_type": self.check.__NAMESPACE__, + "msg_title": f"Alert Resolved: {title}", + "msg_text": msg_text, + "alert_type": "success", + "source_type_name": self.check.__NAMESPACE__, + "aggregation_key": f"nutanix-alert-{ext_id}", + "tags": event_tags, + } + ) + + self.check.gauge(f"alert.{prev_state}", 0, tags=metric_tags) + self.check.gauge("alert.resolved", 1, tags=metric_tags) + def _process_task(self, task: dict) -> None: """Process and send a single task to Datadog as an event.""" task_operation = task.get("operation", "Nutanix Task") @@ -554,14 +647,20 @@ def _add_source_entity_tags(self, tags: list[str], item: dict) -> None: if entity_name := source_entity.get("name"): tags.append(f"ntnx_{entity_type}_name:{entity_name}") - def _add_cluster_name_tag(self, tags: list[str], cluster_id: str | None, fallback_name: str | None = None) -> None: - """Add cluster name tag from ID lookup, with optional fallback.""" + def _add_cluster_name_tag( + self, + tags: list[str], + cluster_id: str | None, + fallback_name: str | None = None, + tag_name: str = "ntnx_cluster_name", + ) -> None: + """Add a cluster name tag from ID lookup, with optional fallback.""" if 
not cluster_id: return if cluster_id in self.check.cluster_names: - tags.append(f"ntnx_cluster_name:{self.check.cluster_names[cluster_id]}") + tags.append(f"{tag_name}:{self.check.cluster_names[cluster_id]}") elif fallback_name: - tags.append(f"ntnx_cluster_name:{fallback_name}") + tags.append(f"{tag_name}:{fallback_name}") def _cluster_resource(self, cluster_id: str) -> tuple[str, dict]: """Build a cluster resource tuple from a cluster ID.""" diff --git a/nutanix/manifest.json b/nutanix/manifest.json index 75aedb90c5a7e..6e597ceff127b 100644 --- a/nutanix/manifest.json +++ b/nutanix/manifest.json @@ -62,7 +62,9 @@ "Nutanix - Overview": "assets/dashboards/nutanix_overview.json", "Nutanix - Activity Monitoring": "assets/dashboards/nutanix_activity_monitoring.json" }, - "monitors": {}, + "monitors": { + "Nutanix alert is open": "assets/monitors/alerts.json" + }, "saved_views": {} }, "author": { diff --git a/nutanix/metadata.csv b/nutanix/metadata.csv index c77d5a4f8fbb8..0b7f120838db1 100644 --- a/nutanix/metadata.csv +++ b/nutanix/metadata.csv @@ -1,4 +1,7 @@ metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name,curated_metric,sample_tags +nutanix.alert.acknowledged,gauge,,,,1 while a Nutanix alert is acknowledged but not yet resolved; 0 emitted once when leaving the acknowledged state. Tagged per-alert via ext_id.,0,nutanix,alert acknowledged,,ext_id +nutanix.alert.open,gauge,,,,1 while a Nutanix alert is unresolved and unacknowledged; 0 emitted once when leaving the open state (acknowledged or resolved). Tagged per-alert via ext_id.,0,nutanix,alert open,,ext_id +nutanix.alert.resolved,gauge,,,,1 emitted once when a Nutanix alert is detected as resolved or deleted. One-shot signal useful for resolution-rate dashboards. 
Tagged per-alert via ext_id.,0,nutanix,alert resolved,,ext_id nutanix.api.rate_limited,count,,,,Count of HTTP 429 rate limit responses from the Prism Central API.,0,nutanix,rate_limited,, nutanix.cluster.aggregate_hypervisor.memory_usage,gauge,,,,Total memory usage across all hypervisors in the cluster.,0,nutanix,usage,, nutanix.cluster.controller.avg_io_latency,gauge,,,,Average I/O latency of the cluster storage controller.,0,nutanix,latency,, diff --git a/nutanix/tests/conftest.py b/nutanix/tests/conftest.py index 9dbee745235ec..a19eb0c08c6f0 100644 --- a/nutanix/tests/conftest.py +++ b/nutanix/tests/conftest.py @@ -297,7 +297,31 @@ def mock_response(url, params=None, *args, **kwargs): response_data = load_fixture_page("alerts.json", page) filter_param = params.get('$filter', '') if params else '' - if 'creationTime gt' in filter_param: + if 'isResolved eq false' in filter_param: + filtered_data = [a for a in response_data.get('data', []) if not a.get('isResolved')] + response_data = dict(response_data) + response_data['data'] = filtered_data + elif 'lastUpdatedTime gt' in filter_param: + from datetime import datetime + + filter_time_str = filter_param.split('lastUpdatedTime gt ')[-1].strip() + filter_time = datetime.fromisoformat(filter_time_str.replace('Z', '+00:00')) + + filtered_data = [] + for alert in response_data.get('data', []): + alert_time_str = alert.get('lastUpdatedTime', '') + if alert_time_str: + alert_time = datetime.fromisoformat(alert_time_str.replace('Z', '+00:00')) + if alert_time > filter_time: + filtered_data.append(alert) + + filtered_data.sort( + key=lambda t: datetime.fromisoformat(t.get('lastUpdatedTime', '').replace('Z', '+00:00')) + ) + + response_data = dict(response_data) + response_data['data'] = filtered_data + elif 'creationTime gt' in filter_param: from datetime import datetime filter_time_str = filter_param.split('creationTime gt ')[-1].strip() diff --git a/nutanix/tests/metrics.py b/nutanix/tests/metrics.py index 65d13c41ebcc7..a49ba845395f6 100644 --- a/nutanix/tests/metrics.py +++ b/nutanix/tests/metrics.py @@ -5,6 +5,9 @@ HEALTH_METRICS = ["nutanix.health.up"] +ALERT_METRICS = ["nutanix.alert.open", "nutanix.alert.acknowledged"] +ALERT_METRICS_OPTIONAL = ["nutanix.alert.resolved"] + CLUSTER_STATS_METRICS_REQUIRED = [ "nutanix.cluster.aggregate_hypervisor.memory_usage", "nutanix.cluster.controller.avg_io_latency", diff --git a/nutanix/tests/test_alerts.py b/nutanix/tests/test_alerts.py index aa5ccf29adac4..b3364dbd42299 100644 --- a/nutanix/tests/test_alerts.py +++ b/nutanix/tests/test_alerts.py @@ -10,91 +10,22 @@ from datadog_checks.nutanix import NutanixCheck +from .conftest import load_fixture + pytestmark = [pytest.mark.unit] -# Mock datetime to cover full alerts fixture window MOCK_ALERT_DATETIME = datetime.fromisoformat("2026-01-04T21:09:00.000000Z") +MOCK_ALERT_DATETIME_AFTER_ALL = datetime.fromisoformat("2026-05-01T00:00:00.000000Z") -EXPECTED_ALERTS = [ - { - 'alert_type': 'warning', - 'event_type': 'nutanix', - 'msg_text': 'Disk space usage for {mount_path} on {entity} {ip_address} has exceeded {threshold}%. 
{ref_msg}', - 'msg_title': 'Alert: Disk space usage high for {mount_path} on {entity} {ip_address}', - 'source_type_name': 'nutanix', - 'tags': [ - 'nutanix', - 'prism_central:10.0.0.197', - 'ntnx_alert_type:A1031', - 'ntnx_alert_severity:WARNING', - 'ntnx_alert_classification:Storage', - 'ntnx_alert_impact:SYSTEM_INDICATOR', - 'ntnx_node_name:10-0-0-103-aws-us-east-1a', - 'ntnx_type:alert', - ], - 'timestamp': 1767560958, - }, - { - 'alert_type': 'warning', - 'event_type': 'nutanix', - 'msg_text': 'Disk space usage for {mount_path} on {entity} {ip_address} has exceeded {threshold}%. {ref_msg}', - 'msg_title': 'Alert: Disk space usage high for {mount_path} on {entity} {ip_address}', - 'source_type_name': 'nutanix', - 'tags': [ - 'nutanix', - 'prism_central:10.0.0.197', - 'ntnx_alert_type:A1031', - 'ntnx_alert_severity:WARNING', - 'ntnx_alert_classification:Storage', - 'ntnx_alert_impact:SYSTEM_INDICATOR', - 'ntnx_node_name:10-0-0-103-aws-us-east-1a', - 'ntnx_type:alert', - ], - 'timestamp': 1767691459, - }, - { - 'alert_type': 'info', - 'event_type': 'nutanix', - 'msg_text': 'Recovery Point for VM {vm_name} failed to capture associated policies ' - 'and categories because {reason}.', - 'msg_title': 'Alert: Degraded VM Recovery Point.', - 'source_type_name': 'nutanix', - 'tags': [ - 'nutanix', - 'prism_central:10.0.0.197', - 'ntnx_alert_type:A130172', - 'ntnx_alert_severity:INFO', - 'ntnx_alert_classification:DR', - 'ntnx_alert_impact:SYSTEM_INDICATOR', - 'ntnx_vm_name:ubuntu-vm', - 'ntnx_type:alert', - ], - 'timestamp': 1768302387, - }, -] - - -@mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") -def test_alerts_collection(get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get): - """Test that alerts are collected and have basic structure.""" - instance = mock_instance.copy() - instance["collect_alerts"] = True - - get_current_datetime.return_value = MOCK_ALERT_DATETIME - check = NutanixCheck('nutanix', {}, [instance]) - dd_run_check(check) - - alerts = [e for e in aggregator.events if "ntnx_type:alert" in e.get("tags", [])] - - assert len(alerts) > 0, "Expected alerts to be collected" - # Check that alerts have the expected structure - for alert in alerts: - assert alert['event_type'] == 'nutanix' - assert alert['source_type_name'] == 'nutanix' - assert 'ntnx_type:alert' in alert['tags'] - assert 'ntnx_alert_type' in str(alert['tags']) +def _fixture_alert(alert_type, **overrides): + """Load the first fixture alert with the given alertType and apply overrides.""" + for page in load_fixture('alerts.json'): + for alert in page.get('data', []): + if alert.get('alertType') == alert_type: + return {**alert, **overrides} + raise ValueError(f"No alert with alertType={alert_type} in fixture") @mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") @@ -105,7 +36,8 @@ def test_alerts_no_duplicates_on_subsequent_runs( instance = mock_instance.copy() instance["collect_alerts"] = True - get_current_datetime.return_value = MOCK_ALERT_DATETIME + # Datetime past all fixture alerts, so the cursor doesn't surface anything new on re-run + get_current_datetime.return_value = MOCK_ALERT_DATETIME_AFTER_ALL check = NutanixCheck('nutanix', {}, [instance]) dd_run_check(check) @@ -115,7 +47,7 @@ def test_alerts_no_duplicates_on_subsequent_runs( aggregator.reset() - # second check run, no new alerts to be collected + # Second run with the same fixture state: reconciliation diff is empty dd_run_check(check) alerts = [e for e in aggregator.events if 
"ntnx_type:alert" in e.get("tags", [])] @@ -144,7 +76,6 @@ def test_alerts_filtered_by_resource_filters_exclude_cluster( dd_run_check(check) alerts = [e for e in aggregator.events if "ntnx_type:alert" in e.get("tags", [])] - # No alerts should have the excluded cluster assert all("ntnx_cluster_name:datadog-nutanix-dev" not in e["tags"] for e in alerts) @@ -166,7 +97,6 @@ def test_alerts_filtered_by_resource_filters_include_cluster( alerts = [e for e in aggregator.events if "ntnx_type:alert" in e.get("tags", [])] assert len(alerts) > 0, "Expected some alerts to be collected" - # All collected alerts should have the included cluster ID assert all("ntnx_cluster_name:datadog-nutanix-dev" in e["tags"] for e in alerts) @@ -186,9 +116,12 @@ def test_alerts_filtered_by_activity_filter_severity( check = NutanixCheck('nutanix', {}, [instance]) dd_run_check(check) - alerts = [e for e in aggregator.events if "ntnx_type:alert" in e.get("tags", [])] + alerts = [ + e + for e in aggregator.events + if "ntnx_type:alert" in e.get("tags", []) and "ntnx_alert_status:open" in e.get("tags", []) + ] assert len(alerts) > 0, "Expected some WARNING alerts to be collected" - # All collected alerts should have WARNING severity assert all("ntnx_alert_severity:WARNING" in e["tags"] for e in alerts) @@ -214,30 +147,41 @@ def test_alerts_filtered_by_inexistent_property_nothing_collected( @mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") def test_alerts_filtered_by_activity_filter_alertType( - get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get + get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get, mocker ): """Test that only alerts matching the alertType filter are collected. Uses property 'alertType' to match the Nutanix API field name. + A200335 only exists as a resolved alert in the fixture; inject an + unresolved synthetic copy so reconciliation surfaces it. 
""" instance = mock_instance.copy() instance["collect_alerts"] = True instance["resource_filters"] = [ - {"resource": "alert", "property": "alertType", "patterns": ["^A130172$"]}, + {"resource": "alert", "property": "alertType", "patterns": ["^A200335$"]}, ] get_current_datetime.return_value = MOCK_ALERT_DATETIME check = NutanixCheck('nutanix', {}, [instance]) + synthetic = _fixture_alert("A200335", isResolved=False, isAcknowledged=False) + mocker.patch.object(check.activity_monitor, '_list_alerts_unresolved', return_value=[synthetic]) dd_run_check(check) - alerts = [e for e in aggregator.events if "ntnx_type:alert" in e.get("tags", [])] + alerts = [ + e + for e in aggregator.events + if "ntnx_type:alert" in e.get("tags", []) + and any(t in e.get("tags", []) for t in ("ntnx_alert_status:open", "ntnx_alert_status:acknowledged")) + ] assert len(alerts) == 1 - assert "ntnx_alert_type:A130172" in alerts[0]["tags"] + assert "ntnx_alert_type:A200335" in alerts[0]["tags"] @mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") -def test_alert_message_template_rendering(get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get): +def test_alert_message_template_rendering( + get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get, mocker +): """Test that alert messages with template variables are rendered correctly.""" instance = mock_instance.copy() instance["collect_alerts"] = True @@ -248,9 +192,16 @@ def test_alert_message_template_rendering(get_current_datetime, dd_run_check, ag get_current_datetime.return_value = MOCK_ALERT_DATETIME check = NutanixCheck('nutanix', {}, [instance]) + synthetic = _fixture_alert("A6227", isResolved=False, isAcknowledged=False) + mocker.patch.object(check.activity_monitor, '_list_alerts_unresolved', return_value=[synthetic]) dd_run_check(check) - alerts = [e for e in aggregator.events if "ntnx_type:alert" in e.get("tags", [])] + alerts = [ + e + for e in aggregator.events + if "ntnx_type:alert" in e.get("tags", []) + and any(t in e.get("tags", []) for t in ("ntnx_alert_status:open", "ntnx_alert_status:acknowledged")) + ] assert len(alerts) > 0 alert = alerts[0] @@ -276,27 +227,27 @@ def test_alert_a1031_disk_space_complete_output( check = NutanixCheck('nutanix', {}, [instance]) dd_run_check(check) - alerts = [e for e in aggregator.events if "ntnx_type:alert" in e.get("tags", [])] + alerts = [ + e + for e in aggregator.events + if "ntnx_type:alert" in e.get("tags", []) and "ntnx_alert_status:acknowledged" in e.get("tags", []) + ] assert len(alerts) >= 1, "Expected at least one A1031 alert" alert = alerts[0] - # Verify message rendering assert "Disk space usage for /var/log on Controller VM 10.0.0.108 has exceeded 75%" in alert["msg_text"] assert "{mount_path}" not in alert["msg_text"] assert "{entity}" not in alert["msg_text"] assert "{ip_address}" not in alert["msg_text"] assert "{threshold}" not in alert["msg_text"] - # Verify title rendering assert "Disk space usage high for /var/log on Controller VM 10.0.0.108" in alert["msg_title"] - # Verify alert structure assert alert["event_type"] == "nutanix" assert alert["alert_type"] == "warning" assert alert["source_type_name"] == "nutanix" - # Verify tags assert "ntnx_type:alert" in alert["tags"] assert "ntnx_alert_type:A1031" in alert["tags"] assert "ntnx_alert_severity:WARNING" in alert["tags"] @@ -307,53 +258,54 @@ def test_alert_a1031_disk_space_complete_output( @mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") -def 
test_alert_a130172_vm_recovery_complete_output( - get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get +def test_alert_a111050_default_password_complete_output( + get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get, mocker ): - """Test complete alert output for A130172 (VM recovery) with rendered message.""" + """Test complete alert output for A111050 (default password) with rendered message.""" instance = mock_instance.copy() instance["collect_alerts"] = True instance["resource_filters"] = [ - {"resource": "alert", "property": "alertType", "patterns": ["^A130172$"]}, + {"resource": "alert", "property": "alertType", "patterns": ["^A111050$"]}, ] get_current_datetime.return_value = MOCK_ALERT_DATETIME check = NutanixCheck('nutanix', {}, [instance]) + # A111050 (c7dbae76) is resolved+acknowledged in the fixture; inject an + # unresolved+acknowledged synthetic copy so reconciliation surfaces it. + synthetic = _fixture_alert("A111050", isResolved=False, isAcknowledged=True) + mocker.patch.object(check.activity_monitor, '_list_alerts_unresolved', return_value=[synthetic]) dd_run_check(check) - alerts = [e for e in aggregator.events if "ntnx_type:alert" in e.get("tags", [])] - assert len(alerts) == 1, "Expected exactly one A130172 alert" + alerts = [ + e + for e in aggregator.events + if "ntnx_type:alert" in e.get("tags", []) and "ntnx_alert_status:acknowledged" in e.get("tags", []) + ] + assert len(alerts) >= 1, "Expected at least one A111050 alert" alert = alerts[0] - # Verify message rendering - expected_message = ( - "Recovery Point for VM ubuntu-vm failed to capture associated policies " - "and categories because Management plane is not available to get the configuration." - ) - assert alert["msg_text"] == expected_message - assert "{vm_name}" not in alert["msg_text"] - assert "{reason}" not in alert["msg_text"] + assert "nutanix" in alert["msg_text"] + assert "{users}" not in alert["msg_text"] + assert "{pcvm_ip}" not in alert["msg_title"] + assert "10.0.0.165" in alert["msg_title"] - # Verify alert structure assert alert["event_type"] == "nutanix" - assert alert["alert_type"] == "info" + assert alert["alert_type"] == "warning" # acknowledged alerts use "warning" regardless of severity assert alert["source_type_name"] == "nutanix" - assert alert["msg_title"] == "Alert: Degraded VM Recovery Point." 
- # Verify tags assert "ntnx_type:alert" in alert["tags"] - assert "ntnx_alert_type:A130172" in alert["tags"] - assert "ntnx_alert_severity:INFO" in alert["tags"] - assert "ntnx_alert_classification:DR" in alert["tags"] - assert "ntnx_alert_impact:SYSTEM_INDICATOR" in alert["tags"] - assert "ntnx_vm_name:ubuntu-vm" in alert["tags"] + assert "ntnx_alert_type:A111050" in alert["tags"] + assert "ntnx_alert_severity:CRITICAL" in alert["tags"] + assert "ntnx_alert_classification:Cluster" in alert["tags"] + assert "ntnx_alert_impact:CONFIGURATION" in alert["tags"] + assert "ntnx_cluster_name:prism-central-deployment" in alert["tags"] @mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") def test_alert_a6227_password_expiry_complete_output( - get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get + get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get, mocker ): """Test complete alert output for A6227 (password expiry) with rendered message.""" instance = mock_instance.copy() @@ -365,25 +317,30 @@ def test_alert_a6227_password_expiry_complete_output( get_current_datetime.return_value = MOCK_ALERT_DATETIME check = NutanixCheck('nutanix', {}, [instance]) + # A6227 alerts in the fixture are resolved+acknowledged; inject an + # unresolved+acknowledged synthetic copy. + synthetic = _fixture_alert("A6227", isResolved=False, isAcknowledged=True) + mocker.patch.object(check.activity_monitor, '_list_alerts_unresolved', return_value=[synthetic]) dd_run_check(check) - alerts = [e for e in aggregator.events if "ntnx_type:alert" in e.get("tags", [])] + alerts = [ + e + for e in aggregator.events + if "ntnx_type:alert" in e.get("tags", []) and "ntnx_alert_status:acknowledged" in e.get("tags", []) + ] assert len(alerts) >= 1, "Expected at least one A6227 alert" alert = alerts[0] - # Verify message rendering expected_message = "Admin user password has expired. Please change the admin password." assert alert["msg_text"] == expected_message assert "{alert_msg}" not in alert["msg_text"] - # Verify alert structure assert alert["event_type"] == "nutanix" - assert alert["alert_type"] == "error" + assert alert["alert_type"] == "warning" # acknowledged alerts use "warning" regardless of severity assert alert["source_type_name"] == "nutanix" assert alert["msg_title"] == "Alert: The PC admin user password is going to expire soon or has already expired." 
- # Verify tags assert "ntnx_type:alert" in alert["tags"] assert "ntnx_alert_type:A6227" in alert["tags"] assert "ntnx_alert_severity:CRITICAL" in alert["tags"] @@ -392,6 +349,79 @@ def test_alert_a6227_password_expiry_complete_output( assert "ntnx_cluster_name:prism-central-deployment" in alert["tags"] +@mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") +def test_alert_event_has_aggregation_key_and_status_tag( + get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get +): + """Test that alert events include aggregation_key and ntnx_alert_status:open tag.""" + instance = mock_instance.copy() + instance["collect_alerts"] = True + + get_current_datetime.return_value = MOCK_ALERT_DATETIME + + check = NutanixCheck('nutanix', {}, [instance]) + dd_run_check(check) + + alerts = [e for e in aggregator.events if "ntnx_type:alert" in e.get("tags", [])] + assert len(alerts) > 0 + + for alert in alerts: + assert "aggregation_key" in alert, "Alert event must have aggregation_key" + assert alert["aggregation_key"].startswith("nutanix-alert-") + assert any(t in alert["tags"] for t in ("ntnx_alert_status:open", "ntnx_alert_status:acknowledged")) + + +@pytest.mark.parametrize( + "is_auto_resolved, expected_msg_text, expected_auto_tag", + [ + (False, "Resolved by noueman", "ntnx_alert_auto_resolved:false"), + (True, "Auto-resolved", "ntnx_alert_auto_resolved:true"), + ], +) +@mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") +def test_emit_resolution_event( + get_current_datetime, + dd_run_check, + aggregator, + mock_instance, + mock_http_get, + is_auto_resolved, + expected_msg_text, + expected_auto_tag, +): + """_emit_resolution_event produces a success event with the right msg_text and auto_resolved tag.""" + get_current_datetime.return_value = MOCK_ALERT_DATETIME + + check = NutanixCheck('nutanix', {}, [mock_instance]) + dd_run_check(check) + aggregator.reset() + + resolved_alert = { + "extId": "test-alert-123", + "title": "Test Alert Title", + "severity": "WARNING", + "alertType": "A1031", + "isResolved": True, + "resolvedTime": "2026-03-04T00:49:39.030653Z", + "resolvedByUsername": "noueman", + "isAutoResolved": is_auto_resolved, + "classifications": ["Storage"], + "impactTypes": ["SYSTEM_INDICATOR"], + } + + check.activity_monitor._emit_resolution_event(resolved_alert) + + event = aggregator.events[0] + assert event["alert_type"] == "success" + assert event["aggregation_key"] == "nutanix-alert-test-alert-123" + assert event["msg_title"] == "Alert Resolved: Test Alert Title" + assert expected_msg_text in event["msg_text"] + assert expected_auto_tag in event["tags"] + assert "ntnx_alert_status:resolved" in event["tags"] + assert "ntnx_type:alert" in event["tags"] + assert "ntnx_alert_severity:WARNING" in event["tags"] + + @mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") def test_alert_with_ip_address_rendering(get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get): """Test that ip_address template variable is rendered correctly in alert messages.""" @@ -406,20 +436,608 @@ def test_alert_with_ip_address_rendering(get_current_datetime, dd_run_check, agg check = NutanixCheck('nutanix', {}, [instance]) dd_run_check(check) - alerts = [e for e in aggregator.events if "ntnx_type:alert" in e.get("tags", [])] + alerts = [ + e + for e in aggregator.events + if "ntnx_type:alert" in e.get("tags", []) and "ntnx_alert_status:acknowledged" in e.get("tags", []) + ] assert len(alerts) >= 1, "Expected 
at least one A1031 alert with ip_address" alert = alerts[0] - # Verify ip_address is rendered in message assert "10.0.0.108" in alert["msg_text"], "IP address should be rendered in message" assert "{ip_address}" not in alert["msg_text"], "Template variable should be replaced" - # Verify ip_address is rendered in title assert "10.0.0.108" in alert["msg_title"], "IP address should be rendered in title" assert "{ip_address}" not in alert["msg_title"], "Template variable should be replaced" - # Verify complete rendered message contains ip_address in proper context assert "Disk space usage for /var/log on Controller VM 10.0.0.108" in alert["msg_text"], ( "Message should contain rendered ip_address in context" ) + + +@mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") +def test_alerts_first_run_collects_only_unresolved( + get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get +): + """First check cycle should track only currently-unresolved alerts.""" + instance = mock_instance.copy() + instance["collect_alerts"] = True + + get_current_datetime.return_value = MOCK_ALERT_DATETIME + + check = NutanixCheck('nutanix', {}, [instance]) + dd_run_check(check) + + alerts = [e for e in aggregator.events if "ntnx_type:alert" in e.get("tags", [])] + assert len(alerts) > 0 + for alert in alerts: + assert alert["event_type"] == "nutanix" + assert alert["source_type_name"] == "nutanix" + assert any(t in alert["tags"] for t in ("ntnx_alert_status:open", "ntnx_alert_status:acknowledged")) + assert any(t.startswith("ntnx_alert_type:") for t in alert["tags"]) + + resolved = [e for e in aggregator.events if "ntnx_alert_status:resolved" in e.get("tags", [])] + assert len(resolved) == 0 + assert len(check.activity_monitor._open_alerts) == len(alerts) + + +@mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") +def test_alerts_resolution_detected_on_subsequent_run( + get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get, mocker +): + """When a previously-tracked alert disappears from the unresolved list, a resolution event is emitted.""" + instance = mock_instance.copy() + instance["collect_alerts"] = True + + get_current_datetime.return_value = MOCK_ALERT_DATETIME + + check = NutanixCheck('nutanix', {}, [instance]) + + # First run: populate _open_alerts from the real fixture + dd_run_check(check) + + open_events = [e for e in aggregator.events if "ntnx_alert_status:open" in e.get("tags", [])] + assert len(open_events) > 0 + + target_ext_id = next(iter(check.activity_monitor._open_alerts)) + + aggregator.reset() + + resolved_alert = { + "$objectType": "monitoring.v4.serviceability.Alert", + "extId": target_ext_id, + "isResolved": True, + "resolvedTime": "2026-03-04T01:10:00.000000Z", + "resolvedByUsername": "admin", + "isAutoResolved": False, + "isAcknowledged": False, + "title": "Resolved Test Alert", + "alertType": "A1031", + "severity": "WARNING", + "creationTime": "2026-03-04T00:46:29.532987Z", + "lastUpdatedTime": "2026-03-04T01:10:00.000000Z", + "classifications": ["Storage"], + "impactTypes": ["SYSTEM_INDICATOR"], + } + + # Second run: target alert is no longer in the unresolved list (others remain). + remaining = [a for a in check.activity_monitor._open_alerts.values() if a.get("extId") != target_ext_id] + mocker.patch.object(check.activity_monitor, '_list_alerts_unresolved', return_value=remaining) + # _get_alert returns the resolved metadata for the resolution event. 
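+ # (Production falls back to the cached pre-resolution alert when this lookup returns None; see the deletion test below.)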
+ mocker.patch.object(check.activity_monitor, '_get_alert', return_value=resolved_alert) + + dd_run_check(check) + + resolved_events = [e for e in aggregator.events if "ntnx_alert_status:resolved" in e.get("tags", [])] + assert len(resolved_events) == 1 + + event = resolved_events[0] + assert event["alert_type"] == "success" + assert event["aggregation_key"] == f"nutanix-alert-{target_ext_id}" + assert "Resolved by admin" in event["msg_text"] + + assert target_ext_id not in check.activity_monitor._open_alerts + + +@mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") +def test_alerts_still_open_no_duplicate_event( + get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get +): + """Two consecutive cycles with the same unresolved list emit no duplicate events.""" + instance = mock_instance.copy() + instance["collect_alerts"] = True + + get_current_datetime.return_value = MOCK_ALERT_DATETIME + + check = NutanixCheck('nutanix', {}, [instance]) + + dd_run_check(check) + aggregator.reset() + + dd_run_check(check) + + all_alert_events = [e for e in aggregator.events if "ntnx_type:alert" in e.get("tags", [])] + assert len(all_alert_events) == 0, "Should not emit any event when no alert state has changed" + + +# --- nutanix.alert.open metric emission --- + + +@mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") +def test_alert_open_metric_emitted_per_tracked_alert( + get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get +): + """One :1 emission per tracked alert, partitioned across .open and .acknowledged by state.""" + instance = mock_instance.copy() + instance["collect_alerts"] = True + + get_current_datetime.return_value = MOCK_ALERT_DATETIME + + check = NutanixCheck('nutanix', {}, [instance]) + dd_run_check(check) + + tracked = list(check.activity_monitor._open_alerts.values()) + expected_open = sum(1 for a in tracked if not a.get("isAcknowledged")) + expected_ack = sum(1 for a in tracked if a.get("isAcknowledged")) + assert expected_open + expected_ack > 0 + + open_ones = [m for m in aggregator.metrics("nutanix.alert.open") if m.value == 1] + ack_ones = [m for m in aggregator.metrics("nutanix.alert.acknowledged") if m.value == 1] + open_zeros = [m for m in aggregator.metrics("nutanix.alert.open") if m.value == 0] + ack_zeros = [m for m in aggregator.metrics("nutanix.alert.acknowledged") if m.value == 0] + + assert len(open_ones) == expected_open + assert len(ack_ones) == expected_ack + assert len(open_zeros) == 0 + assert len(ack_zeros) == 0 + + for m in open_ones + ack_ones: + assert any(t.startswith("ext_id:") for t in m.tags) + + +@mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") +def test_alert_open_metric_zero_on_resolution( + get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get, mocker +): + """nutanix.alert.open=0 is submitted exactly once when an open alert resolves.""" + instance = mock_instance.copy() + instance["collect_alerts"] = True + + get_current_datetime.return_value = MOCK_ALERT_DATETIME + + check = NutanixCheck('nutanix', {}, [instance]) + dd_run_check(check) + + target_ext_id = next(iter(check.activity_monitor._open_alerts)) + + aggregator.reset() + + resolved_alert = { + "$objectType": "monitoring.v4.serviceability.Alert", + "extId": target_ext_id, + "isResolved": True, + "resolvedTime": "2026-03-04T01:10:00.000000Z", + "resolvedByUsername": "admin", + "isAutoResolved": False, + "isAcknowledged": False, + "title": "Resolved Test Alert", 
+ "alertType": "A1031", + "severity": "WARNING", + "creationTime": "2026-03-04T00:46:29.532987Z", + "lastUpdatedTime": "2026-03-04T01:10:00.000000Z", + "classifications": ["Storage"], + "impactTypes": ["SYSTEM_INDICATOR"], + } + + # Pin target's prior state (open) so we know which metric receives the :0. + cached = check.activity_monitor._open_alerts[target_ext_id] + prior_state = "acknowledged" if cached.get("isAcknowledged") else "open" + + remaining = [a for a in check.activity_monitor._open_alerts.values() if a.get("extId") != target_ext_id] + mocker.patch.object(check.activity_monitor, '_list_alerts_unresolved', return_value=remaining) + mocker.patch.object(check.activity_monitor, '_get_alert', return_value=resolved_alert) + + dd_run_check(check) + + zero_metric_name = f"nutanix.alert.{prior_state}" + zero_metrics = [m for m in aggregator.metrics(zero_metric_name) if m.value == 0] + assert len(zero_metrics) == 1 + assert f"ext_id:{target_ext_id}" in zero_metrics[0].tags + + resolved_metrics = [ + m for m in aggregator.metrics("nutanix.alert.resolved") if m.value == 1 and f"ext_id:{target_ext_id}" in m.tags + ] + assert len(resolved_metrics) == 1 + + target_ones = [ + m + for name in ("nutanix.alert.open", "nutanix.alert.acknowledged") + for m in aggregator.metrics(name) + if m.value == 1 and f"ext_id:{target_ext_id}" in m.tags + ] + assert len(target_ones) == 0 + + +@mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") +def test_alert_open_metric_re_emitted_each_cycle( + get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get +): + """Gauges keep being submitted on subsequent cycles for still-tracked alerts.""" + instance = mock_instance.copy() + instance["collect_alerts"] = True + + get_current_datetime.return_value = MOCK_ALERT_DATETIME_AFTER_ALL + + check = NutanixCheck('nutanix', {}, [instance]) + + def total_ones(): + return sum( + 1 + for name in ("nutanix.alert.open", "nutanix.alert.acknowledged") + for m in aggregator.metrics(name) + if m.value == 1 + ) + + dd_run_check(check) + first_cycle_count = total_ones() + assert first_cycle_count > 0 + + aggregator.reset() + dd_run_check(check) + second_cycle_count = total_ones() + assert second_cycle_count == first_cycle_count + + +@mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") +def test_alert_state_transition_open_to_acknowledged( + get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get, mocker +): + """open->acknowledged: 0 to nutanix.alert.open and 1 to nutanix.alert.acknowledged.""" + instance = mock_instance.copy() + instance["collect_alerts"] = True + + get_current_datetime.return_value = MOCK_ALERT_DATETIME + + check = NutanixCheck('nutanix', {}, [instance]) + dd_run_check(check) + + # Pick an alert currently tracked as open (not acknowledged) so we can transition it + target_ext_id = next( + ext_id for ext_id, a in check.activity_monitor._open_alerts.items() if not a.get("isAcknowledged") + ) + + aggregator.reset() + + # Same unresolved set, but the target now has isAcknowledged=True + refreshed = [] + for ext_id, a in check.activity_monitor._open_alerts.items(): + if ext_id == target_ext_id: + updated = dict(a) + updated["isAcknowledged"] = True + updated["lastUpdatedTime"] = "2026-04-15T00:00:00.000000Z" + refreshed.append(updated) + else: + refreshed.append(a) + + mocker.patch.object(check.activity_monitor, '_list_alerts_unresolved', return_value=refreshed) + + dd_run_check(check) + + open_zeros = [ + m for m in 
aggregator.metrics("nutanix.alert.open") if m.value == 0 and f"ext_id:{target_ext_id}" in m.tags + ] + assert len(open_zeros) == 1 + + ack_ones = [ + m + for m in aggregator.metrics("nutanix.alert.acknowledged") + if m.value == 1 and f"ext_id:{target_ext_id}" in m.tags + ] + assert len(ack_ones) >= 1 + + open_ones = [ + m for m in aggregator.metrics("nutanix.alert.open") if m.value == 1 and f"ext_id:{target_ext_id}" in m.tags + ] + assert len(open_ones) == 0 + + +# --- Tier 1 tags + status-tag-free metrics --- + + +@mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") +def test_alert_event_carries_originating_cluster_and_user_defined_tags( + get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get +): + """Alert events carry ntnx_originating_cluster_name and ntnx_alert_user_defined tags.""" + instance = mock_instance.copy() + instance["collect_alerts"] = True + get_current_datetime.return_value = MOCK_ALERT_DATETIME + + check = NutanixCheck('nutanix', {}, [instance]) + dd_run_check(check) + + alerts = [e for e in aggregator.events if "ntnx_type:alert" in e.get("tags", [])] + assert len(alerts) > 0 + + # At least one alert exposes both cluster perspectives with distinct values + # (clusterUUID = managed cluster the alert is reported against, + # originatingClusterUUID = PC's own cluster federating the alert) + distinct_pairs = [ + e + for e in alerts + if any(t.startswith("ntnx_cluster_name:") for t in e["tags"]) + and any(t.startswith("ntnx_originating_cluster_name:") for t in e["tags"]) + ] + assert len(distinct_pairs) > 0, "Expected at least one alert with both cluster tags" + + for e in alerts: + assert any(t.startswith("ntnx_alert_user_defined:") for t in e["tags"]) + + +@mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") +def test_alert_event_carries_service_tag_when_available( + get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get, mocker +): + """ntnx_alert_service is emitted only when the alert has a serviceName.""" + instance = mock_instance.copy() + instance["collect_alerts"] = True + get_current_datetime.return_value = MOCK_ALERT_DATETIME + + check = NutanixCheck('nutanix', {}, [instance]) + synthetic = _fixture_alert( + "NTNX_IAMv2_Authn_Database_Connectivity_Error_Warning", + isResolved=False, + isAcknowledged=False, + ) + mocker.patch.object(check.activity_monitor, '_list_alerts_unresolved', return_value=[synthetic]) + dd_run_check(check) + + alerts = [e for e in aggregator.events if "ntnx_type:alert" in e.get("tags", [])] + assert len(alerts) == 1 + assert "ntnx_alert_service:IAMv2" in alerts[0]["tags"] + + +@mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") +def test_alert_state_metrics_do_not_carry_status_tag( + get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get +): + """The metric name encodes the state, so ntnx_alert_status is omitted from metric tags.""" + instance = mock_instance.copy() + instance["collect_alerts"] = True + get_current_datetime.return_value = MOCK_ALERT_DATETIME + + check = NutanixCheck('nutanix', {}, [instance]) + dd_run_check(check) + + for name in ("nutanix.alert.open", "nutanix.alert.acknowledged"): + for m in aggregator.metrics(name): + assert not any(t.startswith("ntnx_alert_status:") for t in m.tags), ( + f"{name} should not carry ntnx_alert_status tag (redundant with metric name)" + ) + + +# --- Edge cases: state transitions, filter changes, deletion, empty list --- + + 
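+# State machine exercised below: open <-> acknowledged while unresolved; either state moves to resolved (one-shot gauge) on resolution or deletion, and every transition emits an explicit 0 to the previous state's metric.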
+@mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") +def test_alert_open_metric_zero_on_resolution_from_acknowledged_state( + get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get, mocker +): + """Resolution from the acknowledged state closes nutanix.alert.acknowledged, not .open.""" + instance = mock_instance.copy() + instance["collect_alerts"] = True + get_current_datetime.return_value = MOCK_ALERT_DATETIME + + check = NutanixCheck('nutanix', {}, [instance]) + dd_run_check(check) + + # Specifically pick an alert tracked in the acknowledged state + target_ext_id = next(ext_id for ext_id, a in check.activity_monitor._open_alerts.items() if a.get("isAcknowledged")) + + aggregator.reset() + + resolved_alert = { + "extId": target_ext_id, + "isResolved": True, + "resolvedTime": "2026-03-04T01:10:00.000000Z", + "resolvedByUsername": "admin", + "isAutoResolved": False, + "isAcknowledged": True, + "title": "Resolved Acknowledged Alert", + "alertType": "A1031", + "severity": "WARNING", + } + + remaining = [a for a in check.activity_monitor._open_alerts.values() if a.get("extId") != target_ext_id] + mocker.patch.object(check.activity_monitor, '_list_alerts_unresolved', return_value=remaining) + mocker.patch.object(check.activity_monitor, '_get_alert', return_value=resolved_alert) + + dd_run_check(check) + + ack_zeros = [ + m + for m in aggregator.metrics("nutanix.alert.acknowledged") + if m.value == 0 and f"ext_id:{target_ext_id}" in m.tags + ] + assert len(ack_zeros) == 1 + + # No :0 on nutanix.alert.open for this ext_id (it never was in that state on this run) + open_zeros = [ + m for m in aggregator.metrics("nutanix.alert.open") if m.value == 0 and f"ext_id:{target_ext_id}" in m.tags + ] + assert len(open_zeros) == 0 + + resolved_metrics = [ + m for m in aggregator.metrics("nutanix.alert.resolved") if m.value == 1 and f"ext_id:{target_ext_id}" in m.tags + ] + assert len(resolved_metrics) == 1 + + +@mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") +def test_alert_state_transition_acknowledged_to_open( + get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get, mocker +): + """Un-acknowledging in Nutanix moves the alert back from .acknowledged to .open.""" + instance = mock_instance.copy() + instance["collect_alerts"] = True + get_current_datetime.return_value = MOCK_ALERT_DATETIME + + check = NutanixCheck('nutanix', {}, [instance]) + dd_run_check(check) + + target_ext_id = next(ext_id for ext_id, a in check.activity_monitor._open_alerts.items() if a.get("isAcknowledged")) + + aggregator.reset() + + # Same unresolved set, but the target is now un-acknowledged + refreshed = [] + for ext_id, a in check.activity_monitor._open_alerts.items(): + if ext_id == target_ext_id: + updated = dict(a) + updated["isAcknowledged"] = False + refreshed.append(updated) + else: + refreshed.append(a) + + mocker.patch.object(check.activity_monitor, '_list_alerts_unresolved', return_value=refreshed) + + dd_run_check(check) + + ack_zeros = [ + m + for m in aggregator.metrics("nutanix.alert.acknowledged") + if m.value == 0 and f"ext_id:{target_ext_id}" in m.tags + ] + assert len(ack_zeros) == 1 + + open_ones = [ + m for m in aggregator.metrics("nutanix.alert.open") if m.value == 1 and f"ext_id:{target_ext_id}" in m.tags + ] + assert len(open_ones) >= 1 + + # Target should not have a :1 on .acknowledged this cycle + ack_ones_target = [ + m + for m in aggregator.metrics("nutanix.alert.acknowledged") + if m.value == 1 and 
f"ext_id:{target_ext_id}" in m.tags + ] + assert len(ack_ones_target) == 0 + + +@mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") +def test_alert_filter_excludes_tracked_alert_emits_spurious_resolution( + get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get, mocker +): + """A tracked alert that becomes filter-excluded mid-life is currently treated as resolved. + + Documents existing behavior: when a resource_filter is added that excludes a + previously-tracked alert, reconciliation sees it disappear from api_alerts and + emits a resolution event + nutanix.alert.resolved=1 — even though Prism Central + still has the alert open. Operators changing filter rules should be aware. + """ + instance = mock_instance.copy() + instance["collect_alerts"] = True + get_current_datetime.return_value = MOCK_ALERT_DATETIME + + check = NutanixCheck('nutanix', {}, [instance]) + dd_run_check(check) + + target_ext_id = next(iter(check.activity_monitor._open_alerts)) + + aggregator.reset() + + # Simulate adding a filter that excludes the target ext_id mid-life + original = check.activity_monitor._should_collect_activity_item + mocker.patch.object( + check.activity_monitor, + '_should_collect_activity_item', + side_effect=lambda item, kind: item.get("extId") != target_ext_id and original(item, kind), + ) + + dd_run_check(check) + + resolved_events = [ + e + for e in aggregator.events + if "ntnx_alert_status:resolved" in e.get("tags", []) and f"ext_id:{target_ext_id}" in e.get("tags", []) + ] + assert len(resolved_events) == 1 + assert target_ext_id not in check.activity_monitor._open_alerts + + resolved_metrics = [ + m for m in aggregator.metrics("nutanix.alert.resolved") if m.value == 1 and f"ext_id:{target_ext_id}" in m.tags + ] + assert len(resolved_metrics) == 1 + + +@mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") +def test_alerts_collection_empty_unresolved_list( + get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get, mocker +): + """Cold start with no open alerts in Prism Central is a clean no-op.""" + instance = mock_instance.copy() + instance["collect_alerts"] = True + get_current_datetime.return_value = MOCK_ALERT_DATETIME + + check = NutanixCheck('nutanix', {}, [instance]) + mocker.patch.object(check.activity_monitor, '_list_alerts_unresolved', return_value=[]) + dd_run_check(check) + + assert check.activity_monitor.alerts_count == 0 + assert check.activity_monitor._open_alerts == {} + assert [e for e in aggregator.events if "ntnx_type:alert" in e.get("tags", [])] == [] + for name in ("nutanix.alert.open", "nutanix.alert.acknowledged", "nutanix.alert.resolved"): + assert list(aggregator.metrics(name)) == [] + + +@mock.patch("datadog_checks.nutanix.activity_monitor.get_current_datetime") +def test_alert_resolution_with_no_metadata_when_alert_deleted( + get_current_datetime, dd_run_check, aggregator, mock_instance, mock_http_get, mocker +): + """An alert deleted (not resolved) in Nutanix: _get_alert returns None; falls back to cached metadata. + + Exercises the bare 'Resolved' msg_text fallback (no resolvedByUsername, not auto-resolved) + and the cached-tags path when the API can no longer return the alert. 
+ """ + instance = mock_instance.copy() + instance["collect_alerts"] = True + get_current_datetime.return_value = MOCK_ALERT_DATETIME + + check = NutanixCheck('nutanix', {}, [instance]) + dd_run_check(check) + + target_ext_id = next(iter(check.activity_monitor._open_alerts)) + cached_state = check.activity_monitor._open_alerts[target_ext_id].copy() + prior_state = "acknowledged" if cached_state.get("isAcknowledged") else "open" + + aggregator.reset() + + remaining = [a for a in check.activity_monitor._open_alerts.values() if a.get("extId") != target_ext_id] + mocker.patch.object(check.activity_monitor, '_list_alerts_unresolved', return_value=remaining) + mocker.patch.object(check.activity_monitor, '_get_alert', return_value=None) + + dd_run_check(check) + + resolution_events = [ + e + for e in aggregator.events + if "ntnx_alert_status:resolved" in e.get("tags", []) and f"ext_id:{target_ext_id}" in e.get("tags", []) + ] + assert len(resolution_events) == 1 + # No resolvedByUsername and isAutoResolved defaults to False on the cached alert, + # so the message falls through to the bare "Resolved" branch. + assert resolution_events[0]["msg_text"] == "Resolved" + assert "ntnx_alert_auto_resolved:false" in resolution_events[0]["tags"] + + zero_metrics = [ + m + for m in aggregator.metrics(f"nutanix.alert.{prior_state}") + if m.value == 0 and f"ext_id:{target_ext_id}" in m.tags + ] + assert len(zero_metrics) == 1 + + resolved_metrics = [ + m for m in aggregator.metrics("nutanix.alert.resolved") if m.value == 1 and f"ext_id:{target_ext_id}" in m.tags + ] + assert len(resolved_metrics) == 1 diff --git a/nutanix/tests/test_clusters.py b/nutanix/tests/test_clusters.py index 82315ad8cb3cf..22ed28f582164 100644 --- a/nutanix/tests/test_clusters.py +++ b/nutanix/tests/test_clusters.py @@ -67,7 +67,9 @@ def test_entity_counts(dd_run_check, mock_instance, mock_http_get): assert activity.events_count == 0 assert activity.tasks_count == 0 assert activity.audits_count == 0 - assert activity.alerts_count == 0 + # First run fetches all unresolved alerts (no time filter), so the count + # reflects every isResolved=false alert in the fixture. 
+ assert activity.alerts_count == 5 def test_summary_log_message(dd_run_check, mock_instance, mock_http_get, caplog): @@ -75,7 +77,7 @@ def test_summary_log_message(dd_run_check, mock_instance, mock_http_get, caplog) with caplog.at_level(logging.INFO): dd_run_check(check) - expected = "[PC:10.0.0.197] Check completed: 2 clusters, 2 hosts, 4 VMs, 0 events, 0 tasks, 0 audits, 0 alerts" + expected = "[PC:10.0.0.197] Check completed: 2 clusters, 2 hosts, 4 VMs, 0 events, 0 tasks, 0 audits, 5 alerts" summary_lines = [r.message for r in caplog.records if "Check completed" in r.message] assert len(summary_lines) == 1 assert summary_lines[0] == expected diff --git a/nutanix/tests/test_metadata.py b/nutanix/tests/test_metadata.py index 98a063650008a..9d71cf52abeaf 100644 --- a/nutanix/tests/test_metadata.py +++ b/nutanix/tests/test_metadata.py @@ -8,6 +8,8 @@ from datadog_checks.nutanix import NutanixCheck from .metrics import ( + ALERT_METRICS, + ALERT_METRICS_OPTIONAL, CLUSTER_BASIC_METRICS, CLUSTER_CAPACITY_METRICS, CLUSTER_STATS_METRICS_OPTIONAL, @@ -25,6 +27,7 @@ REQUIRED_METRICS = ( HEALTH_METRICS + + ALERT_METRICS + CLUSTER_BASIC_METRICS + CLUSTER_CAPACITY_METRICS + CLUSTER_STATS_METRICS_REQUIRED @@ -36,7 +39,9 @@ + VM_STATS_METRICS_REQUIRED ) -OPTIONAL_METRICS = CLUSTER_STATS_METRICS_OPTIONAL + HOST_STATS_METRICS_OPTIONAL + VM_STATS_METRICS_OPTIONAL +OPTIONAL_METRICS = ( + ALERT_METRICS_OPTIONAL + CLUSTER_STATS_METRICS_OPTIONAL + HOST_STATS_METRICS_OPTIONAL + VM_STATS_METRICS_OPTIONAL +) @pytest.mark.unit