Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions nutanix/assets/monitors/alerts.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"version": 2,
"created_at": "2026-04-29",
"last_updated_at": "2026-04-29",
"title": "Nutanix alert is open",
"description": "Notifies when a Nutanix alert is unresolved (open or acknowledged) and auto-resolves when the alert is marked resolved in Prism Central. Each Nutanix alert produces its own monitor case via the `ext_id` grouping facet.",
"definition": {
"name": "{{#is_alert}}OPEN{{/is_alert}}{{#is_recovery}}RESOLVED{{/is_recovery}} [Nutanix {{ntnx_alert_severity.name}}] [Impact: {{ntnx_alert_impact.name}}] [PC: {{ntnx_cluster_name.name}}] - ID: {{ext_id.name}}",
"type": "query alert",
"query": "avg(last_15m):clamp_min(default_zero(avg:nutanix.alert.open{*} by {ext_id,ntnx_alert_impact,ntnx_cluster_name,ntnx_alert_severity,prism_central,ntnx_alert_type}) - default_zero(avg:nutanix.alert.resolved{*} by {ext_id,ntnx_alert_impact,ntnx_cluster_name,ntnx_alert_severity,prism_central,ntnx_alert_type}), 0) > 0",
"message": "@your-team\n\n{{#is_alert}}A Nutanix alert is currently open and requires attention.{{/is_alert}}\n{{#is_recovery}}The Nutanix alert has been resolved.{{/is_recovery}}\n\n**Alert details**\n- **Nutanix Alert ID (extId):** `{{ext_id.name}}`\n- **Severity:** `{{ntnx_alert_severity.name}}`\n- **Impact:** `{{ntnx_alert_impact.name}}`\n- **Type:** `{{ntnx_alert_type.name}}`\n- **Cluster:** `{{ntnx_cluster_name.name}}`",
"tags": ["integration:nutanix"],
"options": {
"thresholds": {
"critical": 0
},
"notify_audit": false,
"on_missing_data": "default",
"include_tags": false,
"new_group_delay": 60,
"renotify_interval": 0,
"escalation_message": "",
"notify_by": ["ext_id"]
}
},
"tags": ["integration:nutanix"]
}
1 change: 1 addition & 0 deletions nutanix/changelog.d/23489.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Submit `nutanix.alert.open` gauge per Nutanix alert (1 while open, 0 once on resolution) for individual alert lifecycle monitoring.
1 change: 1 addition & 0 deletions nutanix/changelog.d/23501.fixed
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Reconcile open alerts against the unresolved-alerts API on every check cycle instead of relying on a persistent cache. Eliminates orphaned `nutanix.alert.open` metric emissions when a resolution is missed (e.g. agent downtime spanning a resolution, or alert deletion).
1 change: 1 addition & 0 deletions nutanix/changelog.d/23508.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Split alert lifecycle gauge into per-state metrics: `nutanix.alert.open` (unacknowledged + unresolved), `nutanix.alert.acknowledged` (acknowledged + unresolved), and `nutanix.alert.resolved` (one-shot on resolution detection). State transitions emit an explicit 0 to the previous state's metric.
1 change: 1 addition & 0 deletions nutanix/changelog.d/23511.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add `ntnx_originating_cluster_name`, `ntnx_alert_user_defined`, and `ntnx_alert_service` (when present) tags to alert events and metrics. Drop redundant `ntnx_alert_status` tag from per-state metrics (state is encoded in the metric name).
2 changes: 1 addition & 1 deletion nutanix/datadog_checks/nutanix/__about__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# (C) Datadog, Inc. 2026-present
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)
__version__ = '1.1.0'
__version__ = '1.2.0+dev'
215 changes: 157 additions & 58 deletions nutanix/datadog_checks/nutanix/activity_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
from datadog_checks.nutanix.check import NutanixCheck


_SEVERITY_TO_ALERT_TYPE = {"CRITICAL": "error", "WARNING": "warning", "INFO": "info"}


class _SafeDict(dict):
"""Dict that returns missing keys as template placeholders for safe string formatting."""

Expand All @@ -31,7 +34,6 @@ def __init__(self, check: NutanixCheck):
self.last_event_collection_time = self.check.read_persistent_cache("last_event_collection_time")
self.last_task_collection_time = self.check.read_persistent_cache("last_task_collection_time")
self.last_audit_collection_time = self.check.read_persistent_cache("last_audit_collection_time")
self.last_alert_collection_time = self.check.read_persistent_cache("last_alert_collection_time")
# In-memory caches: id -> raw item (reset each check run)
self.events: dict[str, dict] = {}
self.audits: dict[str, dict] = {}
Expand All @@ -51,6 +53,9 @@ def __init__(self, check: NutanixCheck):
else:
self.alerts_v42_supported = None

# extId -> last-seen alert dict; reconciled each cycle against the unresolved-alerts API.
self._open_alerts: dict[str, dict] = {}

def reset_state(self) -> None:
"""Reset in-memory caches and counters for a new collection run."""
self.events = {}
Expand Down Expand Up @@ -191,16 +196,46 @@ def collect_audits(self) -> None:
)

def collect_alerts(self) -> None:
self.alerts_count = self._safe_collect(
"alert",
lambda: self._collect(
activity_kind="alert",
list_fn=self._list_alerts,
process_fn=self._process_alert,
time_field="creationTime",
cache_key="last_alert_collection_time",
),
)
self.alerts_count = self._safe_collect("alert", self._reconcile_alerts)

@staticmethod
def _alert_state(alert: dict) -> str:
return "acknowledged" if alert.get("isAcknowledged") else "open"

def _submit_state_metric(self, alert: dict, state: str, value: int) -> None:
    """Emit the per-state lifecycle gauge ``nutanix.alert.<state>`` for one alert.

    The lifecycle state is encoded in the metric name rather than in a tag;
    tags carry the alert's identity (ext_id, severity, cluster, ...).
    """
    metric_tags = self._build_alert_tags(alert)
    self.check.gauge(f"alert.{state}", value, tags=metric_tags)

def _reconcile_alerts(self) -> int:
    """Reconcile open alerts against the unresolved-alerts API and emit per-state metrics.

    The unresolved-alerts API response is treated as the source of truth each
    cycle; `self._open_alerts` (extId -> last-seen alert dict) is the in-memory
    view from the previous cycle. Returns the number of newly-seen plus
    newly-resolved alerts (state *transitions* between open/acknowledged are
    not counted).
    """
    alerts = self._list_alerts_unresolved()
    # NOTE(review): alerts dropped by this filter while cached in _open_alerts
    # will be treated as resolved below — confirm that is the intended behavior.
    alerts = [a for a in alerts if self._should_collect_activity_item(a, "alert")]
    # Index by extId; alerts without an extId cannot be tracked and are skipped.
    api_alerts = {a.get("extId"): a for a in alerts if a.get("extId")}

    count = 0
    # New alerts: present in the API but not yet cached -> cache and emit the
    # "alert opened" event (the state gauge is emitted in the final loop).
    for ext_id in api_alerts.keys() - self._open_alerts.keys():
        alert = api_alerts[ext_id]
        self._open_alerts[ext_id] = alert
        self._process_alert(alert)
        count += 1

    # Disappeared alerts: cached but no longer unresolved -> resolved (or
    # deleted). Prefer the freshly fetched alert for resolution details,
    # falling back to the cached copy; the cached copy supplies the tag set.
    for ext_id in self._open_alerts.keys() - api_alerts.keys():
        cached = self._open_alerts.pop(ext_id)
        self._emit_resolution_event(self._get_alert(ext_id) or cached, cached_tags_alert=cached)
        count += 1

    # Still-open alerts: detect open <-> acknowledged transitions. A 0 is
    # emitted to the previous state's gauge so its series closes out, then the
    # cache is refreshed with the latest alert payload.
    for ext_id in api_alerts.keys() & self._open_alerts.keys():
        old_alert = self._open_alerts[ext_id]
        new_alert = api_alerts[ext_id]
        old_state = self._alert_state(old_alert)
        if old_state != self._alert_state(new_alert):
            self._submit_state_metric(old_alert, old_state, 0)
        self._open_alerts[ext_id] = new_alert

    # Heartbeat: every currently-unresolved alert gets a 1 on its current
    # state's gauge each cycle.
    for alert in self._open_alerts.values():
        self._submit_state_metric(alert, self._alert_state(alert), 1)

    return count

def _list_activity(self, endpoint: str, time_field: str, start_time_str: str) -> list[dict]:
"""Fetch activity items from Prism Central."""
Expand All @@ -210,41 +245,52 @@ def _list_activity(self, endpoint: str, time_field: str, start_time_str: str) ->
}
return self.check._get_paginated_request_data(endpoint, params=params)

def _list_alerts(self, start_time_str: str) -> list[dict]:
"""Fetch alerts from Prism Central with v4.2/v4.0 fallback."""
def _list_alerts_unresolved(self) -> list[dict]:
"""Fetch all currently-unresolved alerts (the source of truth each cycle)."""
params = {
"$filter": f"creationTime gt {start_time_str}",
"$orderBy": "creationTime asc",
"$filter": "isResolved eq false",
"$orderBy": "lastUpdatedTime asc",
}
return self._list_alerts_with_fallback(params, fallback_filter=lambda a: not a.get("isResolved"))

def _list_alerts_with_fallback(
self,
params: dict,
fallback_filter: Callable[[dict], bool] | None = None,
) -> list[dict]:
"""Fetch alerts with v4.2/v4.0 fallback."""
if self.alerts_v42_supported is False:
self.check.log.debug("[%s] Using alerts API v4.0 (v4.2 not supported)", self._pc_label)
del params["$filter"]
return self.check._get_paginated_request_data("api/monitoring/v4.0/serviceability/alerts", params=params)
v40_params = {"$orderBy": params.get("$orderBy", "lastUpdatedTime asc")}
alerts = self.check._get_paginated_request_data(
"api/monitoring/v4.0/serviceability/alerts", params=v40_params
)
if fallback_filter:
alerts = [a for a in alerts if fallback_filter(a)]
return alerts

try:
self.check.log.debug("[%s] Attempting to use alerts API v4.2", self._pc_label)
result = self.check._get_paginated_request_data("api/monitoring/v4.2/serviceability/alerts", params=params)
if self.alerts_v42_supported is None:
self.check.log.debug(
"[%s] Alerts API v4.2 is supported, caching for future use",
self._pc_label,
)
self.check.log.debug("[%s] Alerts API v4.2 is supported, caching for future use", self._pc_label)
self.alerts_v42_supported = True
self.check.write_persistent_cache("alerts_v42_supported", "True")
return result
except HTTPError as e:
if e.response is not None and e.response.status_code == 404:
self.check.log.debug(
"[%s] Alerts API v4.2 not supported, falling back to v4.0 permanently",
self._pc_label,
"[%s] Alerts API v4.2 not supported, falling back to v4.0 permanently", self._pc_label
)
self.alerts_v42_supported = False
self.check.write_persistent_cache("alerts_v42_supported", "False")
del params["$filter"]
return self.check._get_paginated_request_data(
"api/monitoring/v4.0/serviceability/alerts", params=params
v40_params = {"$orderBy": params.get("$orderBy", "lastUpdatedTime asc")}
alerts = self.check._get_paginated_request_data(
"api/monitoring/v4.0/serviceability/alerts", params=v40_params
)
if fallback_filter:
alerts = [a for a in alerts if fallback_filter(a)]
return alerts
raise

def _get_alert(self, alert_ext_id: str) -> dict | None:
Expand Down Expand Up @@ -415,44 +461,50 @@ def _render_message(self, message: str, parameters: list[dict]) -> str:
self.check.log.debug("Failed to render alert message template: %s", e)
return message

def _build_alert_tags(self, alert: dict, status: str | None = None) -> list[str]:
    """Assemble the tag list for a Nutanix alert.

    Pass ``status`` only when tagging events; per-state metrics encode the
    state in the metric name instead of an ``ntnx_alert_status`` tag.
    """
    tags = self.check.base_tags.copy()

    # Simple string fields that map 1:1 onto a tag when present and truthy.
    for field, tag_key in (
        ("extId", "ext_id"),
        ("alertType", "ntnx_alert_type"),
        ("severity", "ntnx_alert_severity"),
    ):
        if value := alert.get(field):
            tags.append(f"{tag_key}:{value}")

    # Boolean flag: tagged whenever present (including False), lower-cased.
    user_defined = alert.get("isUserDefined")
    if user_defined is not None:
        tags.append(f"ntnx_alert_user_defined:{str(user_defined).lower()}")
    if service_name := alert.get("serviceName"):
        tags.append(f"ntnx_alert_service:{service_name}")

    self._add_cluster_name_tag(tags, alert.get("clusterUUID"))
    self._add_cluster_name_tag(
        tags, alert.get("originatingClusterUUID"), tag_name="ntnx_originating_cluster_name"
    )

    # Multi-valued fields: one tag per entry; `or []` guards explicit nulls.
    for classification in alert.get("classifications", []) or []:
        tags.append(f"ntnx_alert_classification:{classification}")
    for impact in alert.get("impactTypes", []) or []:
        tags.append(f"ntnx_alert_impact:{impact}")

    self._add_source_entity_tags(tags, alert)

    tags.append("ntnx_type:alert")
    if status is not None:
        tags.append(f"ntnx_alert_status:{status}")
    return tags

def _process_alert(self, alert: dict) -> None:
"""Process and send a single alert to Datadog."""
ext_id = alert.get("extId", "")
title = alert.get("title", "Nutanix Alert")
message = alert.get("message", "")
created_time = alert.get("creationTime")
severity = alert.get("severity")
alert_type = alert.get("alertType")
is_acknowledged = alert.get("isAcknowledged", False)

# Render template variables in title and message from parameters
if parameters := alert.get("parameters"):
title = self._render_message(title, parameters)
message = self._render_message(message, parameters)

# map severity to alert_type
severity_map = {
"CRITICAL": "error",
"WARNING": "warning",
"INFO": "info",
}
event_alert_type = severity_map.get(severity, "info")

alert_tags = self.check.base_tags.copy()
if alert_type:
alert_tags.append(f"ntnx_alert_type:{alert_type}")
if severity:
alert_tags.append(f"ntnx_alert_severity:{severity}")

self._add_cluster_name_tag(alert_tags, alert.get("clusterUUID"))

for classification in alert.get("classifications", []) or []:
alert_tags.append(f"ntnx_alert_classification:{classification}")

for impact in alert.get("impactTypes", []) or []:
alert_tags.append(f"ntnx_alert_impact:{impact}")

self._add_source_entity_tags(alert_tags, alert)

alert_tags.append("ntnx_type:alert")
# Acknowledged alerts soften to "warning" regardless of severity (operator already triaging).
event_alert_type = "warning" if is_acknowledged else _SEVERITY_TO_ALERT_TYPE.get(alert.get("severity"), "info")
alert_tags = self._build_alert_tags(alert, "acknowledged" if is_acknowledged else "open")

self.check.event(
{
Expand All @@ -465,9 +517,50 @@ def _process_alert(self, alert: dict) -> None:
"alert_type": event_alert_type,
"source_type_name": self.check.__NAMESPACE__,
"tags": alert_tags,
"aggregation_key": f"nutanix-alert-{ext_id}",
}
)

def _emit_resolution_event(self, alert: dict, cached_tags_alert: dict | None = None) -> None:
    """Emit resolution event, close prev-state gauge, and emit nutanix.alert.resolved=1.

    ``alert`` supplies the resolution details (title, resolvedTime, resolver);
    ``cached_tags_alert`` is the last-seen cached copy of the alert, used for
    the metric tag set and previous-state lookup — presumably so the closing 0
    carries the same tags as the 1s emitted while the alert was open (series
    continuity); confirm against the tag-cardinality expectations.
    """
    ext_id = alert.get("extId", "")
    title = alert.get("title", "Nutanix Alert")
    resolved_time = alert.get("resolvedTime")
    resolved_by = alert.get("resolvedByUsername")
    is_auto_resolved = alert.get("isAutoResolved", False)

    # Alert titles may contain template placeholders filled from `parameters`.
    if parameters := alert.get("parameters"):
        title = self._render_message(title, parameters)

    # Auto-resolution wins over the resolver username; plain "Resolved" is the fallback.
    msg_text = "Auto-resolved" if is_auto_resolved else f"Resolved by {resolved_by}" if resolved_by else "Resolved"

    # Metric tags come from the cached (pre-resolution) alert when available.
    prev_alert = cached_tags_alert or alert
    prev_state = self._alert_state(prev_alert)
    metric_tags = self._build_alert_tags(prev_alert)
    # Events additionally carry the resolved status and auto-resolved flag.
    event_tags = [
        *metric_tags,
        "ntnx_alert_status:resolved",
        f"ntnx_alert_auto_resolved:{str(is_auto_resolved).lower()}",
    ]

    self.check.event(
        {
            # Prefer Prism Central's resolution time; fall back to "now".
            "timestamp": self._parse_timestamp(resolved_time)
            if resolved_time
            else get_timestamp(get_current_datetime()),
            "event_type": self.check.__NAMESPACE__,
            "msg_title": f"Alert Resolved: {title}",
            "msg_text": msg_text,
            "alert_type": "success",
            "source_type_name": self.check.__NAMESPACE__,
            # Same aggregation key as the open event so they correlate in the UI.
            "aggregation_key": f"nutanix-alert-{ext_id}",
            "tags": event_tags,
        }
    )

    # Close the previous state's series and fire the one-shot resolved signal.
    self.check.gauge(f"alert.{prev_state}", 0, tags=metric_tags)
    self.check.gauge("alert.resolved", 1, tags=metric_tags)

def _process_task(self, task: dict) -> None:
"""Process and send a single task to Datadog as an event."""
task_operation = task.get("operation", "Nutanix Task")
Expand Down Expand Up @@ -554,14 +647,20 @@ def _add_source_entity_tags(self, tags: list[str], item: dict) -> None:
if entity_name := source_entity.get("name"):
tags.append(f"ntnx_{entity_type}_name:{entity_name}")

def _add_cluster_name_tag(self, tags: list[str], cluster_id: str | None, fallback_name: str | None = None) -> None:
"""Add cluster name tag from ID lookup, with optional fallback."""
def _add_cluster_name_tag(
self,
tags: list[str],
cluster_id: str | None,
fallback_name: str | None = None,
tag_name: str = "ntnx_cluster_name",
) -> None:
"""Add a cluster name tag from ID lookup, with optional fallback."""
if not cluster_id:
return
if cluster_id in self.check.cluster_names:
tags.append(f"ntnx_cluster_name:{self.check.cluster_names[cluster_id]}")
tags.append(f"{tag_name}:{self.check.cluster_names[cluster_id]}")
elif fallback_name:
tags.append(f"ntnx_cluster_name:{fallback_name}")
tags.append(f"{tag_name}:{fallback_name}")

def _cluster_resource(self, cluster_id: str) -> tuple[str, dict]:
"""Build a cluster resource tuple from a cluster ID."""
Expand Down
4 changes: 3 additions & 1 deletion nutanix/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@
"Nutanix - Overview": "assets/dashboards/nutanix_overview.json",
"Nutanix - Activity Monitoring": "assets/dashboards/nutanix_activity_monitoring.json"
},
"monitors": {},
"monitors": {
"Nutanix alert is open": "assets/monitors/alerts.json"
},
"saved_views": {}
},
"author": {
Expand Down
3 changes: 3 additions & 0 deletions nutanix/metadata.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name,curated_metric,sample_tags
nutanix.alert.acknowledged,gauge,,,,1 while a Nutanix alert is acknowledged but not yet resolved; 0 emitted once when leaving the acknowledged state. Tagged per-alert via ext_id.,0,nutanix,alert acknowledged,,ext_id
nutanix.alert.open,gauge,,,,1 while a Nutanix alert is unresolved and unacknowledged; 0 emitted once when leaving the open state (acknowledged or resolved). Tagged per-alert via ext_id.,0,nutanix,alert open,,ext_id
nutanix.alert.resolved,gauge,,,,1 emitted once when a Nutanix alert is detected as resolved or deleted. One-shot signal useful for resolution-rate dashboards. Tagged per-alert via ext_id.,0,nutanix,alert resolved,,ext_id
nutanix.api.rate_limited,count,,,,Count of HTTP 429 rate limit responses from the Prism Central API.,0,nutanix,rate_limited,,
nutanix.cluster.aggregate_hypervisor.memory_usage,gauge,,,,Total memory usage across all hypervisors in the cluster.,0,nutanix,usage,,
nutanix.cluster.controller.avg_io_latency,gauge,,,,Average I/O latency of the cluster storage controller.,0,nutanix,latency,,
Expand Down
Loading
Loading