diff --git a/datadog_checks_base/changelog.d/23572.added b/datadog_checks_base/changelog.d/23572.added new file mode 100644 index 0000000000000..add49c0341158 --- /dev/null +++ b/datadog_checks_base/changelog.d/23572.added @@ -0,0 +1 @@ +Add Service/Port types and probe helpers (http_probe, tcp_probe, candidate_ports, verifier predicates) under datadog_checks.base.utils.discovery for advanced auto-config. \ No newline at end of file diff --git a/datadog_checks_base/changelog.d/23576.added b/datadog_checks_base/changelog.d/23576.added new file mode 100644 index 0000000000000..41ccc42d8d2e5 --- /dev/null +++ b/datadog_checks_base/changelog.d/23576.added @@ -0,0 +1 @@ +Add discover() rtloader bridge helper for advanced auto-config. \ No newline at end of file diff --git a/datadog_checks_base/datadog_checks/base/utils/discovery/__init__.pyi b/datadog_checks_base/datadog_checks/base/utils/discovery/__init__.pyi index 8f2479a6505cb..0fc56669ac3ef 100644 --- a/datadog_checks_base/datadog_checks/base/utils/discovery/__init__.pyi +++ b/datadog_checks_base/datadog_checks/base/utils/discovery/__init__.pyi @@ -1,6 +1,35 @@ # (C) Datadog, Inc. 
2025-present # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) +from ._bridge import _run_discover from .discovery import Discovery +from .http import http_probe +from .ports import candidate_ports +from .service import Port, Service +from .tcp import tcp_probe +from .verifiers import ( + body_contains, + body_matches, + is_prometheus_exposition, + json_has, + response_equals, + response_starts_with, + status_2xx, +) -__all__ = ['Discovery'] +__all__ = [ + 'Discovery', + 'Port', + 'Service', + '_run_discover', + 'body_contains', + 'body_matches', + 'candidate_ports', + 'http_probe', + 'is_prometheus_exposition', + 'json_has', + 'response_equals', + 'response_starts_with', + 'status_2xx', + 'tcp_probe', +] diff --git a/datadog_checks_base/datadog_checks/base/utils/discovery/_bridge.py b/datadog_checks_base/datadog_checks/base/utils/discovery/_bridge.py new file mode 100644 index 0000000000000..677ce6d659ef5 --- /dev/null +++ b/datadog_checks_base/datadog_checks/base/utils/discovery/_bridge.py @@ -0,0 +1,56 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +"""Bridge entry point invoked from the Agent's rtloader to run a check class's +``discover(service)`` method. + +The Agent serializes the listeners.Service projection to JSON, calls this +function with the check class, and receives a JSON string in return: + +- ``"null"`` — discover returned None, raised, or the class has no discover(). +- ``"[]"`` — discover explicitly returned an empty list. +- ``"[{...}, {...}]"`` — one entry per resolved instance config. +""" +import json +import logging +from typing import Any + +from .service import Port, Service + +_log = logging.getLogger(__name__) + + +def _run_discover(check_class: Any, service_json: str) -> str: + """Run the discover() classmethod and return the JSON-encoded result. + + Never raises — any error is caught, logged, and returned as ``"null"``. 
+ """ + try: + payload = json.loads(service_json) + ports = tuple( + Port(number=int(p["number"]), name=p.get("name", "")) + for p in payload.get("ports", []) + ) + service = Service(id=payload["id"], host=payload["host"], ports=ports) + except Exception: + _log.exception("discover bridge: failed to parse service payload") + return "null" + + discover = getattr(check_class, "discover", None) + if discover is None: + return "null" + + try: + result = discover(service) + except Exception: + _log.exception("discover bridge: %s.discover raised", getattr(check_class, "__name__", "?")) + return "null" + + if result is None: + return "null" + + try: + return json.dumps(list(result)) + except (TypeError, ValueError): + _log.exception("discover bridge: %s.discover returned non-JSON-serializable", check_class) + return "null" diff --git a/datadog_checks_base/datadog_checks/base/utils/discovery/http.py b/datadog_checks_base/datadog_checks/base/utils/discovery/http.py new file mode 100644 index 0000000000000..2b1072d965126 --- /dev/null +++ b/datadog_checks_base/datadog_checks/base/utils/discovery/http.py @@ -0,0 +1,33 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from collections.abc import Callable + +import requests + + +def http_probe( + host: str, + port: int, + path: str, + *, + verifier: Callable[[requests.Response], bool], + timeout: float = 0.5, +) -> bool: + """Perform a single GET probe and apply the verifier. + + Returns True iff the request completed and the verifier accepted the + response. All network exceptions yield False (probes are best-effort). + + The ``host`` is used verbatim in the URL — IPv6 hosts must already be + bracketed by the caller (the Agent-side bridge handles this). 
+ """ + url = f"http://{host}:{port}{path}" + try: + response = requests.get(url, timeout=timeout) + except requests.RequestException: + return False + try: + return bool(verifier(response)) + finally: + response.close() diff --git a/datadog_checks_base/datadog_checks/base/utils/discovery/ports.py b/datadog_checks_base/datadog_checks/base/utils/discovery/ports.py new file mode 100644 index 0000000000000..6150a54c98d7b --- /dev/null +++ b/datadog_checks_base/datadog_checks/base/utils/discovery/ports.py @@ -0,0 +1,23 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from collections.abc import Iterable, Iterator + +from .service import Port, Service + + +def candidate_ports(service: Service, hints: Iterable[int]) -> Iterator[Port]: + """Yield ports to probe for a service, hint-first then remaining. + + Hints not exposed by the service are skipped; duplicates are collapsed. + """ + by_number = {p.number: p for p in service.ports} + seen: set[int] = set() + for h in hints: + if h in by_number and h not in seen: + seen.add(h) + yield by_number[h] + for p in service.ports: + if p.number not in seen: + seen.add(p.number) + yield p diff --git a/datadog_checks_base/datadog_checks/base/utils/discovery/service.py b/datadog_checks_base/datadog_checks/base/utils/discovery/service.py new file mode 100644 index 0000000000000..474b4b38046b3 --- /dev/null +++ b/datadog_checks_base/datadog_checks/base/utils/discovery/service.py @@ -0,0 +1,17 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from dataclasses import dataclass, field + + +@dataclass(frozen=True) +class Port: + number: int + name: str = "" + + +@dataclass(frozen=True) +class Service: + id: str + host: str + ports: tuple[Port, ...] 
= field(default_factory=tuple) diff --git a/datadog_checks_base/datadog_checks/base/utils/discovery/tcp.py b/datadog_checks_base/datadog_checks/base/utils/discovery/tcp.py new file mode 100644 index 0000000000000..9099514b288ab --- /dev/null +++ b/datadog_checks_base/datadog_checks/base/utils/discovery/tcp.py @@ -0,0 +1,44 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +import socket +from collections.abc import Callable + +_DEFAULT_READ_MAX = 4096 + + +def tcp_probe( + host: str, + port: int, + *, + send: bytes = b"", + verifier: Callable[[bytes], bool], + timeout: float = 0.5, + read_max: int = _DEFAULT_READ_MAX, +) -> bool: + """Open a TCP connection, optionally send bytes, read up to ``read_max``, + and apply the verifier. + + Returns True iff the connection succeeded and the verifier accepted the + bytes received within the timeout. All socket errors yield False. + """ + try: + with socket.create_connection((host, port), timeout=timeout) as sock: + sock.settimeout(timeout) + if send: + sock.sendall(send) + chunks: list[bytes] = [] + remaining = read_max + while remaining > 0: + try: + chunk = sock.recv(min(4096, remaining)) + except socket.timeout: + break + if not chunk: + break + chunks.append(chunk) + remaining -= len(chunk) + buf = b"".join(chunks) + except OSError: + return False + return bool(verifier(buf)) diff --git a/datadog_checks_base/datadog_checks/base/utils/discovery/verifiers.py b/datadog_checks_base/datadog_checks/base/utils/discovery/verifiers.py new file mode 100644 index 0000000000000..bf2fb30cbbf17 --- /dev/null +++ b/datadog_checks_base/datadog_checks/base/utils/discovery/verifiers.py @@ -0,0 +1,103 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +"""Predicate factories for discovery probe verification. + +Each public function returns a callable predicate. 
HTTP predicates take a +``requests.Response`` and return ``bool``. TCP predicates take ``bytes`` and +return ``bool``. The factory shape lets check classes declare verifiers as +class-level attributes, e.g. ``DISCOVERY_VERIFY = body_contains("Total Accesses:")``. +""" + +import re +from collections.abc import Callable, Iterable +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import requests + +_PROM_LINE = re.compile(r"^[a-zA-Z_:][a-zA-Z0-9_:]*(\{[^}]*\})?\s+[-+]?(\d+\.?\d*|\.\d+)([eE][-+]?\d+)?(\s|$)") + + +HTTPPredicate = Callable[["requests.Response"], bool] +TCPPredicate = Callable[[bytes], bool] + + +def status_2xx() -> HTTPPredicate: + def predicate(response: "requests.Response") -> bool: + return 200 <= response.status_code < 300 + + return predicate + + +def body_contains(needle: str) -> HTTPPredicate: + def predicate(response: "requests.Response") -> bool: + return 200 <= response.status_code < 300 and needle in response.text + + return predicate + + +def body_matches(pattern: str) -> HTTPPredicate: + compiled = re.compile(pattern, re.MULTILINE) + + def predicate(response: "requests.Response") -> bool: + if not (200 <= response.status_code < 300): + return False + return bool(compiled.search(response.text)) + + return predicate + + +def json_has(required_keys: Iterable[str]) -> HTTPPredicate: + keys = tuple(required_keys) + + def predicate(response: "requests.Response") -> bool: + if not (200 <= response.status_code < 300): + return False + try: + doc = response.json() + except ValueError: + return False + if not isinstance(doc, dict): + return False + return all(k in doc for k in keys) + + return predicate + + +def is_prometheus_exposition() -> HTTPPredicate: + """Verify a Prometheus / OpenMetrics exposition response. + + Status must be 2xx, Content-Type must be text/plain or + application/openmetrics-text, and at least one non-comment line must look + like a Prometheus metric line. 
+ """ + + def predicate(response: "requests.Response") -> bool: + if not (200 <= response.status_code < 300): + return False + ctype = response.headers.get("Content-Type", "").lower() + if not (ctype.startswith("text/plain") or ctype.startswith("application/openmetrics-text")): + return False + for line in response.text.split("\n"): + stripped = line.strip() + if not stripped or stripped.startswith("#"): + continue + return bool(_PROM_LINE.match(stripped)) + return False + + return predicate + + +def response_equals(expected: bytes) -> TCPPredicate: + def predicate(buf: bytes) -> bool: + return buf == expected + + return predicate + + +def response_starts_with(prefix: bytes) -> TCPPredicate: + def predicate(buf: bytes) -> bool: + return buf.startswith(prefix) + + return predicate diff --git a/datadog_checks_base/tests/base/utils/discovery/test_bridge.py b/datadog_checks_base/tests/base/utils/discovery/test_bridge.py new file mode 100644 index 0000000000000..fd3ba1604d73b --- /dev/null +++ b/datadog_checks_base/tests/base/utils/discovery/test_bridge.py @@ -0,0 +1,81 @@ +# (C) Datadog, Inc. 
2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +import json + +from datadog_checks.base.utils.discovery._bridge import _run_discover +from datadog_checks.base.utils.discovery.service import Port, Service + + +class _Found: + @classmethod + def discover(cls, service: Service): + return [{"openmetrics_endpoint": f"http://{service.host}:{service.ports[0].number}/metrics"}] + + +class _NotFound: + @classmethod + def discover(cls, service: Service): + return None + + +class _EmptyList: + @classmethod + def discover(cls, service: Service): + return [] + + +class _Raises: + @classmethod + def discover(cls, service: Service): + raise RuntimeError("boom") + + +SVC_JSON = json.dumps({ + "id": "docker://abc", + "host": "10.0.0.1", + "ports": [{"number": 9090, "name": "metrics"}], +}) + + +def test_bridge_returns_json_list_on_match(): + out = _run_discover(_Found, SVC_JSON) + parsed = json.loads(out) + assert parsed == [{"openmetrics_endpoint": "http://10.0.0.1:9090/metrics"}] + + +def test_bridge_returns_null_on_no_match(): + assert _run_discover(_NotFound, SVC_JSON) == "null" + + +def test_bridge_returns_empty_list_on_explicit_empty(): + assert _run_discover(_EmptyList, SVC_JSON) == "[]" + + +def test_bridge_returns_null_on_exception(): + assert _run_discover(_Raises, SVC_JSON) == "null" + + +def test_bridge_constructs_service_correctly(): + captured = {} + + class C: + @classmethod + def discover(cls, service: Service): + captured["id"] = service.id + captured["host"] = service.host + captured["ports"] = [(p.number, p.name) for p in service.ports] + return None + + _run_discover(C, SVC_JSON) + assert captured == { + "id": "docker://abc", + "host": "10.0.0.1", + "ports": [(9090, "metrics")], + } + + +def test_bridge_handles_missing_discover_method(): + class NoDiscover: + pass + assert _run_discover(NoDiscover, SVC_JSON) == "null" diff --git a/datadog_checks_base/tests/base/utils/discovery/test_exports.py 
b/datadog_checks_base/tests/base/utils/discovery/test_exports.py new file mode 100644 index 0000000000000..cc8e624fd2df6 --- /dev/null +++ b/datadog_checks_base/tests/base/utils/discovery/test_exports.py @@ -0,0 +1,22 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +def test_public_exports(): + from datadog_checks.base.utils import discovery + + expected = { + "Discovery", + "Service", + "Port", + "candidate_ports", + "http_probe", + "tcp_probe", + "status_2xx", + "body_contains", + "body_matches", + "json_has", + "is_prometheus_exposition", + "response_equals", + "response_starts_with", + } + assert expected.issubset(set(dir(discovery))) diff --git a/datadog_checks_base/tests/base/utils/discovery/test_http.py b/datadog_checks_base/tests/base/utils/discovery/test_http.py new file mode 100644 index 0000000000000..abbd3d9e93550 --- /dev/null +++ b/datadog_checks_base/tests/base/utils/discovery/test_http.py @@ -0,0 +1,67 @@ +# (C) Datadog, Inc. 
2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from unittest.mock import Mock, patch + +import requests + +from datadog_checks.base.utils.discovery.http import http_probe +from datadog_checks.base.utils.discovery.verifiers import body_contains, status_2xx + + +def _ok_response(body="ok", status=200, content_type="text/plain"): + r = Mock() + r.status_code = status + r.text = body + r.headers = {"Content-Type": content_type} + return r + + +def test_http_probe_uses_correct_url_and_timeout(): + with patch("datadog_checks.base.utils.discovery.http.requests.get") as mock_get: + mock_get.return_value = _ok_response() + http_probe("10.0.0.1", 9090, "/metrics", verifier=status_2xx()) + mock_get.assert_called_once() + args, kwargs = mock_get.call_args + assert args[0] == "http://10.0.0.1:9090/metrics" + assert kwargs["timeout"] == 0.5 + + +def test_http_probe_passes_when_verify_passes(): + with patch("datadog_checks.base.utils.discovery.http.requests.get") as mock_get: + mock_get.return_value = _ok_response(body="Total Accesses: 42") + assert http_probe("h", 80, "/server-status?auto", verifier=body_contains("Total Accesses:")) + + +def test_http_probe_fails_when_verify_fails(): + with patch("datadog_checks.base.utils.discovery.http.requests.get") as mock_get: + mock_get.return_value = _ok_response(body="something else") + assert not http_probe("h", 80, "/x", verifier=body_contains("Total Accesses:")) + + +def test_http_probe_returns_false_on_connection_error(): + with patch("datadog_checks.base.utils.discovery.http.requests.get") as mock_get: + mock_get.side_effect = requests.exceptions.ConnectionError() + assert not http_probe("h", 80, "/x", verifier=status_2xx()) + + +def test_http_probe_returns_false_on_timeout(): + with patch("datadog_checks.base.utils.discovery.http.requests.get") as mock_get: + mock_get.side_effect = requests.exceptions.Timeout() + assert not http_probe("h", 80, "/x", verifier=status_2xx()) + + +def 
test_http_probe_brackets_ipv6_in_url(): + with patch("datadog_checks.base.utils.discovery.http.requests.get") as mock_get: + mock_get.return_value = _ok_response() + http_probe("[::1]", 80, "/x", verifier=status_2xx()) + args, _ = mock_get.call_args + assert args[0] == "http://[::1]:80/x" + + +def test_http_probe_custom_timeout(): + with patch("datadog_checks.base.utils.discovery.http.requests.get") as mock_get: + mock_get.return_value = _ok_response() + http_probe("h", 80, "/x", verifier=status_2xx(), timeout=1.0) + _, kwargs = mock_get.call_args + assert kwargs["timeout"] == 1.0 diff --git a/datadog_checks_base/tests/base/utils/discovery/test_ports.py b/datadog_checks_base/tests/base/utils/discovery/test_ports.py new file mode 100644 index 0000000000000..6563a6c1ba8ac --- /dev/null +++ b/datadog_checks_base/tests/base/utils/discovery/test_ports.py @@ -0,0 +1,39 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from datadog_checks.base.utils.discovery.ports import candidate_ports +from datadog_checks.base.utils.discovery.service import Port, Service + + +def _svc(*ports): + return Service(id="x", host="h", ports=tuple(ports)) + + +def test_hint_first_then_rest(): + svc = _svc(Port(8080), Port(9090), Port(80)) + assert list(candidate_ports(svc, [9090])) == [Port(9090), Port(8080), Port(80)] + + +def test_multiple_hints_in_order(): + svc = _svc(Port(80), Port(8080), Port(9090)) + assert list(candidate_ports(svc, [9090, 8080])) == [Port(9090), Port(8080), Port(80)] + + +def test_hint_not_exposed_skipped(): + svc = _svc(Port(80)) + assert list(candidate_ports(svc, [9090])) == [Port(80)] + + +def test_no_hints_returns_service_order(): + svc = _svc(Port(80), Port(9090)) + assert list(candidate_ports(svc, [])) == [Port(80), Port(9090)] + + +def test_no_ports_returns_empty(): + svc = _svc() + assert list(candidate_ports(svc, [9090])) == [] + + +def test_no_duplicates_when_hint_repeats(): + svc = 
_svc(Port(9090)) + assert list(candidate_ports(svc, [9090, 9090])) == [Port(9090)] diff --git a/datadog_checks_base/tests/base/utils/discovery/test_service.py b/datadog_checks_base/tests/base/utils/discovery/test_service.py new file mode 100644 index 0000000000000..2676992073800 --- /dev/null +++ b/datadog_checks_base/tests/base/utils/discovery/test_service.py @@ -0,0 +1,43 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +import pytest + +from datadog_checks.base.utils.discovery.service import Port, Service + + +def test_port_defaults(): + p = Port(number=9090) + assert p.number == 9090 + assert p.name == "" + + +def test_port_with_name(): + p = Port(number=9090, name="metrics") + assert p.name == "metrics" + + +def test_port_is_hashable(): + {Port(9090), Port(9091, "metrics")} + + +def test_port_is_immutable(): + p = Port(9090) + with pytest.raises(Exception): + p.number = 9091 # type: ignore[misc] + + +def test_service_basic(): + svc = Service(id="docker://abc", host="10.0.0.1", ports=(Port(9090),)) + assert svc.id == "docker://abc" + assert svc.host == "10.0.0.1" + assert svc.ports == (Port(9090),) + + +def test_service_is_hashable(): + {Service(id="a", host="h", ports=(Port(1),))} + + +def test_service_ports_is_tuple_not_list(): + svc = Service(id="a", host="h", ports=(Port(1), Port(2))) + assert isinstance(svc.ports, tuple) diff --git a/datadog_checks_base/tests/base/utils/discovery/test_tcp.py b/datadog_checks_base/tests/base/utils/discovery/test_tcp.py new file mode 100644 index 0000000000000..e4383d64819ce --- /dev/null +++ b/datadog_checks_base/tests/base/utils/discovery/test_tcp.py @@ -0,0 +1,96 @@ +# (C) Datadog, Inc. 
2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +import socket +import threading +from contextlib import contextmanager + +from datadog_checks.base.utils.discovery.tcp import tcp_probe +from datadog_checks.base.utils.discovery.verifiers import ( + response_equals, + response_starts_with, +) + + +@contextmanager +def _tcp_server(handler): + """Run a one-shot TCP server on 127.0.0.1 and return its bound port.""" + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.bind(("127.0.0.1", 0)) + sock.listen(1) + port = sock.getsockname()[1] + done = threading.Event() + + def serve(): + try: + conn, _ = sock.accept() + try: + handler(conn) + finally: + conn.close() + except OSError: + pass + finally: + done.set() + + thread = threading.Thread(target=serve, daemon=True) + thread.start() + try: + yield port + finally: + sock.close() + done.wait(timeout=1.0) + + +def test_tcp_probe_zookeeper_4lw_pattern(): + def handler(conn): + data = conn.recv(64) + if data == b"ruok": + conn.sendall(b"imok") + + with _tcp_server(handler) as port: + assert tcp_probe("127.0.0.1", port, send=b"ruok", verifier=response_equals(b"imok"), timeout=1.0) + + +def test_tcp_probe_redis_ping_pattern(): + def handler(conn): + conn.recv(64) + conn.sendall(b"+PONG\r\n") + + with _tcp_server(handler) as port: + assert tcp_probe("127.0.0.1", port, send=b"PING\r\n", verifier=response_starts_with(b"+PONG"), timeout=1.0) + + +def test_tcp_probe_server_speaks_first(): + def handler(conn): + conn.sendall(b'{"service":"nutcracker","source":"x","version":"0.5"}') + + with _tcp_server(handler) as port: + assert tcp_probe("127.0.0.1", port, verifier=response_starts_with(b'{"service":"nutcracker"'), timeout=1.0) + + +def test_tcp_probe_returns_false_when_verifier_rejects(): + def handler(conn): + conn.sendall(b"WRONG") + + with _tcp_server(handler) as port: + assert not tcp_probe("127.0.0.1", port, verifier=response_starts_with(b"+PONG"), timeout=1.0) + + 
+def test_tcp_probe_returns_false_on_refused_connection(): + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.bind(("127.0.0.1", 0)) + port = sock.getsockname()[1] + sock.close() # port is now free; nothing listening + assert not tcp_probe("127.0.0.1", port, verifier=response_starts_with(b"x"), timeout=1.0) + + +def test_tcp_probe_returns_false_on_timeout(): + def handler(conn): + # Stall: never send anything, never close (until the test releases us). + import time + + time.sleep(2.0) + + with _tcp_server(handler) as port: + assert not tcp_probe("127.0.0.1", port, verifier=response_starts_with(b"x"), timeout=0.1) diff --git a/datadog_checks_base/tests/base/utils/discovery/test_verifiers.py b/datadog_checks_base/tests/base/utils/discovery/test_verifiers.py new file mode 100644 index 0000000000000..3bce1b31865c5 --- /dev/null +++ b/datadog_checks_base/tests/base/utils/discovery/test_verifiers.py @@ -0,0 +1,108 @@ +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from unittest.mock import Mock + +from datadog_checks.base.utils.discovery.verifiers import ( + body_contains, + body_matches, + is_prometheus_exposition, + json_has, + response_equals, + response_starts_with, + status_2xx, +) + + +def _resp(status=200, content_type="text/plain", body="", json_body=None): + r = Mock() + r.status_code = status + r.headers = {"Content-Type": content_type} + r.text = body + if json_body is not None: + r.json = Mock(return_value=json_body) + else: + r.json = Mock(side_effect=ValueError("not json")) + return r + + +def test_status_2xx_pass(): + assert status_2xx()(_resp(status=200)) + assert status_2xx()(_resp(status=204)) + + +def test_status_2xx_fail(): + assert not status_2xx()(_resp(status=301)) + assert not status_2xx()(_resp(status=500)) + + +def test_body_contains_pass(): + assert body_contains("Total Accesses:")(_resp(body="Total Accesses: 42\n")) + + +def 
test_body_contains_fail_on_substring_absent(): + assert not body_contains("Total Accesses:")(_resp(body="something else")) + + +def test_body_contains_fail_on_non_2xx(): + assert not body_contains("anything")(_resp(status=500, body="anything")) + + +def test_body_matches_pass(): + assert body_matches(r"^Active connections:")(_resp(body="Active connections: 7\nblah")) + + +def test_body_matches_anchored_to_start_of_a_line_using_multiline_flag(): + # Demonstrates the convention: callers pass plain re patterns; we apply re.MULTILINE. + assert body_matches(r"^server: nginx$")(_resp(body="HTTP/1.1 200 OK\nserver: nginx\n")) + + +def test_body_matches_fail(): + assert not body_matches(r"^Active connections:")(_resp(body="not nginx")) + + +def test_json_has_pass_top_level_keys(): + assert json_has(["version", "leader"])(_resp(json_body={"version": "1.7.0", "leader": "h1"})) + + +def test_json_has_fail_missing_key(): + assert not json_has(["version", "leader"])(_resp(json_body={"version": "1.7.0"})) + + +def test_json_has_fail_not_json(): + assert not json_has(["x"])(_resp(body="")) + + +def test_is_prometheus_exposition_pass_text_plain(): + body = "# HELP foo bar\nfoo 1\n" + assert is_prometheus_exposition()(_resp(content_type="text/plain; version=0.0.4", body=body)) + + +def test_is_prometheus_exposition_pass_openmetrics(): + body = "foo_total 42\n" + assert is_prometheus_exposition()(_resp(content_type="application/openmetrics-text", body=body)) + + +def test_is_prometheus_exposition_rejects_html(): + assert not is_prometheus_exposition()(_resp(content_type="text/html", body="")) + + +def test_is_prometheus_exposition_rejects_garbage_body(): + body = "this is not prometheus" + assert not is_prometheus_exposition()(_resp(content_type="text/plain", body=body)) + + +def test_response_equals_tcp_pass(): + assert response_equals(b"imok")(b"imok") + + +def test_response_equals_tcp_fail(): + assert not response_equals(b"imok")(b"imnotok") + + +def 
test_response_starts_with_tcp_pass(): + assert response_starts_with(b"+PONG")(b"+PONG\r\n") + + +def test_response_starts_with_tcp_fail(): + assert not response_starts_with(b"+PONG")(b"-ERR\r\n") diff --git a/docs/superpowers/plans/2026-05-05-advanced-autoconfig-experiment.md b/docs/superpowers/plans/2026-05-05-advanced-autoconfig-experiment.md new file mode 100644 index 0000000000000..1f9d8dd00f344 --- /dev/null +++ b/docs/superpowers/plans/2026-05-05-advanced-autoconfig-experiment.md @@ -0,0 +1,1555 @@ +# Advanced Auto-Config — KrakenD Experiment Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Demonstrate end-to-end that the Datadog Agent can discover the OpenMetrics endpoint of a KrakenD container via a declarative `auto_conf_discovery.yaml`, schedule the krakend check with the discovered port, and emit metrics — without any per-integration discovery code. + +**Architecture:** Add a `discovery:` block to a new `auto_conf_discovery.yaml` file format that the file config provider reads. When AutoDiscovery matches a service to such a template, run a generic OpenMetrics prober against the container's exposed ports (hint ports first, full scan as fallback), verify the response is Prometheus-format, and resolve a new `%%discovered_port%%` template variable from the probe result. The resolution happens by wrapping the matched `Service` with a `serviceWithProbeResult` shim so existing call sites of `configresolver.Resolve` are unchanged. + +**Tech Stack:** Go 1.22+ (`datadog-agent`), YAML, Python 3.13 (krakend integration), Docker. Build via `dda inv agent.build`. Tests via `dda inv test --targets=...` (never raw `go test` — see `datadog-agent/AGENTS.md`). 
+ +**Spec:** `docs/superpowers/specs/2026-05-05-advanced-autoconfig-experiment-design.md` in `integrations-core`. + +**Repos involved:** +- `/home/vagrant/go/src/github.com/DataDog/integrations-core` — branch `vitkyrka/disco-autoconfig`. One new YAML file. +- `/home/vagrant/go/src/github.com/DataDog/datadog-agent` — feature branch to be created. Most of the work lives here. + +**Commit policy (per `datadog-agent/CLAUDE_PERSONAL.md`):** Never amend commits — make new fixup commits on top instead. Never disable signing. Never bypass hooks with `--no-verify`. PRs as drafts only. + +--- + +## File structure + +### `integrations-core` (branch `vitkyrka/disco-autoconfig`) +- Create: `krakend/datadog_checks/krakend/data/auto_conf_discovery.yaml` + +### `datadog-agent` (new feature branch) +- Modify: `comp/core/autodiscovery/integration/config.go` — add `DiscoveryConfig` struct + `Discovery *DiscoveryConfig` field on `Config`. +- Modify: `comp/core/autodiscovery/providers/config_reader.go` — add `auto_conf_discovery.yaml` to the file lookup; parse `discovery:` block. +- Modify: `comp/core/autodiscovery/providers/config_reader_test.go` — round-trip test for the new file. +- Create: `comp/core/autodiscovery/discovery/types.go` — `DiscoveryConfig` mirror, `ProbeResult` struct, `Prober` interface. +- Create: `comp/core/autodiscovery/discovery/candidates.go` — port ordering helper. +- Create: `comp/core/autodiscovery/discovery/candidates_test.go`. +- Create: `comp/core/autodiscovery/discovery/openmetrics_prober.go` — HTTP probe + Prometheus verification + cache. +- Create: `comp/core/autodiscovery/discovery/openmetrics_prober_test.go`. +- Create: `comp/core/autodiscovery/discovery/cache.go` — TTL cache used by the prober. +- Create: `comp/core/autodiscovery/discovery/cache_test.go`. +- Create: `comp/core/autodiscovery/discovery/service_wrapper.go` — `serviceWithProbeResult` shim that injects `discovered_port` via `GetExtraConfig`. 
+- Modify: `pkg/util/tmplvar/resolver.go` — register `"discovered"` → `GetDiscoveredPort`. +- Modify: `pkg/util/tmplvar/resolver_test.go` — tests for the new variable. +- Modify: `comp/core/autodiscovery/autodiscoveryimpl/configmgr.go` — call `Prober.Probe(...)` before `configresolver.Resolve`; wrap service with the probe result. + +`configresolver.Resolve(tpl, svc)` keeps its current two-argument signature. The probe result reaches the resolver via the wrapped `Service` rather than a new function parameter — simpler than the spec's working assumption. + +--- + +## Task 1: Add `auto_conf_discovery.yaml` to the krakend integration + +Sets up the integration-side trigger artifact. Standalone — no Agent code needed yet. + +**Files:** +- Create: `/home/vagrant/go/src/github.com/DataDog/integrations-core/krakend/datadog_checks/krakend/data/auto_conf_discovery.yaml` + +- [ ] **Step 1: Create the file.** + +```yaml +ad_identifiers: + - krakend +discovery: + type: openmetrics + ports: [8090] + path: /metrics +init_config: +instances: + - openmetrics_endpoint: "http://%%host%%:%%discovered_port%%/metrics" +``` + +- [ ] **Step 2: Verify it's valid YAML.** + +Run: `python3 -c "import yaml,sys; yaml.safe_load(open('/home/vagrant/go/src/github.com/DataDog/integrations-core/krakend/datadog_checks/krakend/data/auto_conf_discovery.yaml'))"` +Expected: no output, exit 0. + +- [ ] **Step 3: Commit.** + +```bash +cd /home/vagrant/go/src/github.com/DataDog/integrations-core +git add krakend/datadog_checks/krakend/data/auto_conf_discovery.yaml +git commit -m "$(cat <<'EOF' +krakend: add auto_conf_discovery.yaml for advanced auto-config experiment + +Declares the krakend ad_identifier with an OpenMetrics probe spec +(default port 8090, /metrics path). Consumed by the new +auto_conf_discovery file format in datadog-agent. + +Co-Authored-By: Claude Opus 4.7 (1M context) +EOF +)" +``` + +--- + +## Task 2: Create a feature branch in datadog-agent + +**Files:** none. Branching only. 
+ +- [ ] **Step 1: Create the feature branch.** + +Run: +```bash +cd /home/vagrant/go/src/github.com/DataDog/datadog-agent +git fetch origin +git checkout -b vitkyrka/advanced-autoconfig-krakend origin/main +``` + +Expected: switched to a new branch off `origin/main`. + +- [ ] **Step 2: Verify clean tree.** + +Run: `git status --short` +Expected: empty output. + +--- + +## Task 3: Add the `DiscoveryConfig` struct to `integration.Config` + +Defines the canonical type the rest of the system consumes. + +**Files:** +- Modify: `/home/vagrant/go/src/github.com/DataDog/datadog-agent/comp/core/autodiscovery/integration/config.go` +- Modify: `/home/vagrant/go/src/github.com/DataDog/datadog-agent/comp/core/autodiscovery/integration/config_test.go` (or create if absent — verify with `ls`). + +- [ ] **Step 1: Write the failing test.** + +Append to `comp/core/autodiscovery/integration/config_test.go`: + +```go +func TestDiscoveryConfig_FieldsAndZeroValue(t *testing.T) { + var c Config + if c.Discovery != nil { + t.Fatalf("Discovery should default to nil, got %+v", c.Discovery) + } + + c.Discovery = &DiscoveryConfig{ + Type: "openmetrics", + Ports: []int{8090}, + Path: "/metrics", + } + if c.Discovery.Type != "openmetrics" { + t.Fatalf("Type round-trip failed: %s", c.Discovery.Type) + } + if got, want := len(c.Discovery.Ports), 1; got != want { + t.Fatalf("Ports length: got %d want %d", got, want) + } + if c.Discovery.Path != "/metrics" { + t.Fatalf("Path round-trip failed: %s", c.Discovery.Path) + } +} +``` + +- [ ] **Step 2: Run the test and confirm it fails.** + +Run: `dda inv test --targets=./comp/core/autodiscovery/integration/ -- -run TestDiscoveryConfig_FieldsAndZeroValue` +Expected: build error — `undefined: DiscoveryConfig`. + +- [ ] **Step 3: Implement the struct.** + +In `comp/core/autodiscovery/integration/config.go`, find the `Config` struct (around line 47). 
Add a new field at the end of the struct, just before the closing `}`: + +```go + // Discovery, when non-nil, signals that this config is a discovery + // template: AutoDiscovery must run a probe against the matched service + // before substituting %%discovered_port%%. + Discovery *DiscoveryConfig `json:"discovery"` // (include in digest: true) +``` + +Then, after the `Config` type declaration (before the next type), add: + +```go +// DiscoveryConfig describes how to probe a service to find its check +// endpoint. Currently only Type=="openmetrics" is supported. +type DiscoveryConfig struct { + Type string `yaml:"type" json:"type"` + Ports []int `yaml:"ports,omitempty" json:"ports,omitempty"` + Path string `yaml:"path,omitempty" json:"path,omitempty"` +} +``` + +- [ ] **Step 4: Run the test and confirm it passes.** + +Run: `dda inv test --targets=./comp/core/autodiscovery/integration/ -- -run TestDiscoveryConfig_FieldsAndZeroValue` +Expected: PASS. + +- [ ] **Step 5: Run the full integration package test to make sure nothing else broke.** + +Run: `dda inv test --targets=./comp/core/autodiscovery/integration/` +Expected: PASS. + +- [ ] **Step 6: Commit.** + +```bash +git add comp/core/autodiscovery/integration/config.go comp/core/autodiscovery/integration/config_test.go +git commit -m "autodiscovery: add DiscoveryConfig type to integration.Config + +For the advanced auto-config experiment. New optional field on +integration.Config, populated by the auto_conf_discovery.yaml provider +in a follow-up commit. + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +## Task 4: Parse `auto_conf_discovery.yaml` in the file config provider + +Make the file provider recognise the new file alongside `auto_conf.yaml` and populate `Config.Discovery`. 
+ +**Files:** +- Modify: `comp/core/autodiscovery/providers/config_reader.go` +- Modify: `comp/core/autodiscovery/providers/config_reader_test.go` + +- [ ] **Step 1: Write the failing test.** + +Append to `comp/core/autodiscovery/providers/config_reader_test.go`: + +```go +func TestReadConfigFiles_AutoConfDiscovery(t *testing.T) { + tmp := t.TempDir() + intDir := filepath.Join(tmp, "krakend.d") + if err := os.MkdirAll(intDir, 0755); err != nil { + t.Fatal(err) + } + yamlBody := []byte(`ad_identifiers: + - krakend +discovery: + type: openmetrics + ports: [8090] + path: /metrics +init_config: +instances: + - openmetrics_endpoint: "http://%%host%%:%%discovered_port%%/metrics" +`) + if err := os.WriteFile(filepath.Join(intDir, "auto_conf_discovery.yaml"), yamlBody, 0644); err != nil { + t.Fatal(err) + } + + pkgconfigsetup.Datadog().SetWithoutSource("confd_path", tmp) + t.Cleanup(func() { + pkgconfigsetup.Datadog().SetWithoutSource("confd_path", "") + }) + + configs, _, _ := ReadConfigFiles(GetAll) + var found *integration.Config + for i := range configs { + if configs[i].Name == "krakend" && configs[i].Discovery != nil { + found = &configs[i] + break + } + } + if found == nil { + t.Fatalf("did not find krakend config with Discovery set; got %d configs", len(configs)) + } + if found.Discovery.Type != "openmetrics" { + t.Fatalf("Type: got %q want %q", found.Discovery.Type, "openmetrics") + } + if !reflect.DeepEqual(found.Discovery.Ports, []int{8090}) { + t.Fatalf("Ports: got %+v want [8090]", found.Discovery.Ports) + } + if found.Discovery.Path != "/metrics" { + t.Fatalf("Path: got %q want %q", found.Discovery.Path, "/metrics") + } + if got := len(found.ADIdentifiers); got != 1 || found.ADIdentifiers[0] != "krakend" { + t.Fatalf("ADIdentifiers: got %+v", found.ADIdentifiers) + } +} +``` + +If `reflect` is not yet imported in the test file, add it. 
+ +- [ ] **Step 2: Run the test and confirm it fails.** + +Run: `dda inv test --targets=./comp/core/autodiscovery/providers/ -- -run TestReadConfigFiles_AutoConfDiscovery` +Expected: FAIL — config not found, or `Discovery` is nil because the new field is not parsed yet. + +- [ ] **Step 3: Implement the parsing.** + +In `comp/core/autodiscovery/providers/config_reader.go`: + +3a. Find the `configFormat` struct (around line 34). Add a `Discovery` field at the end: + +```go + Discovery *integration.DiscoveryConfig `yaml:"discovery,omitempty"` +``` + +3b. Find the function that copies parsed YAML fields onto the returned `integration.Config` (around line 490, where `conf.ADIdentifiers = cf.ADIdentifiers` and `conf.AdvancedADIdentifiers = cf.AdvancedADIdentifiers` are set). Add immediately after those lines: + +```go + conf.Discovery = cf.Discovery +``` + +3c. The file lookup currently includes `auto_conf.yaml` because of the loop in `collectEntry`/`collectDir` that iterates *all* `.yaml` files. `auto_conf_discovery.yaml` ends in `.yaml`, so it is already eligible. Verify by reading lines 290–340 of `config_reader.go`. If a special-case branch references `"auto_conf.yaml"` *exclusively* (other than the existing `ignore_autoconf` early-return at line 301), broaden it to also accept `"auto_conf_discovery.yaml"`. The existing `ignore_autoconf` early-return is independent and does not need changing for this experiment. + +- [ ] **Step 4: Run the test and confirm it passes.** + +Run: `dda inv test --targets=./comp/core/autodiscovery/providers/ -- -run TestReadConfigFiles_AutoConfDiscovery` +Expected: PASS. + +- [ ] **Step 5: Run the full provider package test.** + +Run: `dda inv test --targets=./comp/core/autodiscovery/providers/` +Expected: PASS. 
+ +- [ ] **Step 6: Commit.** + +```bash +git add comp/core/autodiscovery/providers/config_reader.go comp/core/autodiscovery/providers/config_reader_test.go +git commit -m "autodiscovery/providers: parse auto_conf_discovery.yaml + +Recognise the discovery: block in the file format and populate +integration.Config.Discovery. The file is picked up via the same .yaml +filename matcher that handles auto_conf.yaml today. + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +## Task 5: Discovery package — types and `ProbeResult` + +Lay down the package skeleton: shared types used by all later tasks. Tests come with the prober task, not here. + +**Files:** +- Create: `comp/core/autodiscovery/discovery/types.go` + +- [ ] **Step 1: Create the file.** + +```go +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +// Package discovery implements probe-based "advanced auto-config" — running +// a verifying probe against a discovered Service to derive instance config +// values that cannot be expressed by template substitution alone. +package discovery + +import ( + "context" + + "github.com/DataDog/datadog-agent/comp/core/autodiscovery/integration" + "github.com/DataDog/datadog-agent/comp/core/autodiscovery/listeners" +) + +// ProbeResult is the outcome of a successful probe. +type ProbeResult struct { + // Port is the discovered TCP port that responded successfully to the + // probe. + Port uint16 +} + +// Prober probes a Service against a DiscoveryConfig and returns a result +// when one of the candidate (host, port, path) tuples verifies. If no +// candidate verifies within the budget, ok is false. 
+type Prober interface { + Probe(ctx context.Context, cfg *integration.DiscoveryConfig, svc listeners.Service) (result ProbeResult, ok bool) +} +``` + +- [ ] **Step 2: Verify it builds.** + +Run: `dda inv test --targets=./comp/core/autodiscovery/discovery/` +Expected: PASS (no tests yet, but the package must compile). + +- [ ] **Step 3: Commit.** + +```bash +git add comp/core/autodiscovery/discovery/types.go +git commit -m "autodiscovery/discovery: scaffold package with ProbeResult/Prober types + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +## Task 6: Discovery package — candidate port ordering + +Pure function. Trivial to test in isolation. + +**Files:** +- Create: `comp/core/autodiscovery/discovery/candidates.go` +- Create: `comp/core/autodiscovery/discovery/candidates_test.go` + +- [ ] **Step 1: Write the failing test.** + +Create `comp/core/autodiscovery/discovery/candidates_test.go`: + +```go +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. 
+ +package discovery + +import ( + "reflect" + "testing" + + workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def" +) + +func TestCandidatePorts(t *testing.T) { + exposed := []workloadmeta.ContainerPort{{Port: 9000}, {Port: 8090}, {Port: 9001}} + + tests := []struct { + name string + hints []int + want []uint16 + }{ + {"no hints — fallback only", nil, []uint16{9000, 8090, 9001}}, + {"hint matches one exposed", []int{8090}, []uint16{8090, 9000, 9001}}, + {"hint not exposed is dropped", []int{1234}, []uint16{9000, 8090, 9001}}, + {"two hints, declared order preserved", []int{8090, 9000}, []uint16{8090, 9000, 9001}}, + {"empty exposed yields empty", nil, []uint16{}}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + ex := exposed + if tc.name == "empty exposed yields empty" { + ex = nil + } + got := candidatePorts(tc.hints, ex) + if !reflect.DeepEqual(got, tc.want) { + t.Fatalf("got %+v want %+v", got, tc.want) + } + }) + } +} +``` + +- [ ] **Step 2: Run the test and confirm it fails.** + +Run: `dda inv test --targets=./comp/core/autodiscovery/discovery/ -- -run TestCandidatePorts` +Expected: FAIL — `undefined: candidatePorts`. + +- [ ] **Step 3: Implement.** + +Create `comp/core/autodiscovery/discovery/candidates.go`: + +```go +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. 
+ +package discovery + +import ( + workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def" +) + +func candidatePorts(hints []int, exposed []workloadmeta.ContainerPort) []uint16 { + exposedSet := make(map[uint16]struct{}, len(exposed)) + for _, p := range exposed { + exposedSet[uint16(p.Port)] = struct{}{} + } + + out := make([]uint16, 0, len(exposed)) + seen := make(map[uint16]struct{}, len(exposed)) + + for _, h := range hints { + p := uint16(h) + if _, ok := exposedSet[p]; !ok { + continue + } + if _, dup := seen[p]; dup { + continue + } + out = append(out, p) + seen[p] = struct{}{} + } + + for _, p := range exposed { + port := uint16(p.Port) + if _, dup := seen[port]; dup { + continue + } + out = append(out, port) + seen[port] = struct{}{} + } + + return out +} +``` + +- [ ] **Step 4: Run the test and confirm it passes.** + +Run: `dda inv test --targets=./comp/core/autodiscovery/discovery/ -- -run TestCandidatePorts` +Expected: PASS. + +- [ ] **Step 5: Commit.** + +```bash +git add comp/core/autodiscovery/discovery/candidates.go comp/core/autodiscovery/discovery/candidates_test.go +git commit -m "autodiscovery/discovery: candidate port ordering + +Hints first (when exposed), then remaining exposed ports in declared +order. Dedup-aware. + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +## Task 7: Discovery package — TTL cache + +Records probe outcomes per `(serviceID, configHash)` so we don't re-probe a known-good (or recently-failed) service on every reconcile. + +**Files:** +- Create: `comp/core/autodiscovery/discovery/cache.go` +- Create: `comp/core/autodiscovery/discovery/cache_test.go` + +- [ ] **Step 1: Write the failing test.** + +Create `comp/core/autodiscovery/discovery/cache_test.go`: + +```go +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). 
+// Copyright 2016-present Datadog, Inc. + +package discovery + +import ( + "testing" + "time" +) + +func TestProbeCache_HitAndExpiry(t *testing.T) { + now := time.Unix(1_700_000_000, 0) + clock := func() time.Time { return now } + c := newProbeCache(clock) + + // Empty cache — miss. + if _, _, ok := c.get("svc1", "h1"); ok { + t.Fatal("expected miss on empty cache") + } + + // Successful probe entry, never expires. + c.putSuccess("svc1", "h1", ProbeResult{Port: 8090}) + if r, success, ok := c.get("svc1", "h1"); !ok || !success || r.Port != 8090 { + t.Fatalf("expected hit success(8090); got ok=%v success=%v port=%d", ok, success, r.Port) + } + + // Failed probe entry, expires after 30s. + c.putFailure("svc1", "h2", 30*time.Second) + if _, success, ok := c.get("svc1", "h2"); !ok || success { + t.Fatal("expected hit failure") + } + now = now.Add(31 * time.Second) + if _, _, ok := c.get("svc1", "h2"); ok { + t.Fatal("expected miss after expiry") + } +} + +func TestProbeCache_DifferentKeysIsolated(t *testing.T) { + now := time.Unix(0, 0) + c := newProbeCache(func() time.Time { return now }) + c.putSuccess("svcA", "h1", ProbeResult{Port: 1}) + c.putSuccess("svcB", "h1", ProbeResult{Port: 2}) + if r, _, _ := c.get("svcA", "h1"); r.Port != 1 { + t.Fatalf("svcA: got %d", r.Port) + } + if r, _, _ := c.get("svcB", "h1"); r.Port != 2 { + t.Fatalf("svcB: got %d", r.Port) + } +} +``` + +- [ ] **Step 2: Run the test and confirm it fails.** + +Run: `dda inv test --targets=./comp/core/autodiscovery/discovery/ -- -run TestProbeCache` +Expected: FAIL — `undefined: newProbeCache` etc. + +- [ ] **Step 3: Implement.** + +Create `comp/core/autodiscovery/discovery/cache.go`: + +```go +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. 
+ +package discovery + +import ( + "sync" + "time" +) + +type cacheEntry struct { + result ProbeResult + success bool + expiresAt time.Time // zero = never +} + +type probeCache struct { + mu sync.Mutex + entries map[string]cacheEntry + now func() time.Time +} + +func newProbeCache(now func() time.Time) *probeCache { + if now == nil { + now = time.Now + } + return &probeCache{entries: make(map[string]cacheEntry), now: now} +} + +func cacheKey(svcID, cfgHash string) string { + return svcID + "|" + cfgHash +} + +func (c *probeCache) get(svcID, cfgHash string) (ProbeResult, bool, bool) { + c.mu.Lock() + defer c.mu.Unlock() + e, ok := c.entries[cacheKey(svcID, cfgHash)] + if !ok { + return ProbeResult{}, false, false + } + if !e.expiresAt.IsZero() && c.now().After(e.expiresAt) { + delete(c.entries, cacheKey(svcID, cfgHash)) + return ProbeResult{}, false, false + } + return e.result, e.success, true +} + +func (c *probeCache) putSuccess(svcID, cfgHash string, r ProbeResult) { + c.mu.Lock() + defer c.mu.Unlock() + c.entries[cacheKey(svcID, cfgHash)] = cacheEntry{result: r, success: true} +} + +func (c *probeCache) putFailure(svcID, cfgHash string, ttl time.Duration) { + c.mu.Lock() + defer c.mu.Unlock() + c.entries[cacheKey(svcID, cfgHash)] = cacheEntry{success: false, expiresAt: c.now().Add(ttl)} +} +``` + +- [ ] **Step 4: Run the test and confirm it passes.** + +Run: `dda inv test --targets=./comp/core/autodiscovery/discovery/ -- -run TestProbeCache` +Expected: PASS. + +- [ ] **Step 5: Commit.** + +```bash +git add comp/core/autodiscovery/discovery/cache.go comp/core/autodiscovery/discovery/cache_test.go +git commit -m "autodiscovery/discovery: TTL probe cache + +Per-(serviceID, configHash) cache. Successes never expire; +failures expire after caller-supplied TTL. + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +## Task 8: Discovery package — OpenMetrics prober + +The HTTP probe + Prometheus-line verification + budget loop. 
Uses `httptest` for unit tests. + +**Files:** +- Create: `comp/core/autodiscovery/discovery/openmetrics_prober.go` +- Create: `comp/core/autodiscovery/discovery/openmetrics_prober_test.go` + +- [ ] **Step 1: Write the failing test.** + +Create `comp/core/autodiscovery/discovery/openmetrics_prober_test.go`: + +```go +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +package discovery + +import ( + "context" + "net" + "net/http" + "net/http/httptest" + "strconv" + "testing" + "time" + + "github.com/DataDog/datadog-agent/comp/core/autodiscovery/integration" + workloadmeta "github.com/DataDog/datadog-agent/comp/core/workloadmeta/def" +) + +func TestVerifyOpenMetricsResponse(t *testing.T) { + cases := []struct { + name string + status int + contentType string + body string + want bool + }{ + {"prom-text", 200, "text/plain; version=0.0.4", "go_goroutines 5\n", true}, + {"openmetrics-text", 200, "application/openmetrics-text; version=1.0.0", "go_goroutines 5\n", true}, + {"json", 200, "application/json", `{"a":1}`, false}, + {"html", 200, "text/html", "", false}, + {"401", 401, "text/plain", "go_goroutines 5\n", false}, + {"prom-no-line", 200, "text/plain", "# HELP only\n# TYPE only\n", false}, + {"prom-with-labels", 200, "text/plain", `http_requests_total{code="200"} 1027` + "\n", true}, + {"prom-with-comments-first", 200, "text/plain", "# HELP foo bar\n# TYPE foo counter\nfoo 1\n", true}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := verifyOpenMetricsResponse(tc.status, tc.contentType, []byte(tc.body)); got != tc.want { + t.Fatalf("got %v want %v", got, tc.want) + } + }) + } +} + +// fakeService implements listeners.Service minimally for the prober. 
+type fakeService struct { + id string + hosts map[string]string + ports []workloadmeta.ContainerPort +} + +func (f *fakeService) GetServiceID() string { return f.id } +func (f *fakeService) GetADIdentifiers() []string { return []string{"krakend"} } +func (f *fakeService) GetHosts() (map[string]string, error) { return f.hosts, nil } +func (f *fakeService) GetPorts() ([]workloadmeta.ContainerPort, error) { + return f.ports, nil +} +func (f *fakeService) GetTags() ([]string, error) { return nil, nil } +func (f *fakeService) GetTagsWithCardinality(string) ([]string, error) { return nil, nil } +func (f *fakeService) GetPid() (int, error) { return 0, nil } +func (f *fakeService) GetHostname() (string, error) { return "", nil } +func (f *fakeService) IsReady() bool { return true } +func (f *fakeService) GetCheckNames() []string { return nil } +func (f *fakeService) HasFilter(any) bool { return false } +func (f *fakeService) GetExtraConfig(string) (string, error) { return "", nil } +func (f *fakeService) FilterTemplates(map[string]integration.Config) {} +func (f *fakeService) GetImageName() string { return "krakend:test" } +func (f *fakeService) Equal(other any) bool { return false } + +func TestProbe_HintMatchesFirst(t *testing.T) { + bad := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(404) + })) + defer bad.Close() + good := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Content-Type", "text/plain; version=0.0.4") + w.Write([]byte("go_goroutines 5\n")) + })) + defer good.Close() + + badHost, badPortStr, _ := net.SplitHostPort(bad.Listener.Addr().String()) + goodHost, goodPortStr, _ := net.SplitHostPort(good.Listener.Addr().String()) + badPort, _ := strconv.Atoi(badPortStr) + goodPort, _ := strconv.Atoi(goodPortStr) + if badHost != goodHost { + t.Fatalf("test assumption: both servers on same host (got %s, %s)", badHost, goodHost) + } + + svc := &fakeService{ + id: 
"container_id://abc", + hosts: map[string]string{"bridge": badHost}, + ports: []workloadmeta.ContainerPort{{Port: badPort}, {Port: goodPort}}, + } + cfg := &integration.DiscoveryConfig{ + Type: "openmetrics", + Ports: []int{goodPort}, + Path: "/metrics", + } + + p := NewOpenMetricsProber(WithFailureTTL(time.Second)) + r, ok := p.Probe(context.Background(), cfg, svc) + if !ok { + t.Fatal("expected probe success") + } + if int(r.Port) != goodPort { + t.Fatalf("port: got %d want %d", r.Port, goodPort) + } +} + +func TestProbe_AllFailReturnsFalse(t *testing.T) { + bad := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(404) + })) + defer bad.Close() + host, portStr, _ := net.SplitHostPort(bad.Listener.Addr().String()) + port, _ := strconv.Atoi(portStr) + + svc := &fakeService{ + id: "container_id://xyz", + hosts: map[string]string{"bridge": host}, + ports: []workloadmeta.ContainerPort{{Port: port}}, + } + cfg := &integration.DiscoveryConfig{Type: "openmetrics", Path: "/metrics"} + + p := NewOpenMetricsProber(WithFailureTTL(time.Second)) + if _, ok := p.Probe(context.Background(), cfg, svc); ok { + t.Fatal("expected probe failure") + } +} +``` + +If the `listeners.Service` interface requires more methods than the stub provides, expand the stub to satisfy it. Run `go doc github.com/DataDog/datadog-agent/comp/core/autodiscovery/listeners.Service` to read the full interface. + +- [ ] **Step 2: Run the test and confirm it fails.** + +Run: `dda inv test --targets=./comp/core/autodiscovery/discovery/ -- -run TestVerifyOpenMetricsResponse` +Expected: FAIL — `undefined: verifyOpenMetricsResponse`. + +- [ ] **Step 3: Implement.** + +Create `comp/core/autodiscovery/discovery/openmetrics_prober.go`: + +```go +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). 
+// Copyright 2016-present Datadog, Inc. + +package discovery + +import ( + "context" + "crypto/sha256" + "encoding/hex" + "fmt" + "io" + "net" + "net/http" + "regexp" + "strconv" + "strings" + "time" + + "github.com/DataDog/datadog-agent/comp/core/autodiscovery/integration" + "github.com/DataDog/datadog-agent/comp/core/autodiscovery/listeners" + "github.com/DataDog/datadog-agent/pkg/util/log" +) + +const ( + defaultPath = "/metrics" + defaultPerProbe = 500 * time.Millisecond + defaultBudget = 2 * time.Second + defaultMaxAttempts = 8 + defaultFailureTTL = 30 * time.Second +) + +var promLineRe = regexp.MustCompile(`^[a-zA-Z_:][a-zA-Z0-9_:]*(\{[^}]*\})?\s+\S+`) + +// OpenMetricsProberOption configures an OpenMetricsProber. +type OpenMetricsProberOption func(*openMetricsProber) + +// WithFailureTTL overrides the negative-cache TTL. +func WithFailureTTL(d time.Duration) OpenMetricsProberOption { + return func(p *openMetricsProber) { p.failureTTL = d } +} + +type openMetricsProber struct { + client *http.Client + cache *probeCache + perProbe time.Duration + totalBudget time.Duration + maxAttempts int + failureTTL time.Duration +} + +// NewOpenMetricsProber returns a Prober that verifies OpenMetrics endpoints. 
+func NewOpenMetricsProber(opts ...OpenMetricsProberOption) Prober { + p := &openMetricsProber{ + client: &http.Client{Transport: &http.Transport{DisableKeepAlives: true}}, + cache: newProbeCache(time.Now), + perProbe: defaultPerProbe, + totalBudget: defaultBudget, + maxAttempts: defaultMaxAttempts, + failureTTL: defaultFailureTTL, + } + for _, o := range opts { + o(p) + } + return p +} + +func (p *openMetricsProber) Probe(ctx context.Context, cfg *integration.DiscoveryConfig, svc listeners.Service) (ProbeResult, bool) { + if cfg == nil || cfg.Type != "openmetrics" { + return ProbeResult{}, false + } + host, ok := pickHost(svc) + if !ok { + log.Debugf("autodiscovery/discovery: %s has no host, skipping", svc.GetServiceID()) + return ProbeResult{}, false + } + exposed, err := svc.GetPorts() + if err != nil || len(exposed) == 0 { + return ProbeResult{}, false + } + + cfgHash := hashDiscoveryConfig(cfg) + if r, success, hit := p.cache.get(svc.GetServiceID(), cfgHash); hit { + return r, success + } + + path := cfg.Path + if path == "" { + path = defaultPath + } + candidates := candidatePorts(cfg.Ports, exposed) + deadline := time.Now().Add(p.totalBudget) + + attempts := 0 + for _, port := range candidates { + if attempts >= p.maxAttempts || time.Now().After(deadline) { + break + } + attempts++ + if p.tryPort(ctx, host, port, path) { + r := ProbeResult{Port: port} + p.cache.putSuccess(svc.GetServiceID(), cfgHash, r) + log.Infof("autodiscovery/discovery: probe matched %s:%d%s for %s", host, port, path, svc.GetServiceID()) + return r, true + } + } + + p.cache.putFailure(svc.GetServiceID(), cfgHash, p.failureTTL) + log.Debugf("autodiscovery/discovery: %d candidate(s) for %s did not match", len(candidates), svc.GetServiceID()) + return ProbeResult{}, false +} + +func (p *openMetricsProber) tryPort(ctx context.Context, host string, port uint16, path string) bool { + url := "http://" + net.JoinHostPort(host, strconv.Itoa(int(port))) + path + tctx, cancel := 
context.WithTimeout(ctx, p.perProbe) + defer cancel() + req, err := http.NewRequestWithContext(tctx, http.MethodGet, url, nil) + if err != nil { + return false + } + resp, err := p.client.Do(req) + if err != nil { + return false + } + defer resp.Body.Close() + body, err := io.ReadAll(io.LimitReader(resp.Body, 64*1024)) + if err != nil { + return false + } + return verifyOpenMetricsResponse(resp.StatusCode, resp.Header.Get("Content-Type"), body) +} + +func verifyOpenMetricsResponse(status int, contentType string, body []byte) bool { + if status != http.StatusOK { + return false + } + ct := strings.ToLower(contentType) + if !strings.HasPrefix(ct, "text/plain") && !strings.HasPrefix(ct, "application/openmetrics-text") { + return false + } + for _, line := range strings.Split(string(body), "\n") { + s := strings.TrimSpace(line) + if s == "" || strings.HasPrefix(s, "#") { + continue + } + return promLineRe.MatchString(s) + } + return false +} + +func pickHost(svc listeners.Service) (string, bool) { + hosts, err := svc.GetHosts() + if err != nil || len(hosts) == 0 { + return "", false + } + if h, ok := hosts["bridge"]; ok && h != "" { + return h, true + } + for _, h := range hosts { + if h != "" { + return h, true + } + } + return "", false +} + +func hashDiscoveryConfig(cfg *integration.DiscoveryConfig) string { + h := sha256.New() + fmt.Fprintf(h, "%s|%s|", cfg.Type, cfg.Path) + for _, p := range cfg.Ports { + fmt.Fprintf(h, "%d,", p) + } + return hex.EncodeToString(h.Sum(nil)) +} +``` + +- [ ] **Step 4: Run the tests.** + +Run: `dda inv test --targets=./comp/core/autodiscovery/discovery/` +Expected: PASS. + +- [ ] **Step 5: Commit.** + +```bash +git add comp/core/autodiscovery/discovery/openmetrics_prober.go comp/core/autodiscovery/discovery/openmetrics_prober_test.go +git commit -m "autodiscovery/discovery: OpenMetrics prober + +HTTP-GET each candidate port + path with a 500ms per-probe budget +and a 2s overall budget. 
Verify Content-Type is text/plain or +application/openmetrics-text and that the body's first non-comment +line is a Prometheus exposition line. Cache success/failure per +(serviceID, config hash). + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +## Task 9: Service wrapper that injects `discovered_port` via `GetExtraConfig` + +A small adapter so `%%discovered_port%%` lookup goes through the existing `GetExtraConfig` path on `listeners.Service`. Keeps the resolver unchanged. + +**Files:** +- Create: `comp/core/autodiscovery/discovery/service_wrapper.go` +- Create: `comp/core/autodiscovery/discovery/service_wrapper_test.go` + +- [ ] **Step 1: Write the failing test.** + +Create `comp/core/autodiscovery/discovery/service_wrapper_test.go`: + +```go +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +package discovery + +import "testing" + +func TestServiceWithProbeResult_GetExtraConfig(t *testing.T) { + base := &fakeService{id: "svc"} + w := WrapWithProbeResult(base, ProbeResult{Port: 8090}) + + v, err := w.GetExtraConfig("discovered_port") + if err != nil { + t.Fatalf("error: %v", err) + } + if v != "8090" { + t.Fatalf("got %q want 8090", v) + } + + if _, err := w.GetExtraConfig("unknown"); err == nil { + t.Fatal("expected error for unknown extra key") + } +} +``` + +- [ ] **Step 2: Run the test and confirm it fails.** + +Run: `dda inv test --targets=./comp/core/autodiscovery/discovery/ -- -run TestServiceWithProbeResult` +Expected: FAIL — `undefined: WrapWithProbeResult`. + +- [ ] **Step 3: Implement.** + +Create `comp/core/autodiscovery/discovery/service_wrapper.go`: + +```go +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. 
+// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +package discovery + +import ( + "strconv" + + "github.com/DataDog/datadog-agent/comp/core/autodiscovery/listeners" +) + +// WrapWithProbeResult returns a Service that overlays ProbeResult-derived +// values on the underlying Service via GetExtraConfig. Today only +// "discovered_port" is exposed. +func WrapWithProbeResult(svc listeners.Service, r ProbeResult) listeners.Service { + return &serviceWithProbeResult{Service: svc, result: r} +} + +type serviceWithProbeResult struct { + listeners.Service + result ProbeResult +} + +func (s *serviceWithProbeResult) GetExtraConfig(key string) (string, error) { + if key == "discovered_port" { + return strconv.Itoa(int(s.result.Port)), nil + } + return s.Service.GetExtraConfig(key) +} +``` + +- [ ] **Step 4: Run the test and confirm it passes.** + +Run: `dda inv test --targets=./comp/core/autodiscovery/discovery/ -- -run TestServiceWithProbeResult` +Expected: PASS. + +- [ ] **Step 5: Commit.** + +```bash +git add comp/core/autodiscovery/discovery/service_wrapper.go comp/core/autodiscovery/discovery/service_wrapper_test.go +git commit -m "autodiscovery/discovery: service wrapper exposing discovered_port + +Tiny shim so %%discovered_port%% resolution can flow through the +existing GetExtraConfig path; no resolver signature change required. + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +## Task 10: Add `%%discovered_port%%` template variable to `tmplvar` + +Register the new top-level variable. Behavioural minimum: `%%discovered_port%%` resolves via `GetExtraConfig("discovered_port")`. 
+ +**Files:** +- Modify: `pkg/util/tmplvar/resolver.go` +- Modify: `pkg/util/tmplvar/resolver_test.go` + +- [ ] **Step 1: Write the failing test.** + +Append to `pkg/util/tmplvar/resolver_test.go`: + +```go +func TestResolveDiscoveredPort(t *testing.T) { + res := &mockResolvable{ + extraConfig: map[string]string{ + "discovered_port": "8090", + }, + } + r := NewTemplateResolver(YAMLParser, nil, false) + out, err := r.ResolveDataWithTemplateVars([]byte(`url: "http://example:%%discovered_port%%/metrics"`+"\n"), res) + if err != nil { + t.Fatalf("err: %v", err) + } + if got, want := strings.TrimSpace(string(out)), `url: http://example:8090/metrics`; got != want { + t.Fatalf("got %q want %q", got, want) + } +} + +func TestResolveDiscoveredPort_MissingErrors(t *testing.T) { + res := &mockResolvable{} + r := NewTemplateResolver(YAMLParser, nil, false) + _, err := r.ResolveDataWithTemplateVars([]byte(`url: "http://example:%%discovered_port%%/metrics"`+"\n"), res) + if err == nil { + t.Fatal("expected error when discovered_port is unavailable") + } +} +``` + +If `mockResolvable` does not yet have an `extraConfig` field, augment it. Inspect the existing definition (around line 50) and extend. + +- [ ] **Step 2: Run the test and confirm it fails.** + +Run: `dda inv test --targets=./pkg/util/tmplvar/ -- -run TestResolveDiscoveredPort` +Expected: FAIL — `discovered` is not a known template variable; substitution error. + +- [ ] **Step 3: Implement.** + +3a. In `pkg/util/tmplvar/resolver.go`, find `NewTemplateResolver` (line ~95). Inside the `templateVariables` map, add a new entry: + +```go + "discovered": GetDiscoveredPort, +``` + +3b. After the existing `GetAdditionalTplVariables` function (around line 467–479), add: + +```go +// GetDiscoveredPort resolves the %%discovered_port%% template variable. It is +// populated by the autodiscovery/discovery package when a probe matches a +// service. The value flows in via GetExtraConfig on a Service wrapper. 
+func GetDiscoveredPort(tplVar string, res Resolvable) (string, error) { + if tplVar != "port" { + return "", noResolverError(fmt.Sprintf("unsupported %%discovered_%s%% variable; only %%discovered_port%% is recognised", tplVar)) + } + v, err := res.GetExtraConfig("discovered_port") + if err != nil || v == "" { + return "", noResolverError("discovered_port not available — autodiscovery probe did not run or did not match") + } + return v, nil +} +``` + +- [ ] **Step 4: Run the test and confirm it passes.** + +Run: `dda inv test --targets=./pkg/util/tmplvar/` +Expected: PASS (the new tests AND the full pre-existing suite). + +- [ ] **Step 5: Commit.** + +```bash +git add pkg/util/tmplvar/resolver.go pkg/util/tmplvar/resolver_test.go +git commit -m "tmplvar: add %%discovered_port%% template variable + +Routes via Resolvable.GetExtraConfig("discovered_port"). Populated by +autodiscovery/discovery's serviceWithProbeResult wrapper after a +successful probe. + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +## Task 11: Wire the prober into `configmgr` reconcile path + +This is the integration step where the prober actually runs. + +**Files:** +- Modify: `comp/core/autodiscovery/autodiscoveryimpl/configmgr.go` + +- [ ] **Step 1: Read the surrounding code first.** + +Open `comp/core/autodiscovery/autodiscoveryimpl/configmgr.go` and read the `resolveTemplateForService` function (around line 409) and where it's called from (search for `resolveTemplateForService(`). Also locate the constructor for `reconcilingConfigManager` (search for `func newReconcilingConfigManager` or `func NewReconciling`). Identify how dependencies (logger, secrets resolver, healthPlatform) are injected — we'll add the Prober alongside. + +- [ ] **Step 2: Add a `prober` field on `reconcilingConfigManager`.** + +Find the `reconcilingConfigManager` struct definition (likely at the top of `configmgr.go`). 
Add a field: + +```go + prober discovery.Prober +``` + +Add the import: + +```go + "github.com/DataDog/datadog-agent/comp/core/autodiscovery/discovery" +``` + +In the constructor that builds `reconcilingConfigManager`, add a parameter `prober discovery.Prober` and assign `cm.prober = prober`. Update all call sites of the constructor (use `git grep -n "newReconcilingConfigManager\|NewReconcilingConfigManager"` to find them) — pass `discovery.NewOpenMetricsProber()` from the AutoConfig wiring (the agent main composer file in `comp/core/autodiscovery/autodiscoveryimpl/autoconfig.go` is the natural site). + +- [ ] **Step 3: Modify `resolveTemplateForService` to run the prober when the template demands it.** + +Replace the existing `resolveTemplateForService` body (lines ~409–428) with: + +```go +func (cm *reconcilingConfigManager) resolveTemplateForService(tpl integration.Config, svc listeners.Service) (integration.Config, bool) { + digest := tpl.Digest() + resolvedSvc := svc + + if tpl.Discovery != nil { + result, ok := cm.prober.Probe(context.Background(), tpl.Discovery, svc) + if !ok { + msg := fmt.Sprintf("discovery probe did not match for template %s and service %s", tpl.Name, svc.GetServiceID()) + log.Debugf("autodiscovery: %s", msg) + errorStats.setResolveWarning(tpl.Name, msg) + return tpl, false + } + resolvedSvc = discovery.WrapWithProbeResult(svc, result) + } + + config, err := configresolver.Resolve(tpl, resolvedSvc) + if err != nil { + msg := fmt.Sprintf("error resolving template %s for service %s: %v", tpl.Name, svc.GetServiceID(), err) + log.Errorf("autodiscovery: skipping config - %s", msg) + errorStats.setResolveWarning(tpl.Name, msg) + cm.reportTemplateResolutionFailure(tpl, svc, err) + return tpl, false + } + resolvedConfig, err := decryptConfig(config, cm.secretResolver, digest) + if err != nil { + msg := fmt.Sprintf("error decrypting secrets in config %s for service %s: %v", config.Name, svc.GetServiceID(), err) + 
errorStats.setResolveWarning(tpl.Name, msg) + return config, false + } + errorStats.removeResolveWarnings(tpl.Name) + cm.clearTemplateResolutionFailure(tpl, svc) + return resolvedConfig, true +} +``` + +Add `"context"` to the import block if it isn't already imported. + +- [ ] **Step 4: Build and lint.** + +Run: `dda inv test --targets=./comp/core/autodiscovery/autodiscoveryimpl/` +Expected: PASS. + +Then run the linter: +Run: `dda inv linter.go --targets=./comp/core/autodiscovery/autodiscoveryimpl/ ./comp/core/autodiscovery/discovery/ ./comp/core/autodiscovery/integration/ ./comp/core/autodiscovery/providers/ ./pkg/util/tmplvar/` +Expected: PASS. + +- [ ] **Step 5: Commit.** + +```bash +git add comp/core/autodiscovery/autodiscoveryimpl/configmgr.go comp/core/autodiscovery/autodiscoveryimpl/autoconfig.go +git commit -m "autodiscovery: run discovery probe before resolving discovery templates + +When a Config has Discovery set, run the OpenMetrics prober against +the matched Service before configresolver.Resolve. On match wrap the +service so %%discovered_port%% resolves; on no match skip scheduling +the check (logged at DEBUG). + +Co-Authored-By: Claude Opus 4.7 (1M context) " +``` + +--- + +## Task 12: Build the agent + +Verify the full Agent compiles before we run a docker container. + +**Files:** none. + +- [ ] **Step 1: Run the full unit test sweep across touched packages.** + +Run: +```bash +dda inv test --targets=./comp/core/autodiscovery/integration/ \ + ./comp/core/autodiscovery/providers/ \ + ./comp/core/autodiscovery/discovery/ \ + ./comp/core/autodiscovery/autodiscoveryimpl/ \ + ./pkg/util/tmplvar/ +``` +Expected: PASS. + +- [ ] **Step 2: Build the agent.** + +Run: `dda inv agent.build --build-exclude=systemd` +Expected: agent binary at `bin/agent/agent`. + +- [ ] **Step 3: Sanity check the binary.** + +Run: `./bin/agent/agent version` +Expected: prints a version string and exits 0. + +- [ ] **Step 4: No commit needed.** (The build is verification only.) 
+
+---
+
+## Task 13: Demo scenario 1 — default port
+
+End-to-end run with KrakenD on port 8090. Demonstrates the hint-port path.
+
+**Files:** none. Manual test.
+
+- [ ] **Step 1: Start KrakenD via its dev-env compose file.**
+
+Run:
+```bash
+cd /home/vagrant/go/src/github.com/DataDog/integrations-core/krakend/tests/docker
+docker compose up -d
+```
+Expected: krakend, backend, and any sidecars come up healthy. Verify with `docker compose ps`.
+
+- [ ] **Step 2: Confirm the OpenMetrics endpoint is live.**
+
+Run: `curl -s -o /dev/null -w "%{http_code} %{content_type}\n" http://localhost:8090/metrics`
+Expected: `200 text/plain; ...`. If different, abort and investigate.
+
+- [ ] **Step 3: Run the local Agent docker container with the locally built binary + krakend integration source bind-mounted.**
+
+Use the helper script per `integrations-core/reference_docker_integration_testing.md`:
+
+```bash
+/home/vagrant/go/src/github.com/DataDog/experimental/users/vincent.whitchurch/hacks/bin/docker-agent-run.sh \
+  -v "/home/vagrant/go/src/github.com/DataDog/integrations-core/krakend/datadog_checks/krakend:/opt/datadog-agent/embedded/lib/python3.13/site-packages/datadog_checks/krakend" \
+  -v "/home/vagrant/go/src/github.com/DataDog/datadog-agent/bin/agent/agent:/opt/datadog-agent/bin/agent/agent" \
+  -d datadog/agent:nightly-main-py3-jmx
+```
+
+Expected: container `dd-agent-foo` running. Find the krakend container's IP on the docker network (`docker inspect <krakend-container> | grep IPAddress`) — the agent container needs to be on the same network or the krakend container's published 8090 port must be reachable. If they aren't on the same network, attach the agent: `docker network connect <compose-network> dd-agent-foo` (find the network name with `docker network ls`).
+
+- [ ] **Step 4: Wait ~30s for AD reconciliation, then check the agent status.**
+
+Run: `docker exec dd-agent-foo agent status | sed -n '/krakend (/,/^[A-Z]/p'`
+Expected: a `krakend` instance section appears with `Instance ID: krakend: [OK]`. 
The `Configuration source` shows the path to `auto_conf_discovery.yaml`. The instance config block contains `openmetrics_endpoint: http://:8090/metrics`. + +- [ ] **Step 5: Confirm metrics flow.** + +Run: `docker logs dd-agent-foo 2>&1 | grep -iE "krakend|discovery: probe matched"` +Expected: at least one line `autodiscovery/discovery: probe matched :8090/metrics for ...`. Check that the krakend check itself is running (no `[ERROR]` or `[WARNING]` mentions of the check). + +- [ ] **Step 6: No commit. Capture observations as a quick note for follow-up.** + +--- + +## Task 14: Demo scenario 2 — non-default port + +Re-run with KrakenD listening on port 9000 instead of 8090. Demonstrates fallback scan. + +**Files:** none. + +- [ ] **Step 1: Stop scenario 1.** + +Run: +```bash +cd /home/vagrant/go/src/github.com/DataDog/integrations-core/krakend/tests/docker +docker compose down +docker rm -f dd-agent-foo +``` + +- [ ] **Step 2: Reconfigure KrakenD to listen on 9000.** + +Edit `integrations-core/krakend/tests/docker/krakend.json` (the field is the top-level `port`). Save the change locally — do not commit it. Update the docker-compose port mapping (`ports:`) accordingly: `"9000:9000"` and the EXPOSE in the compose file. + +- [ ] **Step 3: Bring KrakenD back up.** + +Run: `docker compose up -d` from the same directory. + +- [ ] **Step 4: Verify the metrics endpoint is at 9000.** + +Run: `curl -s -o /dev/null -w "%{http_code}\n" http://localhost:9000/metrics` +Expected: `200`. + +- [ ] **Step 5: Re-run the agent.** + +Same `docker-agent-run.sh` invocation as in Task 13 Step 3. + +- [ ] **Step 6: Verify the agent discovered port 9000 via the fallback scan.** + +Run: `docker exec dd-agent-foo agent status | sed -n '/krakend (/,/^[A-Z]/p'` +Expected: `openmetrics_endpoint: http://:9000/metrics`. + +Run: `docker logs dd-agent-foo 2>&1 | grep "probe matched"` +Expected: a single line referencing port 9000. 
+ +- [ ] **Step 7: Revert the krakend.json + compose changes (do not commit them).** + +Run: `git -C /home/vagrant/go/src/github.com/DataDog/integrations-core checkout -- krakend/tests/docker/` +Expected: changes reverted. + +--- + +## Task 15: Demo scenario 3 — negative case + +A container that matches the `krakend` ad_identifier but does not serve OpenMetrics. The probe should fail and no check should be scheduled. + +**Files:** none. + +- [ ] **Step 1: Stop the previous scenario.** + +Run: +```bash +docker compose -f /home/vagrant/go/src/github.com/DataDog/integrations-core/krakend/tests/docker/docker-compose.yml down || true +docker rm -f dd-agent-foo || true +``` + +- [ ] **Step 2: Start a non-KrakenD container labelled with the krakend ad_identifier.** + +Run: +```bash +docker run -d --name fake-krakend --label com.datadoghq.ad.check_names='["krakend"]' --label com.datadoghq.ad.init_configs='[{}]' --label com.datadoghq.ad.instances='[{}]' nginx:alpine +``` +This labels nginx so AutoDiscovery sees `krakend` as the ad_identifier match (via the labels listener). Nginx serves HTML at `/`, not OpenMetrics — so the probe must fail. + +- [ ] **Step 3: Run the agent.** + +Same docker-agent-run.sh invocation as in Task 13 Step 3. + +- [ ] **Step 4: Verify the negative outcome.** + +Run: `docker exec dd-agent-foo agent status | grep -A3 'krakend'` +Expected: NO running krakend instance. (The check should be unconfigured.) + +Run: `docker logs dd-agent-foo 2>&1 | grep -iE "discovery probe did not match|did not match|krakend"` +Expected: a DEBUG line `autodiscovery: discovery probe did not match for template krakend and service ...`. No traceback, no error spam. + +- [ ] **Step 5: Tear down.** + +Run: +```bash +docker rm -f fake-krakend dd-agent-foo +``` + +- [ ] **Step 6: No commit. Record results in a follow-up note for the user.** + +--- + +## Task 16: Open draft PRs + +Two PRs (one per repo). Both as drafts per `datadog-agent/CLAUDE_PERSONAL.md`. 
+ +- [ ] **Step 1: Push the integrations-core branch.** + +```bash +cd /home/vagrant/go/src/github.com/DataDog/integrations-core +git push -u origin vitkyrka/disco-autoconfig +``` + +- [ ] **Step 2: Open the integrations-core draft PR.** + +```bash +gh pr create --draft --title "krakend: declarative auto_conf_discovery.yaml" --body "$(cat <<'EOF' +## Summary +- Adds `auto_conf_discovery.yaml` to the krakend integration declaring an OpenMetrics probe spec (port 8090, /metrics). +- Pairs with the agent-side change in datadog-agent that consumes this file format. + +## Test plan +- [ ] Build the matching agent change in datadog-agent. +- [ ] Bring up krakend dev env (`tests/docker/docker-compose.yml`). +- [ ] Run agent with locally built binary + this integration source bind-mounted; confirm krakend check schedules with `openmetrics_endpoint: http://:8090/metrics`. +- [ ] Repeat with krakend on port 9000; confirm fallback scan finds it. + +🤖 Generated with [Claude Code](https://claude.com/claude-code) +EOF +)" +``` + +- [ ] **Step 3: Push the datadog-agent branch.** + +```bash +cd /home/vagrant/go/src/github.com/DataDog/datadog-agent +git push -u origin vitkyrka/advanced-autoconfig-krakend +``` + +- [ ] **Step 4: Open the datadog-agent draft PR.** + +```bash +gh pr create --draft --title "autodiscovery: declarative discovery probes (KrakenD experiment)" --body "$(cat <<'EOF' +## Summary +- New file format `auto_conf_discovery.yaml` parsed by the file config provider. +- New `comp/core/autodiscovery/discovery` package with an OpenMetrics prober (HTTP GET + Prometheus-line verification + 30s negative cache). +- New `%%discovered_port%%` template variable, populated via a Service wrapper after a successful probe. +- AutoDiscovery's reconcile path now runs the prober before resolving any template that has a `discovery:` block; on no-match the check is not scheduled (logged at DEBUG). 
+
+Targets the `generic-openmetrics-scan` bucket from the integrations-core analysis (51/260 integrations, 20%).
+
+## Test plan
+- [ ] `dda inv test --targets=./comp/core/autodiscovery/...` and `./pkg/util/tmplvar/` pass.
+- [ ] `dda inv linter.go` clean on touched packages.
+- [ ] End-to-end with the krakend dev env (default port 8090, non-default port 9000, negative case with mislabelled container) — see DSCVR/6650004331.
+
+## Companion PR
+- integrations-core: <companion PR URL>
+
+🤖 Generated with [Claude Code](https://claude.com/claude-code)
+EOF
+)"
+```
+
+- [ ] **Step 5: Cross-link the PRs.**
+
+After both URLs exist, edit each PR body and replace the `<companion PR URL>` placeholder. Use `gh pr edit <pr-url> --body "$(cat <<'EOF' ... EOF)"`.
+
+- [ ] **Step 6: Report PR URLs back to the user.**
+
+---
+
+## Self-review notes
+
+- **Spec coverage check:** Architecture (Tasks 3, 4, 11), file format (Task 1), probe semantics (Tasks 6, 7, 8), `%%discovered_port%%` (Task 10), demo (Tasks 13–15). Risks-to-verify section is exercised by Task 13 Step 3 (network attach) and Step 4 (reconciliation timing).
+- **Spec deviation:** the spec's "extended `Resolve` signature" is replaced with a `serviceWithProbeResult` wrapper. Strictly cleaner — no API change. Recorded in the file structure section above.
+- **One spec item not in a task:** "Cluster-agent / kube_service / kube_endpoints listeners" — explicitly out of scope per the spec's non-goals; intentionally not in any task.
+- **Type consistency:** `DiscoveryConfig` is in `comp/core/autodiscovery/integration` and used (not redefined) by `comp/core/autodiscovery/discovery`. `ProbeResult` is `discovery`-package-only. `Prober` is the only interface; the prober tests use it through `NewOpenMetricsProber`.
+- **Test discipline:** Tasks 3–10 are TDD (test → fail → implement → pass → commit). Tasks 11–15 are integration/manual.
+- **No `go test`:** every test command goes through `dda inv test --targets=...` per `datadog-agent/CLAUDE_PERSONAL.md`. 
diff --git a/docs/superpowers/plans/2026-05-06-discover-demo-integrations.md b/docs/superpowers/plans/2026-05-06-discover-demo-integrations.md new file mode 100644 index 0000000000000..5af4b635987ab --- /dev/null +++ b/docs/superpowers/plans/2026-05-06-discover-demo-integrations.md @@ -0,0 +1,547 @@ +# Plan C: Demo Integrations for Python `discover()` Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Demonstrate the Python `discover()` advanced auto-config path against three real integrations covering distinct discovery patterns: airflow (HTTP multi-step version detection), twemproxy (TCP banner JSON), hdfs_namenode (HTTP JSON-shape verification). Each ships a working `discover()` classmethod, a presence-marker `auto_conf_discovery.yaml`, and an `@pytest.mark.e2e` test that exercises end-to-end discovery against the integration's existing docker-compose fixture. + +**Architecture:** Each integration adds a small `discover(service)` classmethod on its existing check class that uses Plan A helpers (`candidate_ports`, `http_probe`/`tcp_probe`, verifier predicates). The `auto_conf_discovery.yaml` carries `ad_identifiers` + `discovery: {}` + `instances: []` (the parser change in Plan B Task 11 accepts the empty-instances form when `discovery` is set). E2e tests use `dd_agent_check(..., discovery_min_instances=1, discovery_timeout=30)` against the existing `tests/compose/docker-compose.yaml` fixture, mirroring the krakend e2e (`krakend/tests/test_e2e.py:test_e2e_discovery`). + +**Tech Stack:** Python 3.13, `datadog_checks_base.utils.discovery` (Plan A helpers), pytest + ddev e2e harness (`@pytest.mark.e2e` + `dd_agent_check`). 
+
+**Spec / context:**
+- Design spec: [`docs/superpowers/specs/2026-05-06-advanced-autoconfig-discover-design.md`](../specs/2026-05-06-advanced-autoconfig-discover-design.md).
+- Plan A (Python helpers) shipped: `vitkyrka/disco-autoconfig` branch on this repo.
+- Plan B (agent-side bridge + lazy-init) shipped on `datadog-agent` branch `vitkyrka/advanced-autoconfig-krakend`.
+- Krakend reference e2e: `krakend/tests/test_e2e.py:test_e2e_discovery`.
+
+**Working directory:** `/home/vagrant/go/src/github.com/DataDog/integrations-core`. Branch: `vitkyrka/disco-autoconfig`.
+
+## File Structure
+
+For each integration `<integration>`:
+- Modify: `<integration>/datadog_checks/<integration>/<integration>.py` — add `discover(cls, service)` classmethod on the existing check class.
+- Create: `<integration>/datadog_checks/<integration>/data/auto_conf_discovery.yaml` — `ad_identifiers`, `discovery: {}`, empty `instances: []`.
+- Modify: `<integration>/tests/test_e2e.py` (or create if absent) — add `test_e2e_discovery`.
+- Create: `<integration>/changelog.d/<PR>.added` — one-line entry via `ddev release changelog new added <integration>`.
+
+Plus one shared change:
+- `datadog_checks_base/datadog_checks/base/utils/discovery/verifiers.py` — add `response_json_keys(required_keys)` TCP verifier (twemproxy needs it; mirrors HTTP `json_has`). Plus tests.
+
+## Test Command
+
+The user's invocation pattern:
+
+```bash
+DDEV_E2E_AGENT=datadog/agent-dev:discovery-local DDEV_E2E_DOCKER_NO_PULL=1 \
+  ddev env test --dev <integration> <env>
+```
+
+Where `<env>` is one of the integration's hatch envs (e.g. `py3.13-2.10` for krakend). Use `ddev env show <integration>` to list envs.
+
+The custom image `datadog/agent-dev:discovery-local` is a local agent build with the Plan B changes; the user has already produced this image. The plan assumes it remains available across plan execution. 
+ +For unit-only tests during development, use the Plan A test workflow: +```bash +hatch -e datadog-harbor run pytest /tests/test_unit.py -v +``` + +--- + +### Task 1: Add `response_json_keys` TCP verifier to datadog_checks_base + +twemproxy's stats port emits a JSON document on TCP connect; we need a TCP-side equivalent of the HTTP `json_has` predicate. + +**Files:** +- Modify: `datadog_checks_base/datadog_checks/base/utils/discovery/verifiers.py` +- Modify: `datadog_checks_base/tests/base/utils/discovery/test_verifiers.py` +- Modify: `datadog_checks_base/datadog_checks/base/utils/discovery/__init__.pyi` + +- [ ] **Step 1: Write failing tests** + +Add to `test_verifiers.py`: + +```python +def test_response_json_keys_pass(): + from datadog_checks.base.utils.discovery.verifiers import response_json_keys + body = b'{"service":"nutcracker","source":"x","version":"0.5","total_connections":12}' + assert response_json_keys(["service", "source", "version"])(body) + + +def test_response_json_keys_missing_key(): + from datadog_checks.base.utils.discovery.verifiers import response_json_keys + body = b'{"service":"nutcracker"}' + assert not response_json_keys(["service", "source", "version"])(body) + + +def test_response_json_keys_not_json(): + from datadog_checks.base.utils.discovery.verifiers import response_json_keys + assert not response_json_keys(["x"])(b"not json") +``` + +Update the imports at the top of the file: + +```python +from datadog_checks.base.utils.discovery.verifiers import ( + body_contains, + body_matches, + is_prometheus_exposition, + json_has, + response_equals, + response_json_keys, + response_starts_with, + status_2xx, +) +``` + +```bash +hatch -e datadog-harbor run pytest datadog_checks_base/tests/base/utils/discovery/test_verifiers.py::test_response_json_keys_pass -v +``` + +Expected: ImportError on `response_json_keys`. 
+ +- [ ] **Step 2: Implement** + +Append to `verifiers.py`: + +```python +def response_json_keys(required_keys: Iterable[str]) -> TCPPredicate: + """Verify the TCP response decodes as a JSON object containing all the + required top-level keys. Mirror of ``json_has`` for raw bytes. + """ + keys = tuple(required_keys) + + def predicate(buf: bytes) -> bool: + try: + doc = json.loads(buf.decode("utf-8", errors="strict")) + except (ValueError, UnicodeDecodeError): + return False + if not isinstance(doc, dict): + return False + return all(k in doc for k in keys) + + return predicate +``` + +Add `import json` to the top of the file (next to `import re`). + +- [ ] **Step 3: Update __init__.pyi** + +Add `response_json_keys` to the verifiers re-export block and to `__all__` (alphabetical): + +```python +from .verifiers import ( + body_contains, + body_matches, + is_prometheus_exposition, + json_has, + response_equals, + response_json_keys, + response_starts_with, + status_2xx, +) + +__all__ = [ + 'Discovery', + 'Port', + 'Service', + 'body_contains', + 'body_matches', + 'candidate_ports', + 'http_probe', + 'is_prometheus_exposition', + 'json_has', + 'response_equals', + 'response_json_keys', + 'response_starts_with', + 'status_2xx', + 'tcp_probe', + '_run_discover', +] +``` + +- [ ] **Step 4: Run tests** + +```bash +hatch -e datadog-harbor run pytest datadog_checks_base/tests/base/utils/discovery/ -v +``` + +Expected: all existing tests + 3 new tests pass. + +- [ ] **Step 5: Add changelog entry** + +```bash +ddev release changelog new added datadog_checks_base \ + -m "Add response_json_keys TCP verifier under datadog_checks.base.utils.discovery for advanced auto-config of integrations whose stats port emits JSON on connect (e.g. twemproxy)." 
+``` + +- [ ] **Step 6: Commit** + +```bash +git add datadog_checks_base/datadog_checks/base/utils/discovery/verifiers.py \ + datadog_checks_base/datadog_checks/base/utils/discovery/__init__.pyi \ + datadog_checks_base/tests/base/utils/discovery/test_verifiers.py \ + datadog_checks_base/changelog.d/*.added +git commit -m "datadog_checks_base: add response_json_keys TCP verifier" +``` + +--- + +### Task 2: Airflow `discover()` — HTTP multi-step version detection + +**Pattern:** `http-multi-path`. Probes `/api/v1/version` first; if 2xx, the integration is Airflow 2.x. Otherwise probes `/api/experimental/test`; if 2xx, it's 1.x. Returns a single instance with `url` set to the base URL. + +**Files:** +- Modify: `airflow/datadog_checks/airflow/airflow.py` — add `discover` classmethod to `AirflowCheck`. +- Create: `airflow/datadog_checks/airflow/data/auto_conf_discovery.yaml`. +- Modify: `airflow/tests/test_e2e.py` — add `test_e2e_discovery`. +- Create: `airflow/changelog.d/.added`. + +- [ ] **Step 1: Add the `discover` classmethod** + +In `airflow/datadog_checks/airflow/airflow.py`, find `class AirflowCheck(AgentCheck):` and add this method to the class body (anywhere; top of class is conventional): + +```python + @classmethod + def discover(cls, service): + from datadog_checks.base.utils.discovery import ( + candidate_ports, + http_probe, + status_2xx, + ) + + for port in candidate_ports(service, [8080]): + url = f"http://{service.host}:{port.number}" + # Airflow 2.x: stable REST API at /api/v1. + if http_probe(service.host, port.number, "/api/v1/version", + verifier=status_2xx()): + return [{"url": url}] + # Airflow 1.x: experimental API. 
+ if http_probe(service.host, port.number, "/api/experimental/test", + verifier=status_2xx()): + return [{"url": url}] + return None +``` + +- [ ] **Step 2: Create auto_conf_discovery.yaml** + +`airflow/datadog_checks/airflow/data/auto_conf_discovery.yaml`: + +```yaml +ad_identifiers: + - airflow +discovery: {} +init_config: +instances: [] +``` + +- [ ] **Step 3: Add the e2e test** + +Read the existing `airflow/tests/test_e2e.py` first to understand the integration's current e2e shape and metadata-metrics pattern. Then add a sibling test, mirroring `krakend/tests/test_e2e.py:test_e2e_discovery`: + +```python +@pytest.mark.e2e +def test_e2e_discovery(dd_agent_check): + aggregator = dd_agent_check( + {"init_config": {}, "instances": []}, + check_rate=True, + discovery_min_instances=1, + discovery_timeout=30, + ) + # Airflow's metric set varies by version and the StatsD plugin path; + # at minimum, assert the check ran and submitted *something*. + assert aggregator.metric_names, "expected at least one metric submitted" +``` + +(Use `assert_metrics_using_metadata` if the existing tests in this file already use it and the metadata file is reliable across Airflow versions; otherwise the looser metric-name presence check above is sufficient for proving the discovery path works.) + +- [ ] **Step 4: Run unit tests** + +```bash +hatch -e datadog-harbor run pytest airflow/tests/test_unit.py -v +``` + +Expected: existing tests still pass; the `discover` classmethod doesn't affect the existing check. + +- [ ] **Step 5: Run the e2e test** + +```bash +ddev env show airflow +# pick an env name, e.g. py3.13-2.10 +DDEV_E2E_AGENT=datadog/agent-dev:discovery-local DDEV_E2E_DOCKER_NO_PULL=1 \ + ddev env test --dev airflow +``` + +Expected: `test_e2e_discovery` passes; aggregator received at least one metric. 
+ +If the discovery probe fails, troubleshoot by inspecting the agent container's logs: + +```bash +docker logs $(docker ps --filter ancestor=datadog/agent-dev:discovery-local -q | head -1) 2>&1 | grep -iE "airflow|discoverer|run python check" +``` + +- [ ] **Step 6: Add changelog entry** + +```bash +ddev release changelog new added airflow \ + -m "Support advanced auto-config discovery: discover() probes the webserver REST API to detect Airflow 1.x vs 2.x and returns a resolved instance config without a static auto_conf.yaml template." +``` + +- [ ] **Step 7: Commit** + +```bash +git add airflow/ +git commit -m "airflow: add Python discover() for advanced auto-config" +``` + +--- + +### Task 3: Twemproxy `discover()` — TCP banner with JSON shape + +**Pattern:** `tcp-banner-server-greets`. Twemproxy's stats port (default 22222 per upstream; 2222 in the agent's example/code default) emits a JSON document on TCP connect, no client send needed. The verifier checks the well-known top-level keys `service`, `source`, `version`, `total_connections`. + +**Files:** +- Modify: `twemproxy/datadog_checks/twemproxy/twemproxy.py` — add `discover` classmethod to `Twemproxy`. +- Create: `twemproxy/datadog_checks/twemproxy/data/auto_conf_discovery.yaml`. +- Modify: `twemproxy/tests/test_twemproxy.py` (or create `test_e2e.py` if e2e tests live separately). +- Create: `twemproxy/changelog.d/.added`. 
+ +- [ ] **Step 1: Add the `discover` classmethod** + +In `twemproxy/datadog_checks/twemproxy/twemproxy.py`, find `class Twemproxy(AgentCheck):` and add: + +```python + @classmethod + def discover(cls, service): + from datadog_checks.base.utils.discovery import ( + candidate_ports, + response_json_keys, + tcp_probe, + ) + + for port in candidate_ports(service, [22222, 2222]): + verifier = response_json_keys( + ["service", "source", "version", "total_connections"] + ) + if tcp_probe(service.host, port.number, verifier=verifier, timeout=1.0): + return [{"host": service.host, "port": port.number}] + return None +``` + +(`timeout=1.0` is generous because some twemproxy builds buffer the JSON briefly.) + +- [ ] **Step 2: Create auto_conf_discovery.yaml** + +`twemproxy/datadog_checks/twemproxy/data/auto_conf_discovery.yaml`: + +```yaml +ad_identifiers: + - twemproxy + - nutcracker +discovery: {} +init_config: +instances: [] +``` + +(Both `twemproxy` and `nutcracker` ad-identifiers are listed because the upstream image names vary.) + +- [ ] **Step 3: Add the e2e test** + +Read `twemproxy/tests/test_twemproxy.py` for the existing pattern. If the file has only unit tests (`@pytest.mark.unit`), append an `@pytest.mark.e2e` test at the bottom; if there's already an `@pytest.mark.e2e` test, append a `test_e2e_discovery` sibling. + +```python +@pytest.mark.e2e +def test_e2e_discovery(dd_agent_check): + aggregator = dd_agent_check( + {"init_config": {}, "instances": []}, + check_rate=True, + discovery_min_instances=1, + discovery_timeout=30, + ) + # Twemproxy's most reliable metric is the per-pool client connection + # count, which is non-zero whenever the test backends are connected. + assert aggregator.metric_names, "expected at least one metric submitted" +``` + +The compose file maps the stats port `6222:22222`; the discoverer's hint `[22222, 2222]` will match the container's `22222` (internal port). 
+ +- [ ] **Step 4: Run the e2e test** + +```bash +ddev env show twemproxy +# pick an env, e.g. py3.13-0.4.1 +DDEV_E2E_AGENT=datadog/agent-dev:discovery-local DDEV_E2E_DOCKER_NO_PULL=1 \ + ddev env test --dev twemproxy +``` + +If the test fails because the agent can't reach the twemproxy container's stats port, verify the docker network: the test fixture uses `docker_default` (or similar); the agent container needs to be on the same network. The `dd_agent_check` harness handles this by default. + +- [ ] **Step 5: Add changelog entry + commit** + +```bash +ddev release changelog new added twemproxy \ + -m "Support advanced auto-config discovery: discover() opens a TCP probe on the stats port and verifies the JSON banner emitted on connect." + +git add twemproxy/ +git commit -m "twemproxy: add Python discover() for advanced auto-config" +``` + +--- + +### Task 4: HDFS NameNode `discover()` — HTTP JSON-shape verification + +**Pattern:** `http-json-shape`. The NameNode's HTTP servlet at `/jmx` (port 9870 in Hadoop 3) returns a JSON document `{"beans": [...]}` containing Hadoop MBeans. The verifier requires the top-level `beans` key. + +**Files:** +- Modify: `hdfs_namenode/datadog_checks/hdfs_namenode/hdfs_namenode.py` — add `discover` to `HDFSNameNode`. +- Create: `hdfs_namenode/datadog_checks/hdfs_namenode/data/auto_conf_discovery.yaml`. +- Modify: `hdfs_namenode/tests/test_e2e.py`. +- Create: `hdfs_namenode/changelog.d/.added`. + +- [ ] **Step 1: Add the `discover` classmethod** + +In `hdfs_namenode/datadog_checks/hdfs_namenode/hdfs_namenode.py`, find `class HDFSNameNode(AgentCheck):` and add: + +```python + @classmethod + def discover(cls, service): + from datadog_checks.base.utils.discovery import ( + candidate_ports, + http_probe, + json_has, + ) + + # Hadoop 3 default; Hadoop 2 uses 50070 — listed second so a + # mixed-version cluster prefers Hadoop 3 when both ports + # respond. 
+ for port in candidate_ports(service, [9870, 50070]): + if http_probe(service.host, port.number, "/jmx", + verifier=json_has(["beans"])): + return [{ + "hdfs_namenode_jmx_uri": f"http://{service.host}:{port.number}", + }] + return None +``` + +- [ ] **Step 2: Create auto_conf_discovery.yaml** + +`hdfs_namenode/datadog_checks/hdfs_namenode/data/auto_conf_discovery.yaml`: + +```yaml +ad_identifiers: + - hadoop-namenode + - hdfs-namenode +discovery: {} +init_config: +instances: [] +``` + +(Common image names. The integration's analysis flags the bde2020 image used in tests/compose has `bde2020/hadoop-namenode`; the AD identifier-from-image mapping will match the `hadoop-namenode` slug.) + +- [ ] **Step 3: Add the e2e test** + +Read `hdfs_namenode/tests/test_e2e.py`. Add: + +```python +@pytest.mark.e2e +def test_e2e_discovery(dd_agent_check): + aggregator = dd_agent_check( + {"init_config": {}, "instances": []}, + check_rate=True, + discovery_min_instances=1, + discovery_timeout=30, + ) + assert aggregator.metric_names, "expected at least one metric submitted" +``` + +- [ ] **Step 4: Run the e2e test** + +```bash +ddev env show hdfs_namenode +DDEV_E2E_AGENT=datadog/agent-dev:discovery-local DDEV_E2E_DOCKER_NO_PULL=1 \ + ddev env test --dev hdfs_namenode +``` + +The compose file exposes the NameNode at `9870:9870` and a separate datanode at `9864:9864`. The discoverer's `[9870, 50070]` hints will match port 9870 first. + +A subtlety: the bde2020/hadoop image takes ~30 s to fully initialise. The compose's healthcheck/log-pattern handles that on the integration test side; the e2e test should set `discovery_timeout=30` (it does). If the test still fails on timing, bump to `60`. + +- [ ] **Step 5: Add changelog entry + commit** + +```bash +ddev release changelog new added hdfs_namenode \ + -m "Support advanced auto-config discovery: discover() probes the JMX HTTP servlet at /jmx and verifies the Hadoop-shaped JSON response." 
+ +git add hdfs_namenode/ +git commit -m "hdfs_namenode: add Python discover() for advanced auto-config" +``` + +--- + +### Task 5: Whole-implementation review + +A final pass before declaring Plan C done. + +- [ ] **Step 1: Run the full discovery test suite to confirm no regression** + +```bash +hatch -e datadog-harbor run pytest datadog_checks_base/tests/base/utils/discovery/ -v +``` + +Expected: all Plan A + Task 1 tests pass. + +- [ ] **Step 2: Run all four e2e tests in sequence** + +```bash +DDEV_E2E_AGENT=datadog/agent-dev:discovery-local DDEV_E2E_DOCKER_NO_PULL=1 ddev env test --dev krakend py3.13-2.10 +DDEV_E2E_AGENT=datadog/agent-dev:discovery-local DDEV_E2E_DOCKER_NO_PULL=1 ddev env test --dev airflow +DDEV_E2E_AGENT=datadog/agent-dev:discovery-local DDEV_E2E_DOCKER_NO_PULL=1 ddev env test --dev twemproxy +DDEV_E2E_AGENT=datadog/agent-dev:discovery-local DDEV_E2E_DOCKER_NO_PULL=1 ddev env test --dev hdfs_namenode +``` + +Expected: all four pass. Krakend is the regression sentinel; the new three are the demo expansion. + +- [ ] **Step 3: Confirm no static `auto_conf.yaml` was introduced** + +```bash +ls airflow/datadog_checks/airflow/data/ twemproxy/datadog_checks/twemproxy/data/ hdfs_namenode/datadog_checks/hdfs_namenode/data/ +``` + +Each directory should have `auto_conf_discovery.yaml` and **not** `auto_conf.yaml`. The point of these demos is integrations that didn't already have a working auto-config. 
+ +- [ ] **Step 4: Confirm the four `discover()` methods exhibit four distinct shapes** + +A quick read of each integration's `discover` method should show: + +| Integration | Probe type | Verifier | Verifier source | +|---|---|---|---| +| krakend | HTTP single-path | `is_prometheus_exposition()` | (Plan A) | +| airflow | HTTP multi-path with version branching | `status_2xx()` x2 | (Plan A) | +| twemproxy | TCP banner (server speaks first) | `response_json_keys([...])` | (Task 1, new) | +| hdfs_namenode | HTTP single-path with JSON shape | `json_has(["beans"])` | (Plan A) | + +Four patterns, four verifiers, three buckets covered. The point is to exercise the abstraction across its surface, not to maximise integration count. + +## Self-Review + +**Spec coverage:** +- Plan A's `Service`/`Port` types and helpers are exercised by all four integrations. +- Plan A's verifier predicates (`status_2xx`, `is_prometheus_exposition`, `json_has`) are each used at least once. +- Task 1's `response_json_keys` predicate fills the only verifier gap (JSON-shape over raw TCP bytes). +- Plan B's lazy-init bridge is exercised on every e2e run. + +**Placeholder scan:** Each `discover()` body is concrete (5–15 lines). Each `auto_conf_discovery.yaml` is concrete. Each e2e test is concrete. The `` placeholder in test commands is intentional — it's the per-integration hatch env name (`ddev env show ` lists them). + +**Type consistency:** `service.host`, `service.ports`, `port.number` used the same way in all four `discover` methods, matching the Plan A `Service`/`Port` dataclass shape. + +**Scope:** Plan C is intentionally smaller than Plans A/B. It demonstrates the abstraction works for distinct discovery shapes; it is **not** an exhaustive rollout to all 92 integrations in the targeted analysis buckets. Bulk rollout is a separate effort once the experiment has been reviewed and approved. 
+ +--- + +## Execution Handoff + +Plan complete and saved to `docs/superpowers/plans/2026-05-06-discover-demo-integrations.md`. Two execution options: + +1. **Subagent-Driven (recommended)** — Dispatch a fresh subagent per task, review between tasks. Each task is self-contained and TDD-friendly. +2. **Inline Execution** — Execute tasks in this session via executing-plans. + +Which approach? diff --git a/docs/superpowers/plans/2026-05-06-discover-python-library.md b/docs/superpowers/plans/2026-05-06-discover-python-library.md new file mode 100644 index 0000000000000..05cd953a4b1bd --- /dev/null +++ b/docs/superpowers/plans/2026-05-06-discover-python-library.md @@ -0,0 +1,1071 @@ +# Plan A: Python Discovery Library Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a Python discovery library to `datadog_checks_base` providing the `Service`/`Port` types, candidate-port iteration, HTTP/TCP probe helpers, and verifier predicates that integrations will use to implement `discover(service)` classmethods. + +**Architecture:** Add new modules under the existing `datadog_checks_base/datadog_checks/base/utils/discovery/` package (alongside the existing `Discovery` class for intra-check item filtering, which is unrelated). All helpers are pure-Python, fully unit-testable without the Agent. The Agent-side bridge (Plan B) will populate `Service` instances from `listeners.Service`; until then, tests construct `Service` instances directly. + +**Tech Stack:** Python (datadog_checks_base), pytest, mock, the standard library `requests` and `socket`. 
+ +**Spec:** [`docs/superpowers/specs/2026-05-06-advanced-autoconfig-discover-design.md`](../specs/2026-05-06-advanced-autoconfig-discover-design.md) + +## File Structure + +New files: +- `datadog_checks_base/datadog_checks/base/utils/discovery/service.py` — `Service` and `Port` dataclasses. +- `datadog_checks_base/datadog_checks/base/utils/discovery/ports.py` — `candidate_ports(service, hints)` iterator. +- `datadog_checks_base/datadog_checks/base/utils/discovery/verifiers.py` — predicate factories: `status_2xx`, `body_contains`, `body_matches`, `json_has`, `is_prometheus_exposition`, `response_equals`, `response_starts_with`. +- `datadog_checks_base/datadog_checks/base/utils/discovery/http.py` — `http_probe(host, port, path, *, verify, timeout=0.5)`. +- `datadog_checks_base/datadog_checks/base/utils/discovery/tcp.py` — `tcp_probe(host, port, *, send=b"", verify, timeout=0.5)`. +- `datadog_checks_base/tests/base/utils/discovery/test_service.py` +- `datadog_checks_base/tests/base/utils/discovery/test_ports.py` +- `datadog_checks_base/tests/base/utils/discovery/test_verifiers.py` +- `datadog_checks_base/tests/base/utils/discovery/test_http.py` +- `datadog_checks_base/tests/base/utils/discovery/test_tcp.py` + +Modified: +- `datadog_checks_base/datadog_checks/base/utils/discovery/__init__.pyi` — re-export the new public names. +- `datadog_checks_base/changelog.d/.added` — one-line changelog entry. + +Existing files NOT modified: +- `discovery/discovery.py`, `discovery/cache.py`, `discovery/filter.py` — unrelated (intra-check item filtering); leave alone. + +## Test command + +All tests in this plan run via: + +```bash +ddev --no-interactive test datadog_checks_base -- -k -s +``` + +`-s` keeps stdout visible; `-k ` filters by test name. Without `-k`, the full base test suite runs — useful at the end of each task to confirm no regression. 
+ +--- + +### Task 1: `Service` and `Port` dataclasses + +**Files:** +- Create: `datadog_checks_base/datadog_checks/base/utils/discovery/service.py` +- Create: `datadog_checks_base/tests/base/utils/discovery/test_service.py` + +- [ ] **Step 1: Write failing tests** + +`datadog_checks_base/tests/base/utils/discovery/test_service.py`: + +```python +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +import pytest + +from datadog_checks.base.utils.discovery.service import Port, Service + + +def test_port_defaults(): + p = Port(number=9090) + assert p.number == 9090 + assert p.name == "" + + +def test_port_with_name(): + p = Port(number=9090, name="metrics") + assert p.name == "metrics" + + +def test_port_is_hashable(): + {Port(9090), Port(9091, "metrics")} + + +def test_port_is_immutable(): + p = Port(9090) + with pytest.raises(Exception): + p.number = 9091 # type: ignore[misc] + + +def test_service_basic(): + svc = Service(id="docker://abc", host="10.0.0.1", ports=(Port(9090),)) + assert svc.id == "docker://abc" + assert svc.host == "10.0.0.1" + assert svc.ports == (Port(9090),) + + +def test_service_is_hashable(): + {Service(id="a", host="h", ports=(Port(1),))} + + +def test_service_ports_is_tuple_not_list(): + svc = Service(id="a", host="h", ports=(Port(1), Port(2))) + assert isinstance(svc.ports, tuple) +``` + +- [ ] **Step 2: Run tests to confirm they fail** + +```bash +ddev --no-interactive test datadog_checks_base -- -k test_service -s +``` + +Expected: ImportError / ModuleNotFoundError on `discovery.service`. + +- [ ] **Step 3: Implement the dataclasses** + +`datadog_checks_base/datadog_checks/base/utils/discovery/service.py`: + +```python +# (C) Datadog, Inc. 
2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from dataclasses import dataclass, field + + +@dataclass(frozen=True) +class Port: + number: int + name: str = "" + + +@dataclass(frozen=True) +class Service: + id: str + host: str + ports: tuple[Port, ...] = field(default_factory=tuple) +``` + +- [ ] **Step 4: Run tests to confirm they pass** + +```bash +ddev --no-interactive test datadog_checks_base -- -k test_service -s +``` + +Expected: PASS for all 7 tests. + +- [ ] **Step 5: Commit** + +```bash +git add datadog_checks_base/datadog_checks/base/utils/discovery/service.py \ + datadog_checks_base/tests/base/utils/discovery/test_service.py +git commit -m "datadog_checks_base: add Service and Port dataclasses for discovery" +``` + +--- + +### Task 2: `candidate_ports(service, hints)` + +Iterates ports in this order: hint ports that the service actually exposes (in hint order), then remaining service ports in their original order. Skips duplicates. Hints not exposed by the service are skipped (not probed) — there's nothing to probe. + +**Files:** +- Create: `datadog_checks_base/datadog_checks/base/utils/discovery/ports.py` +- Create: `datadog_checks_base/tests/base/utils/discovery/test_ports.py` + +- [ ] **Step 1: Write failing tests** + +`datadog_checks_base/tests/base/utils/discovery/test_ports.py`: + +```python +# (C) Datadog, Inc. 
2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from datadog_checks.base.utils.discovery.ports import candidate_ports +from datadog_checks.base.utils.discovery.service import Port, Service + + +def _svc(*ports): + return Service(id="x", host="h", ports=tuple(ports)) + + +def test_hint_first_then_rest(): + svc = _svc(Port(8080), Port(9090), Port(80)) + assert list(candidate_ports(svc, [9090])) == [Port(9090), Port(8080), Port(80)] + + +def test_multiple_hints_in_order(): + svc = _svc(Port(80), Port(8080), Port(9090)) + assert list(candidate_ports(svc, [9090, 8080])) == [Port(9090), Port(8080), Port(80)] + + +def test_hint_not_exposed_skipped(): + svc = _svc(Port(80)) + assert list(candidate_ports(svc, [9090])) == [Port(80)] + + +def test_no_hints_returns_service_order(): + svc = _svc(Port(80), Port(9090)) + assert list(candidate_ports(svc, [])) == [Port(80), Port(9090)] + + +def test_no_ports_returns_empty(): + svc = _svc() + assert list(candidate_ports(svc, [9090])) == [] + + +def test_no_duplicates_when_hint_repeats(): + svc = _svc(Port(9090)) + assert list(candidate_ports(svc, [9090, 9090])) == [Port(9090)] +``` + +- [ ] **Step 2: Run tests to confirm they fail** + +```bash +ddev --no-interactive test datadog_checks_base -- -k test_ports -s +``` + +Expected: ImportError on `discovery.ports`. + +- [ ] **Step 3: Implement** + +`datadog_checks_base/datadog_checks/base/utils/discovery/ports.py`: + +```python +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from collections.abc import Iterable, Iterator + +from .service import Port, Service + + +def candidate_ports(service: Service, hints: Iterable[int]) -> Iterator[Port]: + """Yield ports to probe for a service, hint-first then remaining. + + Hints not exposed by the service are skipped; duplicates are collapsed. 
+ """ + by_number = {p.number: p for p in service.ports} + seen: set[int] = set() + for h in hints: + if h in by_number and h not in seen: + seen.add(h) + yield by_number[h] + for p in service.ports: + if p.number not in seen: + seen.add(p.number) + yield p +``` + +- [ ] **Step 4: Run tests to confirm they pass** + +```bash +ddev --no-interactive test datadog_checks_base -- -k test_ports -s +``` + +Expected: PASS for all 6 tests. + +- [ ] **Step 5: Commit** + +```bash +git add datadog_checks_base/datadog_checks/base/utils/discovery/ports.py \ + datadog_checks_base/tests/base/utils/discovery/test_ports.py +git commit -m "datadog_checks_base: add candidate_ports() for discovery probe ordering" +``` + +--- + +### Task 3: Verifier predicates + +Each verifier is a factory that returns a predicate. HTTP verifiers are predicates over `requests.Response`; TCP verifiers are predicates over `bytes`. Predicate factories let the caller compose configuration at class-definition time (`DISCOVERY_VERIFY = body_contains("Total Accesses:")`). + +**Files:** +- Create: `datadog_checks_base/datadog_checks/base/utils/discovery/verifiers.py` +- Create: `datadog_checks_base/tests/base/utils/discovery/test_verifiers.py` + +- [ ] **Step 1: Write failing tests** + +`datadog_checks_base/tests/base/utils/discovery/test_verifiers.py`: + +```python +# (C) Datadog, Inc. 
2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from unittest.mock import Mock + +import pytest + +from datadog_checks.base.utils.discovery.verifiers import ( + body_contains, + body_matches, + is_prometheus_exposition, + json_has, + response_equals, + response_starts_with, + status_2xx, +) + + +def _resp(status=200, content_type="text/plain", body="", json_body=None): + r = Mock() + r.status_code = status + r.headers = {"Content-Type": content_type} + r.text = body + if json_body is not None: + r.json = Mock(return_value=json_body) + else: + r.json = Mock(side_effect=ValueError("not json")) + return r + + +def test_status_2xx_pass(): + assert status_2xx()(_resp(status=200)) + assert status_2xx()(_resp(status=204)) + + +def test_status_2xx_fail(): + assert not status_2xx()(_resp(status=301)) + assert not status_2xx()(_resp(status=500)) + + +def test_body_contains_pass(): + assert body_contains("Total Accesses:")(_resp(body="Total Accesses: 42\n")) + + +def test_body_contains_fail_on_substring_absent(): + assert not body_contains("Total Accesses:")(_resp(body="something else")) + + +def test_body_contains_fail_on_non_2xx(): + assert not body_contains("anything")(_resp(status=500, body="anything")) + + +def test_body_matches_pass(): + assert body_matches(r"^Active connections:")(_resp(body="Active connections: 7\nblah")) + + +def test_body_matches_anchored_to_start_of_a_line_using_multiline_flag(): + # Demonstrates the convention: callers pass plain re patterns; we apply re.MULTILINE. 
+ assert body_matches(r"^server: nginx$")(_resp(body="HTTP/1.1 200 OK\nserver: nginx\n")) + + +def test_body_matches_fail(): + assert not body_matches(r"^Active connections:")(_resp(body="not nginx")) + + +def test_json_has_pass_top_level_keys(): + assert json_has(["version", "leader"])(_resp(json_body={"version": "1.7.0", "leader": "h1"})) + + +def test_json_has_fail_missing_key(): + assert not json_has(["version", "leader"])(_resp(json_body={"version": "1.7.0"})) + + +def test_json_has_fail_not_json(): + assert not json_has(["x"])(_resp(body="")) + + +def test_is_prometheus_exposition_pass_text_plain(): + body = "# HELP foo bar\nfoo 1\n" + assert is_prometheus_exposition()(_resp(content_type="text/plain; version=0.0.4", body=body)) + + +def test_is_prometheus_exposition_pass_openmetrics(): + body = "foo_total 42\n" + assert is_prometheus_exposition()(_resp(content_type="application/openmetrics-text", body=body)) + + +def test_is_prometheus_exposition_rejects_html(): + assert not is_prometheus_exposition()(_resp(content_type="text/html", body="")) + + +def test_is_prometheus_exposition_rejects_garbage_body(): + body = "this is not prometheus" + assert not is_prometheus_exposition()(_resp(content_type="text/plain", body=body)) + + +def test_response_equals_tcp_pass(): + assert response_equals(b"imok")(b"imok") + + +def test_response_equals_tcp_fail(): + assert not response_equals(b"imok")(b"imnotok") + + +def test_response_starts_with_tcp_pass(): + assert response_starts_with(b"+PONG")(b"+PONG\r\n") + + +def test_response_starts_with_tcp_fail(): + assert not response_starts_with(b"+PONG")(b"-ERR\r\n") +``` + +- [ ] **Step 2: Run tests to confirm they fail** + +```bash +ddev --no-interactive test datadog_checks_base -- -k test_verifiers -s +``` + +Expected: ImportError on `discovery.verifiers`. + +- [ ] **Step 3: Implement the verifier predicates** + +`datadog_checks_base/datadog_checks/base/utils/discovery/verifiers.py`: + +```python +# (C) Datadog, Inc. 
2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +"""Predicate factories for discovery probe verification. + +Each public function returns a callable predicate. HTTP predicates take a +``requests.Response`` and return ``bool``. TCP predicates take ``bytes`` and +return ``bool``. The factory shape lets check classes declare verifiers as +class-level attributes, e.g. ``DISCOVERY_VERIFY = body_contains("Total Accesses:")``. +""" +import re +from collections.abc import Callable, Iterable + +_PROM_LINE = re.compile(r"^[a-zA-Z_:][a-zA-Z0-9_:]*(\{[^}]*\})?\s+\S+") + + +HTTPPredicate = Callable[["requests.Response"], bool] # noqa: F821 (forward ref for typing) +TCPPredicate = Callable[[bytes], bool] + + +def status_2xx() -> HTTPPredicate: + def predicate(response) -> bool: + return 200 <= response.status_code < 300 + return predicate + + +def body_contains(needle: str) -> HTTPPredicate: + def predicate(response) -> bool: + return 200 <= response.status_code < 300 and needle in response.text + return predicate + + +def body_matches(pattern: str) -> HTTPPredicate: + compiled = re.compile(pattern, re.MULTILINE) + def predicate(response) -> bool: + if not (200 <= response.status_code < 300): + return False + return bool(compiled.search(response.text)) + return predicate + + +def json_has(required_keys: Iterable[str]) -> HTTPPredicate: + keys = tuple(required_keys) + def predicate(response) -> bool: + if not (200 <= response.status_code < 300): + return False + try: + doc = response.json() + except (ValueError, Exception): + return False + if not isinstance(doc, dict): + return False + return all(k in doc for k in keys) + return predicate + + +def is_prometheus_exposition() -> HTTPPredicate: + """Verify a Prometheus / OpenMetrics exposition response. + + Status must be 2xx, Content-Type must be text/plain or + application/openmetrics-text, and at least one non-comment line must look + like a Prometheus metric line. 
+ """ + def predicate(response) -> bool: + if not (200 <= response.status_code < 300): + return False + ctype = response.headers.get("Content-Type", "").lower() + if not (ctype.startswith("text/plain") or ctype.startswith("application/openmetrics-text")): + return False + for line in response.text.split("\n"): + stripped = line.strip() + if not stripped or stripped.startswith("#"): + continue + return bool(_PROM_LINE.match(stripped)) + return False + return predicate + + +def response_equals(expected: bytes) -> TCPPredicate: + def predicate(buf: bytes) -> bool: + return buf == expected + return predicate + + +def response_starts_with(prefix: bytes) -> TCPPredicate: + def predicate(buf: bytes) -> bool: + return buf.startswith(prefix) + return predicate +``` + +- [ ] **Step 4: Run tests to confirm they pass** + +```bash +ddev --no-interactive test datadog_checks_base -- -k test_verifiers -s +``` + +Expected: PASS for all 17 tests. + +- [ ] **Step 5: Commit** + +```bash +git add datadog_checks_base/datadog_checks/base/utils/discovery/verifiers.py \ + datadog_checks_base/tests/base/utils/discovery/test_verifiers.py +git commit -m "datadog_checks_base: add verifier predicates for discovery probes" +``` + +--- + +### Task 4: `http_probe(host, port, path, *, verify, timeout=0.5)` + +Performs a single GET request, swallows network errors as `False`, returns the predicate's verdict. IPv6 hosts are bracketed for URL use; the caller is expected to pass an already-bracketed host (the Agent-side bridge does this). The default timeout (500 ms) is the per-attempt budget. + +**Files:** +- Create: `datadog_checks_base/datadog_checks/base/utils/discovery/http.py` +- Create: `datadog_checks_base/tests/base/utils/discovery/test_http.py` + +- [ ] **Step 1: Write failing tests** + +`datadog_checks_base/tests/base/utils/discovery/test_http.py`: + +```python +# (C) Datadog, Inc. 
2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from unittest.mock import Mock, patch + +import requests + +from datadog_checks.base.utils.discovery.http import http_probe +from datadog_checks.base.utils.discovery.verifiers import body_contains, status_2xx + + +def _ok_response(body="ok", status=200, content_type="text/plain"): + r = Mock() + r.status_code = status + r.text = body + r.headers = {"Content-Type": content_type} + return r + + +def test_http_probe_uses_correct_url_and_timeout(): + with patch("datadog_checks.base.utils.discovery.http.requests.get") as mock_get: + mock_get.return_value = _ok_response() + http_probe("10.0.0.1", 9090, "/metrics", verify=status_2xx()) + mock_get.assert_called_once() + args, kwargs = mock_get.call_args + assert args[0] == "http://10.0.0.1:9090/metrics" + assert kwargs["timeout"] == 0.5 + + +def test_http_probe_passes_when_verify_passes(): + with patch("datadog_checks.base.utils.discovery.http.requests.get") as mock_get: + mock_get.return_value = _ok_response(body="Total Accesses: 42") + assert http_probe("h", 80, "/server-status?auto", verify=body_contains("Total Accesses:")) + + +def test_http_probe_fails_when_verify_fails(): + with patch("datadog_checks.base.utils.discovery.http.requests.get") as mock_get: + mock_get.return_value = _ok_response(body="something else") + assert not http_probe("h", 80, "/x", verify=body_contains("Total Accesses:")) + + +def test_http_probe_returns_false_on_connection_error(): + with patch("datadog_checks.base.utils.discovery.http.requests.get") as mock_get: + mock_get.side_effect = requests.exceptions.ConnectionError() + assert not http_probe("h", 80, "/x", verify=status_2xx()) + + +def test_http_probe_returns_false_on_timeout(): + with patch("datadog_checks.base.utils.discovery.http.requests.get") as mock_get: + mock_get.side_effect = requests.exceptions.Timeout() + assert not http_probe("h", 80, "/x", verify=status_2xx()) + + +def 
test_http_probe_brackets_ipv6_in_url(): + with patch("datadog_checks.base.utils.discovery.http.requests.get") as mock_get: + mock_get.return_value = _ok_response() + http_probe("[::1]", 80, "/x", verify=status_2xx()) + args, _ = mock_get.call_args + assert args[0] == "http://[::1]:80/x" + + +def test_http_probe_custom_timeout(): + with patch("datadog_checks.base.utils.discovery.http.requests.get") as mock_get: + mock_get.return_value = _ok_response() + http_probe("h", 80, "/x", verify=status_2xx(), timeout=1.0) + _, kwargs = mock_get.call_args + assert kwargs["timeout"] == 1.0 +``` + +- [ ] **Step 2: Run tests to confirm they fail** + +```bash +ddev --no-interactive test datadog_checks_base -- -k test_http and discovery -s +``` + +Expected: ImportError on `discovery.http`. + +- [ ] **Step 3: Implement** + +`datadog_checks_base/datadog_checks/base/utils/discovery/http.py`: + +```python +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from collections.abc import Callable + +import requests + + +def http_probe( + host: str, + port: int, + path: str, + *, + verify: Callable[[requests.Response], bool], + timeout: float = 0.5, +) -> bool: + """Perform a single GET probe and apply the verifier. + + Returns True iff the request completed and the verifier accepted the + response. All network exceptions yield False (probes are best-effort). + + The ``host`` is used verbatim in the URL — IPv6 hosts must already be + bracketed by the caller (the Agent-side bridge handles this). + """ + url = f"http://{host}:{port}{path}" + try: + response = requests.get(url, timeout=timeout) + except requests.RequestException: + return False + try: + return bool(verify(response)) + finally: + response.close() +``` + +- [ ] **Step 4: Run tests to confirm they pass** + +```bash +ddev --no-interactive test datadog_checks_base -- -k test_http and discovery -s +``` + +Expected: PASS for all 7 tests. 
+ +- [ ] **Step 5: Commit** + +```bash +git add datadog_checks_base/datadog_checks/base/utils/discovery/http.py \ + datadog_checks_base/tests/base/utils/discovery/test_http.py +git commit -m "datadog_checks_base: add http_probe() for discovery" +``` + +--- + +### Task 5: `tcp_probe(host, port, *, send=b"", verify, timeout=0.5)` + +Open a TCP socket, optionally send bytes, read up to `read_max` bytes (default 4096) within the timeout, apply the verifier. EOF is fine — verifier inspects whatever we got. All socket exceptions yield `False`. + +**Files:** +- Create: `datadog_checks_base/datadog_checks/base/utils/discovery/tcp.py` +- Create: `datadog_checks_base/tests/base/utils/discovery/test_tcp.py` + +- [ ] **Step 1: Write failing tests** + +`datadog_checks_base/tests/base/utils/discovery/test_tcp.py`: + +```python +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +import socket +import threading +from contextlib import contextmanager + +import pytest + +from datadog_checks.base.utils.discovery.tcp import tcp_probe +from datadog_checks.base.utils.discovery.verifiers import ( + response_equals, + response_starts_with, +) + + +@contextmanager +def _tcp_server(handler): + """Run a one-shot TCP server on 127.0.0.1 and return its bound port.""" + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.bind(("127.0.0.1", 0)) + sock.listen(1) + port = sock.getsockname()[1] + done = threading.Event() + + def serve(): + try: + conn, _ = sock.accept() + try: + handler(conn) + finally: + conn.close() + except OSError: + pass + finally: + done.set() + + thread = threading.Thread(target=serve, daemon=True) + thread.start() + try: + yield port + finally: + sock.close() + done.wait(timeout=1.0) + + +def test_tcp_probe_zookeeper_4lw_pattern(): + def handler(conn): + data = conn.recv(64) + if data == b"ruok": + conn.sendall(b"imok") + with _tcp_server(handler) as port: + assert tcp_probe("127.0.0.1", port, 
send=b"ruok", + verify=response_equals(b"imok"), timeout=1.0) + + +def test_tcp_probe_redis_ping_pattern(): + def handler(conn): + conn.recv(64) + conn.sendall(b"+PONG\r\n") + with _tcp_server(handler) as port: + assert tcp_probe("127.0.0.1", port, send=b"PING\r\n", + verify=response_starts_with(b"+PONG"), timeout=1.0) + + +def test_tcp_probe_server_speaks_first(): + def handler(conn): + conn.sendall(b'{"service":"nutcracker","source":"x","version":"0.5"}') + with _tcp_server(handler) as port: + assert tcp_probe("127.0.0.1", port, + verify=response_starts_with(b'{"service":"nutcracker"'), + timeout=1.0) + + +def test_tcp_probe_returns_false_when_verifier_rejects(): + def handler(conn): + conn.sendall(b"WRONG") + with _tcp_server(handler) as port: + assert not tcp_probe("127.0.0.1", port, + verify=response_starts_with(b"+PONG"), timeout=1.0) + + +def test_tcp_probe_returns_false_on_refused_connection(): + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.bind(("127.0.0.1", 0)) + port = sock.getsockname()[1] + sock.close() # port is now free; nothing listening + assert not tcp_probe("127.0.0.1", port, + verify=response_starts_with(b"x"), timeout=1.0) + + +def test_tcp_probe_returns_false_on_timeout(): + def handler(conn): + # Stall: never send anything, never close (until the test releases us). + import time + time.sleep(2.0) + with _tcp_server(handler) as port: + assert not tcp_probe("127.0.0.1", port, + verify=response_starts_with(b"x"), timeout=0.1) +``` + +- [ ] **Step 2: Run tests to confirm they fail** + +```bash +ddev --no-interactive test datadog_checks_base -- -k test_tcp and discovery -s +``` + +Expected: ImportError on `discovery.tcp`. + +- [ ] **Step 3: Implement** + +`datadog_checks_base/datadog_checks/base/utils/discovery/tcp.py`: + +```python +# (C) Datadog, Inc. 
2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +import socket +from collections.abc import Callable + +_DEFAULT_READ_MAX = 4096 + + +def tcp_probe( + host: str, + port: int, + *, + send: bytes = b"", + verify: Callable[[bytes], bool], + timeout: float = 0.5, + read_max: int = _DEFAULT_READ_MAX, +) -> bool: + """Open a TCP connection, optionally send bytes, read up to ``read_max``, + and apply the verifier. + + Returns True iff the connection succeeded and the verifier accepted the + bytes received within the timeout. All socket errors yield False. + """ + try: + with socket.create_connection((host, port), timeout=timeout) as sock: + sock.settimeout(timeout) + if send: + sock.sendall(send) + chunks: list[bytes] = [] + remaining = read_max + while remaining > 0: + try: + chunk = sock.recv(min(4096, remaining)) + except socket.timeout: + break + if not chunk: + break + chunks.append(chunk) + remaining -= len(chunk) + buf = b"".join(chunks) + except OSError: + return False + return bool(verify(buf)) +``` + +- [ ] **Step 4: Run tests to confirm they pass** + +```bash +ddev --no-interactive test datadog_checks_base -- -k test_tcp and discovery -s +``` + +Expected: PASS for all 6 tests. (The timeout test runs for ~0.1 s; the stall server is left to die when the test releases its enclosing context.) + +- [ ] **Step 5: Commit** + +```bash +git add datadog_checks_base/datadog_checks/base/utils/discovery/tcp.py \ + datadog_checks_base/tests/base/utils/discovery/test_tcp.py +git commit -m "datadog_checks_base: add tcp_probe() for discovery" +``` + +--- + +### Task 6: Re-export the new public names from `discovery.__init__` + +The existing `__init__.py` uses `lazy_loader.attach_stub`, which means exports are declared in `__init__.pyi`. 
+ +**Files:** +- Modify: `datadog_checks_base/datadog_checks/base/utils/discovery/__init__.pyi` + +- [ ] **Step 1: Read the current stub** + +```bash +cat datadog_checks_base/datadog_checks/base/utils/discovery/__init__.pyi +``` + +Expected current content: + +```python +# (C) Datadog, Inc. 2025-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from .discovery import Discovery + +__all__ = ['Discovery'] +``` + +- [ ] **Step 2: Write a failing import test** + +`datadog_checks_base/tests/base/utils/discovery/test_exports.py`: + +```python +# (C) Datadog, Inc. 2026-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +def test_public_exports(): + from datadog_checks.base.utils import discovery + + expected = { + "Discovery", + "Service", + "Port", + "candidate_ports", + "http_probe", + "tcp_probe", + "status_2xx", + "body_contains", + "body_matches", + "json_has", + "is_prometheus_exposition", + "response_equals", + "response_starts_with", + } + assert expected.issubset(set(dir(discovery))) +``` + +```bash +ddev --no-interactive test datadog_checks_base -- -k test_public_exports -s +``` + +Expected: FAIL — only `Discovery` exported. + +- [ ] **Step 3: Update the stub** + +Replace `datadog_checks_base/datadog_checks/base/utils/discovery/__init__.pyi` with: + +```python +# (C) Datadog, Inc. 
2025-present +# All rights reserved +# Licensed under a 3-clause BSD style license (see LICENSE) +from .discovery import Discovery +from .http import http_probe +from .ports import candidate_ports +from .service import Port, Service +from .tcp import tcp_probe +from .verifiers import ( + body_contains, + body_matches, + is_prometheus_exposition, + json_has, + response_equals, + response_starts_with, + status_2xx, +) + +__all__ = [ + 'Discovery', + 'Port', + 'Service', + 'body_contains', + 'body_matches', + 'candidate_ports', + 'http_probe', + 'is_prometheus_exposition', + 'json_has', + 'response_equals', + 'response_starts_with', + 'status_2xx', + 'tcp_probe', +] +``` + +- [ ] **Step 4: Run the test** + +```bash +ddev --no-interactive test datadog_checks_base -- -k test_public_exports -s +``` + +Expected: PASS. + +- [ ] **Step 5: Run the full discovery test suite to confirm nothing regressed** + +```bash +ddev --no-interactive test datadog_checks_base -- -k discovery -s +``` + +Expected: all tests from Tasks 1–5 plus the existing `test_discovery.py` tests pass. + +- [ ] **Step 6: Commit** + +```bash +git add datadog_checks_base/datadog_checks/base/utils/discovery/__init__.pyi \ + datadog_checks_base/tests/base/utils/discovery/test_exports.py +git commit -m "datadog_checks_base: export discovery probe helpers" +``` + +--- + +### Task 7: Changelog entry + +Per `CLAUDE.md` in this repo: changelogs MUST be created via `ddev release changelog new`, never edited by hand. + +**Files:** +- Create: `datadog_checks_base/changelog.d/.added` (created by the command). + +- [ ] **Step 1: Add the entry** + +The PR number isn't known yet — placeholder is the GitHub PR number once the branch is pushed and the PR opened. Until then, use `0` and rename later, or add it after opening the PR. 
+ +```bash +ddev release changelog new added datadog_checks_base \ + -m "Add Service/Port types and probe helpers (http_probe, tcp_probe, candidate_ports, verifier predicates) under datadog_checks.base.utils.discovery for advanced auto-config." +``` + +- [ ] **Step 2: Verify the file appeared** + +```bash +ls datadog_checks_base/changelog.d/*.added | head -1 +``` + +Expected: a new `.added` file. + +- [ ] **Step 3: Commit** + +```bash +git add datadog_checks_base/changelog.d/*.added +git commit -m "datadog_checks_base: changelog entry for discovery probe helpers" +``` + +--- + +### Task 8: Whole-suite confidence run + +A final unfiltered run to confirm no regression elsewhere in `datadog_checks_base`. + +- [ ] **Step 1: Format** + +```bash +ddev test -fs datadog_checks_base +``` + +Expected: clean / formats applied if needed. + +- [ ] **Step 2: Run the full base test suite** + +```bash +ddev --no-interactive test datadog_checks_base +``` + +Expected: all tests pass. New tests from Tasks 1–6 are included; existing tests (including `test_discovery.py` for the unrelated `Discovery` class) are unaffected. + +- [ ] **Step 3: If formatting changed anything, commit** + +```bash +git status +# if there are formatting fixups: +git add -p +git commit -m "datadog_checks_base: apply formatter to discovery helpers" +``` + +--- + +## Self-Review + +**Spec coverage:** +- `Service` / `Port` types crossed into Python — Task 1. +- Helpers `http_probe`, `tcp_probe`, `candidate_ports`, verifiers — Tasks 2–5. +- Public re-export — Task 6. +- Changelog — Task 7. +- Full-suite confidence — Task 8. + +NOT covered by this plan (intentionally — they belong to Plan B and Plan C): +- Per-pattern base classes (`OpenMetrics` discovery mixin, `HTTPStaticDiscoverable`, `TCPDiscoverable`, etc.). These are deferred until the rtloader bridge in Plan B exists, because the base-class tests need a `Service` shape that crosses cleanly between Python tests and the Go bridge. 
Doing them in Plan A risks designing the surface twice. The helpers in this plan are sufficient for any per-integration `discover()` to be written by hand. +- Any per-integration `discover()` method. Plan C. +- Agent-side rtloader bridge, `discoverer` package, `configmgr` integration, krakend artifact removal. Plan B. + +**Placeholder scan:** No `TBD`, `TODO`, `implement later`, or "similar to Task N" references. Each step shows the actual code or command. + +**Type consistency:** +- `Service.ports` is `tuple[Port, ...]` everywhere it appears. +- `Port` constructor: `Port(number, name="")` — Task 1 defines, Tasks 2–5 use consistently. +- `candidate_ports(service, hints) -> Iterator[Port]` — Task 2 defines, downstream tasks (in Plan C) will iterate the result. +- `http_probe(host, port, path, *, verify, timeout=0.5) -> bool` — matches the spec verbatim. +- `tcp_probe(host, port, *, send=b"", verify, timeout=0.5, read_max=4096) -> bool` — adds `read_max` as a kwarg with a documented default; spec mentions `read_max: 4096` in the YAML form discussion, harmless to surface as a kwarg. +- Verifier names match the spec: `is_prometheus_exposition`, `status_2xx`, `body_contains`, `body_matches`, `json_has`, `response_equals`, `response_starts_with`. + +**Scope:** This plan is one-PR-sized: ~5 small modules, ~5 small test files, one changelog entry. No cross-repo dependencies. Plan B and Plan C will follow. + +--- + +## Execution Handoff + +Plan complete and saved to `docs/superpowers/plans/2026-05-06-discover-python-library.md`. Two execution options: + +1. **Subagent-Driven (recommended)** — Dispatch a fresh subagent per task, review between tasks. Fast iteration; good for plans with many small tasks like this one. +2. **Inline Execution** — Execute tasks in this session via executing-plans. Batch with checkpoints for review. + +Which approach? 
diff --git a/docs/superpowers/specs/2026-05-05-advanced-autoconfig-experiment-design.md b/docs/superpowers/specs/2026-05-05-advanced-autoconfig-experiment-design.md new file mode 100644 index 0000000000000..e62dd4ecaebfe --- /dev/null +++ b/docs/superpowers/specs/2026-05-05-advanced-autoconfig-experiment-design.md @@ -0,0 +1,164 @@ +# Advanced auto-config — KrakenD experiment + +Status: design, not yet implemented. +Tracks Confluence ticket [DSCVR/6650004331](https://datadoghq.atlassian.net/wiki/spaces/DSCVR/pages/6650004331/Integrations+advanced+auto+config+exploration) and the per-integration analysis on the [`vitkykra/autoconfig-analysis` branch](https://github.com/DataDog/integrations-core/blob/vitkykra/autoconfig-analysis/analysis/RESULTS.md). + +## Goal + +Prove end-to-end, against a real Agent build and a real running container, that a declarative probe spec stored alongside an integration's static config files is enough to discover the integration's correct check config without any per-integration discovery code on the integration side. + +The bucket targeted is `generic-openmetrics-scan` (51 of 260 integrations, 20%). The experiment carries one of those — `krakend` — through the full path. Other buckets (multi-path, JSON-shape, TCP handshake, credentialled) are explicit non-goals. + +## Non-goals + +- Cluster-agent / `kube_service` / `kube_endpoints` flows. Container listener path only. +- Probe types beyond `openmetrics`. No TCP, no `http-text-format`, no JSON-shape verification. +- Multi-path / multi-port logic. Single `path` and a port list, that's it. `http-multi-path` integrations are a follow-up experiment. +- Migrating any existing `auto_conf.yaml` to the new file. Only `krakend` gets the new file. +- Probe-result persistence across Agent restarts. In-memory cache only. +- Authenticated probes. No headers, no TLS. KrakenD's `/metrics` is unauthenticated. +- Concurrency tuning. Probes run sequentially per service. 
+- Telemetry / metrics about the prober itself. Logs only. +- Python `discover()` callback per integration. Out of scope by design — see "Approaches considered" below. + +## Approaches considered + +**A. Declarative probe spec + generic Go prober (chosen).** A new file `auto_conf_discovery.yaml` carries `ad_identifiers`, a `discovery:` block with `(type, ports, path)`, and the instance template. The Agent core has one prober that reads the block, probes the matched container, and substitutes a new `%%discovered_port%%` template variable. Per-integration data: a port/path table. Per-integration code: none. + +**B. Python `discover(container) -> [Configs]` per integration.** Each integration ships a Python callable. The Agent invokes it via a new rtloader entry point. More flexible but is 51 near-identical files for the openmetrics bucket and requires new rtloader plumbing. + +**C. Hybrid.** Declarative for the easy buckets, Python callback for the hard ones. + +A was chosen because it is the smallest change that proves the concept end-to-end on a real OpenMetrics integration with dev-env support, and it exactly matches what the analysis says is achievable for the largest fully-generic bucket. C is the natural follow-up if a later experiment targets the harder buckets. + +## Architecture + +The current Agent autodiscovery pipeline (`comp/core/autodiscovery/` in `datadog-agent`): + +``` +Listeners ─► Service (host, ports, ad_identifiers, image) + │ + ▼ +File provider ─► Config{ ADIdentifiers, Instances=template } from auto_conf.yaml + │ + ▼ match by ad_identifier + ▼ +configresolver.Resolve(tpl, svc) ──► substitutes %%host%%, %%port%%, ... 
+ │ + ▼ +MetaScheduler ─► concrete config ─► check scheduler +``` + +With this change: + +``` +Listeners ─► Service + │ + ▼ +File provider ─► Config + Discovery{type, ports, path} from auto_conf_discovery.yaml + │ + ▼ match by ad_identifier + ▼ +[NEW] discovery.Probe(tpl.Discovery, svc) ──► discoveredPort or "no match" + │ (synchronous, bounded, per-port timeout) + │ (cached per (service ID, probe spec) for some TTL) + ▼ +configresolver.Resolve(tpl, svc, probeResult) ──► substitutes %%discovered_port%% too + │ + ▼ +MetaScheduler ─► ... +``` + +The change is local: one file-format parser, one prober package, one template variable, one new branch in the matching loop. No listener change, no scheduler change, no rtloader change. + +## File format + +Path: `/datadog_checks//data/auto_conf_discovery.yaml`. Same lookup logic as `auto_conf.yaml` today. If both files exist for an integration the Agent logs a warning and prefers the discovery file. + +For the experiment, krakend has neither file today — the conflict path is hypothetical here but worth specifying so the failure mode is defined. + +```yaml +ad_identifiers: + - krakend +discovery: + type: openmetrics # only "openmetrics" supported in this experiment + ports: [8090] # optional. tried first, in order + path: /metrics # optional. default: /metrics +init_config: +instances: + - openmetrics_endpoint: "http://%%host%%:%%discovered_port%%/metrics" +``` + +The shape is `auto_conf.yaml` plus a `discovery:` block. Existing fields (`init_config`, `instances`, `ad_identifiers`) keep their meaning. + +## Probe semantics + +For a matched (template, service) pair where `tpl.Discovery != nil`: + +1. Resolve `host`: take the first IP from `svc.GetHosts()`. If empty, abort with "no probe target" and don't emit a config. +2. Build the candidate port list: + - Start with `tpl.Discovery.Ports ∩ svc.GetPorts()`, in declared order. 
`Ports` are integer port numbers matched against the numeric `Port` field of `workloadmeta.ContainerPort`. + - Append remaining `svc.GetPorts()` (the fallback scan). + - Skip ports already in the negative cache for this service. +3. For each candidate, in order: + - HTTP GET `http://:` with a 500 ms per-attempt timeout. + - Verify response: status 200 AND `Content-Type` matches one of: + - `text/plain` (Prometheus exposition; version parameter optional) + - `application/openmetrics-text` (OpenMetrics 1.0) + - AND the body's first non-comment line parses as a Prometheus exposition line (loose regex `^[a-zA-Z_:][a-zA-Z0-9_:]*(\{[^}]*\})?\s+\S+`). The regex is deliberately permissive — it's a probe, not a parser. The check itself does strict parsing once it owns the endpoint. +4. Bound the total budget: stop after 2 s of cumulative probing or 8 candidates, whichever comes first. +5. Cache results in-memory keyed by `(service ID, probe spec hash)`: + - On success: cache the discovered port for the lifetime of the service. + - On failure: cache for ~30 s, then expire. +6. On success the resolver gets `discovered_port` set and substitutes it into the instance template. +7. On failure no config is emitted. The service may match other templates; this template just doesn't apply. + +## `%%discovered_port%%` template variable + +New entry in `pkg/util/tmplvar`, sibling to `%%port%%`. Resolves only if the prober succeeded. If a template references it without a probe result available, substitution fails and the config is rejected with a clear log line. The existing `%%port%%` semantics are unchanged. + +`configresolver.Resolve` gains an extended signature accepting an optional probe result (e.g. `Resolve(tpl, svc, probeResult)`). The probe result carries the discovered port; the resolver passes it to the template-variable substitution path so `%%discovered_port%%` resolves. Templates without a `Discovery` block don't go through the prober and don't see the new variable. 
+ +## Demo + +1. Add `auto_conf_discovery.yaml` to `integrations-core/krakend/datadog_checks/krakend/data/` with `ports: [8090]` and `path: /metrics`. +2. Implement Agent-side changes in `datadog-agent`: + - Parse `auto_conf_discovery.yaml` in `comp/core/autodiscovery/providers/config_reader.go`. + - Add a `Discovery` field to `integration.Config`. + - New `comp/core/autodiscovery/discovery/openmetrics_prober.go` (probe + verify + cache). + - Hook into `AutoConfig` matching to call the prober before `configresolver.Resolve`. + - Add `%%discovered_port%%` to `pkg/util/tmplvar`. +3. Build: `dda inv agent.build`. +4. Start KrakenD via its dev-env docker-compose (`integrations-core/krakend/tests/docker/`). +5. Run the Agent in the nightly Docker image with the locally built binary plus the local `krakend` integration source bind-mounted, per `integrations-core/reference_docker_integration_testing.md`. +6. Verify `agent status` shows the `krakend` check scheduled with `openmetrics_endpoint: http://:8090/metrics` and metrics flowing. + +### Three success scenarios + +- Default port: KrakenD exposes 8090. Hint port matches, one probe succeeds, check runs. +- Non-default port: restart KrakenD on port 9000. Hint port 8090 closed. Agent falls back to scanning exposed ports, finds 9000, check runs. +- Negative case: a non-KrakenD container labelled with the `krakend` ad_identifier but not serving OpenMetrics. Probes fail, no check is scheduled, only DEBUG-level log lines per probe failure. + +## Risks to verify during implementation + +- **Listener port visibility.** The container listener exposes `ContainerPort` entries from container metadata. If the docker-compose file does not expose 8090 explicitly the Agent may not see it. The realistic deployment shape exposes the port; verify at the start of implementation. +- **Container IP reachability.** The Agent container must reach the krakend container on the docker network. 
Standard nightly image plus krakend's compose network should suffice; confirm before claiming the demo works. +- **Probe timing vs container readiness.** A probe that fires before krakend is listening will fail. The 30 s negative cache means no re-probe for 30 s. The AD reconciliation loop runs frequently enough that the next service event (container becomes ready) re-triggers matching and bypasses the cache. Confirm during scenario 1. + +## File-level summary of the change + +| Repo | Path | Change | +|------|------|--------| +| `integrations-core` | `krakend/datadog_checks/krakend/data/auto_conf_discovery.yaml` | New file with the discovery block and instance template. | +| `datadog-agent` | `comp/core/autodiscovery/integration/config.go` | Add `Discovery` field on `Config`. | +| `datadog-agent` | `comp/core/autodiscovery/providers/config_reader.go` | Parse `auto_conf_discovery.yaml`; populate `Discovery`. | +| `datadog-agent` | `comp/core/autodiscovery/discovery/` (new package) | OpenMetrics prober, candidate-port ordering, cache. | +| `datadog-agent` | `comp/core/autodiscovery/autodiscoveryimpl/` | Call prober before `configresolver.Resolve`; pass result into resolver. | +| `datadog-agent` | `comp/core/autodiscovery/configresolver/configresolver.go` | Accept the probe result; substitute `%%discovered_port%%`. | +| `datadog-agent` | `pkg/util/tmplvar/` | Add `%%discovered_port%%` resolver. | + +## Out of scope but worth noting for follow-up + +- A second experiment targeting `http-multi-path` (nginx, rabbitmq, envoy) would add a list-of-paths form and verification that picks the first responsive path. The `Discovery` field shape leaves room for that without breaking the format. +- A third experiment targeting Python `discover()` callbacks would only matter if a real integration's discovery cannot be expressed declaratively. The analysis suggests this is a small set; better revisit after experiments 1 and 2. 
+- Cluster-agent integration (`kube_service` / `kube_endpoints` listeners) is the natural next plug-in point once the container case is solid. Probes from the cluster agent to a service IP work the same way; the listener change is the open question. diff --git a/docs/superpowers/specs/2026-05-06-advanced-autoconfig-discover-design.md b/docs/superpowers/specs/2026-05-06-advanced-autoconfig-discover-design.md new file mode 100644 index 0000000000000..6fd524df607d2 --- /dev/null +++ b/docs/superpowers/specs/2026-05-06-advanced-autoconfig-discover-design.md @@ -0,0 +1,271 @@ +# Advanced auto-config — Python `discover()` callback + +Status: design, not yet implemented. Successor to the krakend experiment ([`2026-05-05-advanced-autoconfig-experiment-design.md`](2026-05-05-advanced-autoconfig-experiment-design.md)). Tracks Confluence ticket [DSCVR/6650004331](https://datadoghq.atlassian.net/wiki/spaces/DSCVR/pages/6650004331/Integrations+advanced+auto+config+exploration) and the per-integration analysis on the [`vitkykra/autoconfig-analysis` branch](https://github.com/DataDog/integrations-core/blob/vitkykra/autoconfig-analysis/analysis/RESULTS.md). + +## Goal + +Generalise the krakend experiment to cover the next two analysis buckets: + +- **HTTP probe with integration-specific verification** (35 integrations) — `http-text-format`, `http-json-shape`, `http-multi-path`. +- **TCP probe with integration-specific protocol** (6 integrations) — `tcp-banner-server-greets`, `tcp-protocol-handshake`. + +Combined with the existing `generic-openmetrics-scan` bucket (51), this experiment establishes a single mechanism that handles 92 of the 260 integrations (35%) — every bucket the analysis classified as "discoverable on the wire without credentials." + +## Approach + +Each integration's check class gains a `discover(service)` classmethod that the Agent invokes when a `Service` matches the integration's `ad_identifiers`. 
`discover` probes the service, performs integration-specific verification in Python, and returns the concrete list of instance configs to schedule. No template substitution for discovered values. + +Common discovery primitives (HTTP probe, TCP probe, candidate-port iteration, response verifiers) live in `datadog_checks_base`. Per-pattern base classes (`OpenMetricsBaseCheckV2`, an `HTTPDiscoverable` mixin, a `TCPDiscoverable` mixin) carry the default `discover` implementation, so most integrations need zero per-integration discovery code. + +## Non-goals + +- Cluster-agent / `kube_service` / `kube_endpoints` flows. Container + process listener path only. +- Credentialled integrations (`creds-*` buckets). Out of scope for the on-the-wire approach. +- Local-detection integrations (`local-cli-binary`, `local-config-file`, `cloud-task-metadata`, `local-scm-enumeration`, `generic-windows-perf`, `generic-linux-procfs`). They have no network probe surface; a separate mechanism applies. +- Migrating existing `auto_conf.yaml` files. New discovery is opt-in per integration via `auto_conf_discovery.yaml`. +- Probe-result persistence across Agent restarts. In-memory cache only. +- Inferring `ad_identifiers` from check metadata. The discovery file is required and explicit. Revisit independently. + +## Approaches considered + +**A. Declarative verification DSL.** Widen `auto_conf_discovery.yaml` with verifier predicates (status, content-type, body regex, JSON-keys-present, fixed-bytes prefix) plus `%%discovered_path%%`/`%%discovered_scheme%%` template variables. Agent ships HTTP and TCP probers in Go. + +**B. Pluggable Go `Verifier` interface with a registry.** Hybrid of A and integration-specific Go code. Per-integration Go in `datadog-agent` core for the awkward cases. + +**C. Python `discover(service)` callback per integration (chosen).** Each integration's check class implements (or inherits) a `discover` classmethod. Common probe + verifier helpers in `datadog_checks_base`. 
Per-pattern base classes carry the defaults. + +C was chosen because: + +- It does not grow a DSL. Every messy integration in the analysis (multi-step version detection in airflow/gitlab, multi-component enumeration in druid/kubeflow/spark, JSON-shape depth in hdfs JMX servlets) would have pushed A toward JSONPath, conditional rules, multi-step probes, etc. Python doesn't grow. +- The integration-specific knowledge (response shape, version detection logic, port semantics) already lives in the integration's Python check. Reusing it for discovery puts the verifier next to the parser that consumes the same response. +- Per-integration cost is small. For the 51 OpenMetrics integrations, a base-class default with a `DISCOVERY_PORTS` class attribute is enough — zero per-integration code. For the 41 verification-bucket integrations, per-integration `discover` overrides are 5–15 lines using the shared helpers. +- The `discover` return value is the literal instance-config list. No template substitution layer for discovered values, no `%%discovered_*%%` template variable zoo. + +The cluster-agent flow is the one place A would have been clearly easier — the cluster agent does not run Python checks today. The krakend experiment already excludes cluster-agent flow as a non-goal; this experiment inherits that exclusion. When cluster-agent autoconfig is taken on, options include: probe runs on a node agent and ships results, Python-in-cluster-agent, or a small declarative fallback for cluster-agent-only. + +## Architecture + +Pipeline with this change (compare to the krakend experiment design): + +``` +Listeners ─► Service (host, ports, id, ad_identifiers, ...) 
+ │ + ▼ +File provider ─► Config from auto_conf_discovery.yaml + │ { ad_identifiers, init_config, optional default instance template (unused for discovery) } + ▼ match by ad_identifier + ▼ +[NEW] discoverer.Discover(integrationName, svc) + │ - Cross svc into Python as a Service object (id, host, ports) + │ - Invoke .discover(service) on the Python runner + │ - Receive list[dict] | None + │ - Bound by per-call timeout; cached per (service ID, integration) + ▼ +For each returned dict: build a concrete integration.Config and schedule it. +``` + +The change is local: a new file-format file (still `auto_conf_discovery.yaml`), a new rtloader entry point, a new `discoverer` package on the Agent side, the per-pattern Python base classes plus shared helpers in `datadog_checks_base`. Listeners and scheduler are unchanged. The existing `auto_conf.yaml` template path is unchanged for static-config integrations. + +The Go-side prober from the krakend experiment (`comp/core/autodiscovery/discovery/openmetrics_prober.go`) is removed in favour of the Python entry point. The candidate-port ordering, cache, and time-budget logic are kept — they live in the new `discoverer` package and apply to all integrations regardless of which patterns their `discover` uses. + +## Service surface crossed into Python + +The Agent's `listeners.Service` interface is the existing abstraction over containers, processes, K8s services, K8s endpoints, SNMP, DBM cloud services, and others. `ProcessService` (`comp/core/autodiscovery/listeners/process.go`) implements the same interface for processes, so process autodiscovery is supported by this design without any extra plumbing. + +The Python-facing surface is a deliberately narrow read-only projection. 
For this experiment only three accessors: + +```python +class Service: + @property + def id(self) -> str: + """Opaque service identifier; for log correlation only.""" + @property + def host(self) -> str: + """Eagerly resolved single host string; IPv6 is bracketed for URL use.""" + @property + def ports(self) -> list[Port]: + """Ordered list of (number, name) pairs. `name` is empty for non-K8s ports.""" + +class Port: + number: int + name: str +``` + +`host` is resolved Agent-side using the same fallback policy that `tmplvar.GetHost` uses today (single-network → bridge → error). The Python side never re-implements host resolution. + +The interface name `Service` is kept for the experiment to match `listeners.Service` on the Go side. Renaming (`Workload`, `DiscoveryTarget`, etc.) is deferred — easy to revisit before GA. + +Fields deliberately not exposed in this experiment: `pid`, `hostname`, `image_name`, `tags`, `ad_identifiers`, `extra_config`. None of the 92 targeted integrations need them for the discovery decision. They are the natural extension points for future experiments: + +- `pid` for process-mode discovery (read `/proc//...`, exec in process namespace). +- `image_name` for stricter pre-probe filtering than `ad_identifiers` provides. +- `extra_config(key)` for K8s-metadata-driven discovery (`kube_namespace`, etc.). +- `tags` rarely needed inside `discover` since the tagger merge happens after; included only when a concrete case requires it. + +## File format + +Path: `/datadog_checks//data/auto_conf_discovery.yaml`. Same lookup logic as `auto_conf.yaml`. The file is required for discovery to apply — there is no inference from check metadata in this experiment. + +```yaml +ad_identifiers: + - krakend +init_config: +instances: [] +``` + +The instance template is intentionally absent — `discover` returns concrete instance configs. `instances: []` (or omitted) is the correct shape. 
`init_config` may be set if the integration needs init-time configuration; it is passed through verbatim alongside each discovered instance. + +If both `auto_conf.yaml` and `auto_conf_discovery.yaml` exist for the same integration the Agent logs a warning and prefers the discovery file. + +## `discover` contract + +```python +class MyCheck(AgentCheck): + @classmethod + def discover(cls, service: Service) -> list[dict] | None: + ... +``` + +Return values: + +- `list[dict]` — one instance config per dict. Each is the literal payload that would otherwise come from a resolved `instances:` template entry. +- `None` — probe ran but did not match. Don't schedule. Negative-cache for ~30 s. +- `[]` — probed and explicitly nothing applies (e.g. multi-component umbrella found no components on this host). Don't schedule. Negative-cache for ~30 s. +- Raised exception — discovery itself failed (network error other than verifier rejection, malformed response, bug). Don't schedule. Negative-cache for ~30 s. Log at error. + +Tagger merge: the Agent merges AD/tagger-derived tags into each returned instance dict before scheduling, the same way it does for resolved templates today. `discover` returns integration-specific fields only; pod/container/cluster tags layer on after. + +Determinism: `discover` must be a pure function of `service`. The Agent caches results per `(service ID, integration name)`; non-deterministic returns will thrash the scheduler. + +Optional config-model validation: integrations with a generated `config_models/` (Pydantic from `spec.yaml`) can validate before returning: + +```python +@classmethod +def discover(cls, service: Service) -> list[dict] | None: + raw = cls._discover_raw(service) + return [cls._instance_model(**i).model_dump() for i in raw] if raw else None +``` + +Opt-in at first; the base classes can adopt it once the helper proves stable. 
+ +## Shared helpers in `datadog_checks_base` + +``` +datadog_checks/base/utils/discovery/ + __init__.py + http.py # http_probe(host, port, path, *, verify, timeout=0.5) -> bool + tcp.py # tcp_probe(host, port, *, send=b"", verify, timeout=0.5) -> bool + ports.py # candidate_ports(service, hints) -> Iterator[Port] + verifiers.py # is_prometheus_exposition, status_2xx, body_contains, body_matches, + # json_has, response_equals, response_starts_with, ... +``` + +All helpers are pure functions or thin wrappers around `requests` / `socket`. No global state. Each unit-tested in isolation. + +## Per-pattern base classes + +``` +datadog_checks/base/checks/discovery/ + openmetrics.py # mixin for OpenMetricsBaseCheckV2 — DISCOVERY_PORTS, DISCOVERY_PATH + http_static.py # one fixed (path, verifier) — apache, kyototycoon, lighttpd, squid, + # mesos_*, riak, traffic_server, fluentd, hdfs_*, yarn, mapreduce, consul + http_multi.py # list of (path, verifier) candidates — nginx, rabbitmq, envoy, ... + tcp_handshake.py # send + verifier — redis, memcached, zookeeper, gearmand, statsd + tcp_banner.py # server speaks first — twemproxy +``` + +Each base class implements `discover(cls, service)` using the shared helpers and class-level configuration. An integration that fits a pattern declares the configuration as class attributes and inherits the default `discover`. An integration that doesn't fit overrides `discover` directly. 
 + +Worked examples: + +```python +# OpenMetrics — 51 integrations get this for free via OpenMetricsBaseCheckV2 +class KrakenD(OpenMetricsBaseCheckV2): + DISCOVERY_PORTS = [8090] + +# http-text-format +class Apache(AgentCheck, HTTPStaticDiscoverable): + DISCOVERY_PORTS = [80] + DISCOVERY_PATH = "/server-status?auto" + DISCOVERY_VERIFY = body_contains("Total Accesses:") + DISCOVERY_FIELD = "apache_status_url" # how to name the URL in the returned instance + +# http-multi-path +class Nginx(AgentCheck): + @classmethod + def discover(cls, service: Service) -> list[dict] | None: + for port in candidate_ports(service, [80, 8080]): + for path, verifier in [ + ("/nginx_status", body_matches(r"^Active connections:")), + ("/api/9", json_has(["version", "processes"])), + ("/status/format/json", json_has(["nginxVersion"])), + ]: + if http_probe(service.host, port.number, path, verify=verifier): + return [{"nginx_status_url": f"http://{service.host}:{port.number}{path}"}] + return None + +# tcp-protocol-handshake +class Redis(AgentCheck, TCPDiscoverable): + DISCOVERY_PORTS = [6379] + DISCOVERY_SEND = b"PING\r\n" + DISCOVERY_VERIFY = response_starts_with(b"+PONG") + DISCOVERY_INSTANCE = lambda host, port: {"host": host, "port": port} +``` + +## Probe semantics + +Owned by the Agent (Go side, in the new `discoverer` package); the Python `discover` runs inside this envelope. + +1. Resolve `host` from `svc.GetHosts()` using the existing fallback policy. If empty, log "no probe target," skip the integration for this service. +2. Build the Python `Service` object: `id = svc.GetServiceID()`, `host = resolved`, `ports = [Port(p.Port, p.Name) for p in svc.GetPorts()]`. +3. Cache lookup keyed by `(svc.GetServiceID(), integrationName)`. On hit: short-circuit. +4. Invoke `.discover(service)` via rtloader with a per-call deadline (default 2 s). +5. Bound the Python call: hard timeout, cancel on Agent shutdown. +6. 
On `list[dict]` result: for each dict, build a concrete `integration.Config` (name = integration, instances = [marshalled dict], init_config = from auto_conf_discovery.yaml) and schedule. Cache hit for the lifetime of the service.
+7. On `None`/`[]`: cache as failure for ~30 s. Don't schedule.
+8. On exception: log at error, cache as failure for ~30 s. Don't schedule. Don't crash.
+
+The Python side is responsible for its own per-port and per-path timeouts inside `discover`. The shared `http_probe`/`tcp_probe` helpers carry sensible defaults (500 ms per attempt). The Agent-side total deadline is the outer bound.
+
+## Demo plan
+
+The same krakend container fixture as the previous experiment, plus one integration from each new bucket:
+
+1. **OpenMetrics base-class default** — krakend. Confirms the migration from the krakend experiment's `%%discovered_port%%` template path to the new `discover` path produces an equivalent scheduled config.
+2. **`http-text-format`** — apache with mod_status. Probe `/server-status?auto`, verify `body_contains("Total Accesses:")`, return one instance with `apache_status_url`.
+3. **`http-multi-path`** — nginx with stub_status. Probe three (path, verifier) tuples in order, return the first match.
+4. **`tcp-protocol-handshake`** — redis. TCP `PING` → `+PONG` verification, return `{host, port}`.
+
+For each: golden path (default port), non-default port (server moved), negative case (wrong service labelled with the ad_identifier).
+
+## File-level summary of the change
+
+| Repo | Path | Change |
+|------|------|--------|
+| `integrations-core` | `<integration>/datadog_checks/<integration>/data/auto_conf_discovery.yaml` | New file per discovered integration. Contains `ad_identifiers` (and optional `init_config`). No template instance. |
+| `integrations-core` | `datadog_checks_base/datadog_checks/base/utils/discovery/` | New package: `http`, `tcp`, `ports`, `verifiers`. 
|
+| `integrations-core` | `datadog_checks_base/datadog_checks/base/checks/discovery/` | New per-pattern mixins/base classes. |
+| `integrations-core` | `<integration>/datadog_checks/<integration>/check.py` | Adopt the matching base class or implement `discover` directly. ~5–15 lines per integration in the targeted buckets. |
+| `datadog-agent` | `comp/core/autodiscovery/discovery/openmetrics_prober.go` | Delete (superseded by the Python path). |
+| `datadog-agent` | `comp/core/autodiscovery/discoverer/` (new package) | Cross-into-Python bridge, candidate-port ordering, cache, time budget. |
+| `datadog-agent` | `comp/core/autodiscovery/integration/config.go` | Keep `Discovery` field (or rename to a marker bool — discovery is now indicated by file presence and the integration's `discover` method). |
+| `datadog-agent` | `comp/core/autodiscovery/autodiscoveryimpl/configmgr.go` | Replace the `prober.Probe` call with `discoverer.Discover` returning `[]integration.Config` directly. |
+| `datadog-agent` | `pkg/util/tmplvar/resolver.go` | Remove `%%discovered_port%%` resolver and `GetDiscoveredPort`. No discovered-value templating. |
+| `datadog-agent` | `comp/core/autodiscovery/discovery/service_wrapper.go` | Delete. |
+| `datadog-agent` | rtloader bridge | New entry point: `discover(service_handle) -> instances|None`. Marshals `Service` projection to Python and the result back. |
+
+## Risks to verify during implementation
+
+- **Python execution latency.** Discovery runs on service-arrival events, not on the check schedule. Confirm rtloader can host a `discover` invocation with sub-second overhead. If the Python pool is busy with checks, queueing matters; the negative cache mitigates retry storms but the first call needs to be fast.
+- **Process listener interaction.** `ProcessService` populates `host = 127.0.0.1` and `ports` from observed TCP listeners. 
Confirm the Python `Service` projection sees a usable port list for at least one targeted integration when the integration is run as a host-local process (not a container). +- **Digest stability across re-invocations.** Confirm `discover` results produce stable `integration.Config` digests when the underlying service state is unchanged. The cache is the primary defence; verify no codepath bypasses it. +- **Interaction with existing `auto_conf.yaml` for the same integration.** When both files exist (during an integration's migration), the prefer-discovery rule must avoid double-scheduling. +- **Host resolution parity.** The host string passed into Python must match exactly what `%%host%%` resolution produces today (same multi-network policy, same IPv6 bracketing). Existing `tmplvar.GetHost` is the reference implementation; reuse it rather than reimplement. + +## Out of scope but worth noting for follow-up + +- **Cluster-agent flow.** Service/Endpoints listeners on the cluster agent; potentially a node-agent-runs-discovery / cluster-agent-consumes-result split, or Python-in-cluster-agent. +- **Credentialled integrations.** `creds-*` buckets (75 integrations, 29%). A separate experiment would explore whether secret-store integration plus probe-shape detection can carry any of these. +- **`ad_identifiers` inference from check metadata.** Track separately from this experiment. +- **Renaming the Python `Service` type.** `Workload`, `DiscoveryTarget`, or `Target` — revisit before GA. +- **Exposing `pid`, `image_name`, `extra_config`, `tags`, `hostname` to Python.** Add when a concrete integration needs them, expected to be the trigger for a process-discovery experiment. 
diff --git a/krakend/changelog.d/23576.changed b/krakend/changelog.d/23576.changed
new file mode 100644
index 0000000000000..32e8dec756449
--- /dev/null
+++ b/krakend/changelog.d/23576.changed
@@ -0,0 +1 @@
+Migrate to Python discover() classmethod for advanced auto-config; auto_conf_discovery.yaml no longer carries an instance template.
\ No newline at end of file
diff --git a/krakend/changelog.d/23577.added b/krakend/changelog.d/23577.added
new file mode 100644
index 0000000000000..5915aab820777
--- /dev/null
+++ b/krakend/changelog.d/23577.added
@@ -0,0 +1 @@
+Add unit tests for discover() and an e2e test for advanced auto-config.
\ No newline at end of file
diff --git a/krakend/changelog.d/23577.changed b/krakend/changelog.d/23577.changed
new file mode 100644
index 0000000000000..20d7758d84d22
--- /dev/null
+++ b/krakend/changelog.d/23577.changed
@@ -0,0 +1 @@
+Drop unjustified 9090 port hint from discover(); probe all exposed service ports.
\ No newline at end of file
diff --git a/krakend/datadog_checks/krakend/check.py b/krakend/datadog_checks/krakend/check.py
index 1fd5666870479..3d348e7228fb7 100644
--- a/krakend/datadog_checks/krakend/check.py
+++ b/krakend/datadog_checks/krakend/check.py
@@ -51,6 +51,20 @@ class KrakendCheck(OpenMetricsBaseCheckV2):
     __NAMESPACE__ = "krakend.api"
     DEFAULT_METRIC_LIMIT = 0
 
+    @classmethod
+    def discover(cls, service):
+        from datadog_checks.base.utils.discovery import (
+            candidate_ports,
+            http_probe,
+            is_prometheus_exposition,
+        )
+
+        for port in candidate_ports(service, []):
+            if http_probe(service.host, port.number, "/metrics",
+                          verify=is_prometheus_exposition()):
+                return [{"openmetrics_endpoint": f"http://{service.host}:{port.number}/metrics"}]
+        return None
+
     def create_scraper(self, config: InstanceType):
         return HttpCodeClassScraper(self, self.get_config_with_defaults(config))
 
diff --git a/krakend/datadog_checks/krakend/data/auto_conf_discovery.yaml b/krakend/datadog_checks/krakend/data/auto_conf_discovery.yaml
new 
file mode 100644 index 0000000000000..45515eee969d6 --- /dev/null +++ b/krakend/datadog_checks/krakend/data/auto_conf_discovery.yaml @@ -0,0 +1,5 @@ +ad_identifiers: + - krakend +discovery: {} +init_config: +instances: [] diff --git a/krakend/tests/conftest.py b/krakend/tests/conftest.py index e9ce5ad576b68..f32ca79558c73 100644 --- a/krakend/tests/conftest.py +++ b/krakend/tests/conftest.py @@ -18,6 +18,15 @@ COMPOSE_FILE_E2E = Path(__file__).parent / "docker" / "docker-compose.yml" COMPOSE_FILE_LAB = Path(__file__).parent / "lab" / "docker-compose.yml" +INTEGRATIONS_CORE_ROOT = Path(__file__).resolve().parents[2] +KRAKEND_AUTOCONF = ( + Path(__file__).parent.parent / "datadog_checks" / "krakend" / "data" / "auto_conf_discovery.yaml" +) +DISCOVERY_HELPERS_DIR = ( + INTEGRATIONS_CORE_ROOT / "datadog_checks_base" / "datadog_checks" / "base" / "utils" / "discovery" +) +SITE_PACKAGES = "/opt/datadog-agent/embedded/lib/python3.13/site-packages" + @pytest.fixture(scope="session") def is_lab() -> bool: @@ -52,9 +61,22 @@ def run_docker_e2e(env_vars: dict[str, str], conditions: list[LazyFunction]): ): asyncio.run(generate_sample_traffic()) - yield { - "instances": [{"openmetrics_endpoint": OPEN_METRICS_ENDPOINT}], - } + yield ( + { + "instances": [{"openmetrics_endpoint": OPEN_METRICS_ENDPOINT}], + }, + { + # The autoconfig YAML + base helpers overlay let the + # discovery test exercise AD + discover() in this same + # env. They are no-ops for the regular test_e2e, which + # passes its own explicit config to dd_agent_check. 
+ "docker_volumes": [ + f"{KRAKEND_AUTOCONF}:/etc/datadog-agent/conf.d/krakend.d/auto_conf_discovery.yaml:ro", + f"{DISCOVERY_HELPERS_DIR}:{SITE_PACKAGES}/datadog_checks/base/utils/discovery:ro", + "/var/run/docker.sock:/var/run/docker.sock:ro", + ], + }, + ) @pytest.fixture(scope="session") diff --git a/krakend/tests/test_e2e.py b/krakend/tests/test_e2e.py index 0547ada62e698..9b66cc8218a32 100644 --- a/krakend/tests/test_e2e.py +++ b/krakend/tests/test_e2e.py @@ -21,3 +21,21 @@ def test_e2e(dd_agent_check, instance: InstanceBuilder): check_submission_type=True, check_symmetric_inclusion=True, ) + + +@pytest.mark.e2e +def test_e2e_discovery(dd_agent_check): + aggregator = dd_agent_check( + {"init_config": {}, "instances": []}, + check_rate=True, + discovery_min_instances=1, + discovery_timeout=30, + ) + + metadata_metrics = get_metrics_from_metadata() + + aggregator.assert_metrics_using_metadata( + metadata_metrics, + check_submission_type=True, + check_symmetric_inclusion=True, + ) diff --git a/krakend/tests/test_unit.py b/krakend/tests/test_unit.py index 52914df7e931b..7b6f1fe3c4e82 100644 --- a/krakend/tests/test_unit.py +++ b/krakend/tests/test_unit.py @@ -4,11 +4,13 @@ from collections.abc import Callable from pathlib import Path +from unittest.mock import patch import pytest from datadog_checks.base import AgentCheck from datadog_checks.base.stubs.aggregator import AggregatorStub +from datadog_checks.base.utils.discovery import Port, Service from datadog_checks.krakend import KrakendCheck from tests.helpers import get_metrics_from_metadata from tests.types import InstanceBuilder @@ -121,3 +123,40 @@ def test_service_check_emitted(ready_check: KrakendCheck, aggregator: Aggregator def test_http_code_class_tag(ready_check: KrakendCheck, aggregator: AggregatorStub): aggregator.assert_metric_has_tag("krakend.api.http_client.duration.bucket", "code_class:5XX") + + +# --------------------------------------------------------------------------- +# discover() unit 
tests +# --------------------------------------------------------------------------- + + +def _service(*ports: int) -> Service: + return Service(id="svc", host="h", ports=tuple(Port(number=p) for p in ports)) + + +def test_discover_returns_url_for_first_matching_port(): + with patch("datadog_checks.base.utils.discovery.http_probe", side_effect=[True]) as probe: + result = KrakendCheck.discover(_service(9090)) + assert result == [{"openmetrics_endpoint": "http://h:9090/metrics"}] + probe.assert_called_once() + + +def test_discover_skips_non_matching_ports(): + with patch("datadog_checks.base.utils.discovery.http_probe", side_effect=[False, True]) as probe: + result = KrakendCheck.discover(_service(8080, 9090)) + assert result == [{"openmetrics_endpoint": "http://h:9090/metrics"}] + assert probe.call_count == 2 + + +def test_discover_returns_none_when_no_port_matches(): + with patch("datadog_checks.base.utils.discovery.http_probe", side_effect=[False, False, False]) as probe: + result = KrakendCheck.discover(_service(80, 8080, 9090)) + assert result is None + assert probe.call_count == 3 + + +def test_discover_returns_none_when_service_has_no_ports(): + with patch("datadog_checks.base.utils.discovery.http_probe") as probe: + result = KrakendCheck.discover(_service()) + assert result is None + probe.assert_not_called()