diff --git a/charts/aztec-node/README.md b/charts/aztec-node/README.md
index 7342bc6..23ba8d0 100644
--- a/charts/aztec-node/README.md
+++ b/charts/aztec-node/README.md
@@ -10,6 +10,10 @@ A Helm chart for deploying an Aztec node
 
 | Key | Type | Default | Description |
 |-----|------|---------|-------------|
+| centralMonitoring | object | `{"enabled":false,"promEndpoint":"https://vm.monitoring.gcp.obol.tech/write","token":""}` | Central Monitoring (Obol) Configures Prometheus remote_write to central Obol endpoint Automatically enables monitoring stack if not already enabled |
+| centralMonitoring.enabled | bool | `false` | Specifies whether central monitoring should be enabled |
+| centralMonitoring.promEndpoint | string | `"https://vm.monitoring.gcp.obol.tech/write"` | https endpoint to obol central prometheus |
+| centralMonitoring.token | string | `""` | The authentication token to the central prometheus |
 | certificate.domains | list | `[]` |  |
 | certificate.enabled | bool | `false` |  |
 | customNetwork | object | `{"feeAssetHandlerContractAddress":null,"l1ChainId":null,"registryContractAddress":null,"slashFactoryAddress":null}` | Custom network - (not recommended) - Only for custom testnet usecases Must have deployed your own protocol contracts first |
@@ -20,6 +24,8 @@ A Helm chart for deploying an Aztec node
 | image.repository | string | `"aztecprotocol/aztec"` | Image repository |
 | image.tag | string | `"2.1.5"` | Image tag |
 | initContainers | list | `[]` | Additional init containers |
+| monitoring | object | `{"enabled":false}` | Local Monitoring Stack Deploys OTEL collector + Prometheus for metrics collection |
+| monitoring.enabled | bool | `false` | Specifies whether local monitoring should be enabled Deploys OTEL collector (receives OTLP, exposes Prometheus metrics on :8889) and Prometheus (scrapes OTEL collector on :8889, exposes :9090) |
 | nameOverride | string | `""` | Overrides the chart name |
 | network | string | `nil` | Network name - this is a predefined network - testnet, devnet |
 | networkName | string | `"staging-public"` | Network identifier used in resource naming (l2-{role}-node-{networkName}-{component}) This appears in service/statefulset names for easy identification |
diff --git a/charts/aztec-node/templates/otel-collector-configmap.yaml b/charts/aztec-node/templates/otel-collector-configmap.yaml
new file mode 100644
index 0000000..61a5a13
--- /dev/null
+++ b/charts/aztec-node/templates/otel-collector-configmap.yaml
@@ -0,0 +1,27 @@
+{{- /*
+OpenTelemetry collector config: OTLP/HTTP receiver on :4318, Prometheus exporter on :8889 (metrics pipeline only).
+*/ -}}
+{{- if eq (include "chart.monitoringEnabled" .) "true" -}}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "chart.resourceName" . }}-otel-collector
+  labels:
+    {{- include "chart.labels" . | nindent 4 }}
+data:
+  config.yaml: |
+    receivers:
+      otlp:
+        protocols:
+          http:
+            endpoint: 0.0.0.0:4318
+    exporters:
+      prometheus:
+        endpoint: "0.0.0.0:8889"
+        namespace: "aztec"
+    service:
+      pipelines:
+        metrics:
+          receivers: [otlp]
+          exporters: [prometheus]
+{{- end }}
diff --git a/charts/aztec-node/templates/otel-collector-deployment.yaml b/charts/aztec-node/templates/otel-collector-deployment.yaml
new file mode 100644
index 0000000..c9b5ca8
--- /dev/null
+++ b/charts/aztec-node/templates/otel-collector-deployment.yaml
@@ -0,0 +1,49 @@
+{{- /*
+Single-replica OpenTelemetry collector; its config file comes from the <resourceName>-otel-collector ConfigMap.
+*/ -}}
+{{- if eq (include "chart.monitoringEnabled" .) "true" -}}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "chart.resourceName" . }}-otel-collector
+  labels:
+    {{- include "chart.labels" . | nindent 4 }}
+    app: otel-collector
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      {{- include "chart.selectorLabels" . | nindent 6 }}
+      app: otel-collector
+  template:
+    metadata:
+      labels:
+        {{- include "chart.selectorLabels" . | nindent 8 }}
+        app: otel-collector
+    spec:
+      containers:
+        - name: otel-collector
+          image: otel/opentelemetry-collector-contrib:0.91.0
+          args:
+            - --config=/etc/otel/config.yaml
+          ports:
+            - containerPort: 4318
+              name: otlp-http
+            - containerPort: 8889
+              name: prometheus
+          volumeMounts:
+            - name: config
+              mountPath: /etc/otel/config.yaml
+              subPath: config.yaml
+          resources:
+            requests:
+              cpu: "100m"
+              memory: "256Mi"
+            limits:
+              cpu: "500m"
+              memory: "512Mi"
+      volumes:
+        - name: config
+          configMap:
+            name: {{ include "chart.resourceName" . }}-otel-collector
+{{- end }}
diff --git a/charts/aztec-node/templates/otel-collector-service.yaml b/charts/aztec-node/templates/otel-collector-service.yaml
new file mode 100644
index 0000000..4179a84
--- /dev/null
+++ b/charts/aztec-node/templates/otel-collector-service.yaml
@@ -0,0 +1,23 @@
+{{- /*
+ClusterIP Service for the collector: 4318 (OTLP/HTTP ingest) and 8889 (Prometheus scrape target).
+*/ -}}
+{{- if eq (include "chart.monitoringEnabled" .) "true" -}}
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "chart.resourceName" . }}-otel-collector
+  labels:
+    {{- include "chart.labels" . | nindent 4 }}
+spec:
+  ports:
+    - port: 4318
+      targetPort: 4318
+      name: otlp-http
+    - port: 8889
+      targetPort: 8889
+      name: prometheus
+  selector:
+    {{- include "chart.selectorLabels" . | nindent 4 }}
+    app: otel-collector
+  type: ClusterIP
+{{- end }}
diff --git a/charts/aztec-node/templates/prometheus-configmap.yaml b/charts/aztec-node/templates/prometheus-configmap.yaml
new file mode 100644
index 0000000..7927aed
--- /dev/null
+++ b/charts/aztec-node/templates/prometheus-configmap.yaml
@@ -0,0 +1,34 @@
+{{- /*
+Prometheus config: scrapes the in-chart OTel collector on :8889 and, when centralMonitoring.enabled
+is set, remote_writes to centralMonitoring.promEndpoint. The endpoint and token are quoted so that
+YAML-significant characters in either value cannot corrupt the rendered config.
+NOTE(review): the token ends up in plain text in this ConfigMap; consider a Secret mounted as a file
+and referenced through authorization.credentials_file.
+*/ -}}
+{{- if eq (include "chart.monitoringEnabled" .) "true" -}}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "chart.resourceName" . }}-prometheus
+  labels:
+    {{- include "chart.labels" . | nindent 4 }}
+data:
+  prometheus.yaml: |
+    global:
+      scrape_interval: 30s
+      evaluation_interval: 30s
+    {{- if .Values.centralMonitoring.enabled }}
+    remote_write:
+      - url: {{ $.Values.centralMonitoring.promEndpoint | quote }}
+        authorization:
+          credentials: {{ $.Values.centralMonitoring.token | quote }}
+    {{- end }}
+    scrape_configs:
+      - job_name: '{{ include "chart.resourceName" . }}'
+        static_configs:
+          - targets: ['{{ include "chart.resourceName" . }}-otel-collector:8889']
+        relabel_configs:
+          - target_label: 'aztec_role'
+            replacement: '{{ .Values.role }}'
+          - target_label: 'aztec_network'
+            replacement: '{{ .Values.networkName }}'
+{{- end }}
diff --git a/charts/aztec-node/templates/prometheus-deployment.yaml b/charts/aztec-node/templates/prometheus-deployment.yaml
new file mode 100644
index 0000000..147b7e8
--- /dev/null
+++ b/charts/aztec-node/templates/prometheus-deployment.yaml
@@ -0,0 +1,44 @@
+{{- /*
+Single-replica Prometheus; its config file comes from the <resourceName>-prometheus ConfigMap.
+NOTE(review): the container runs as UID 0 and only the ConfigMap is mounted (no data volume);
+confirm that root is actually required and that losing local TSDB data on restart is acceptable.
+*/ -}}
+{{- if eq (include "chart.monitoringEnabled" .) "true" -}}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "chart.resourceName" . }}-prometheus
+  labels:
+    {{- include "chart.labels" . | nindent 4 }}
+    app: prometheus
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      {{- include "chart.selectorLabels" . | nindent 6 }}
+      app: prometheus
+  template:
+    metadata:
+      labels:
+        {{- include "chart.selectorLabels" . | nindent 8 }}
+        app: prometheus
+    spec:
+      containers:
+        - name: prometheus
+          image: prom/prometheus:v3.5.0
+          args:
+            - --config.file=/etc/prometheus/prometheus.yaml
+          ports:
+            - containerPort: 9090
+              protocol: TCP
+          volumeMounts:
+            - name: config
+              mountPath: /etc/prometheus/prometheus.yaml
+              subPath: prometheus.yaml
+          securityContext:
+            runAsUser: 0
+      volumes:
+        - name: config
+          configMap:
+            name: {{ include "chart.resourceName" . }}-prometheus
+{{- end }}
diff --git a/charts/aztec-node/templates/prometheus-service.yaml b/charts/aztec-node/templates/prometheus-service.yaml
new file mode 100644
index 0000000..051c98d
--- /dev/null
+++ b/charts/aztec-node/templates/prometheus-service.yaml
@@ -0,0 +1,20 @@
+{{- /*
+ClusterIP Service exposing the in-chart Prometheus on port 9090.
+*/ -}}
+{{- if eq (include "chart.monitoringEnabled" .) "true" -}}
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "chart.resourceName" . }}-prometheus
+  labels:
+    {{- include "chart.labels" . | nindent 4 }}
+spec:
+  ports:
+    - port: 9090
+      protocol: TCP
+      targetPort: 9090
+  selector:
+    {{- include "chart.selectorLabels" . | nindent 4 }}
+    app: prometheus
+  type: ClusterIP
+{{- end }}
diff --git a/charts/aztec-node/templates/statefulset-prover-agent.yaml b/charts/aztec-node/templates/statefulset-prover-agent.yaml
index 9239b2b..56b1212 100644
--- a/charts/aztec-node/templates/statefulset-prover-agent.yaml
+++ b/charts/aztec-node/templates/statefulset-prover-agent.yaml
@@ -103,6 +103,23 @@ spec:
             {{- end }}
             - name: DATA_DIRECTORY
               value: {{ .Values.node.storage.dataDirectory | quote }}
+            {{- /* Telemetry env: set when the in-chart collector is deployed or an external OTLP endpoint is configured. */}}
+            {{- if or (eq (include "chart.monitoringEnabled" .) "true") .Values.node.metrics.otelCollectorEndpoint }}
+            - name: OTEL_EXCLUDE_METRICS
+              value: {{ .Values.node.metrics.otelExcludeMetrics | quote }}
+            {{- if eq (include "chart.monitoringEnabled" .) "true" }}
+            {{- /* The in-chart collector defines only a metrics pipeline, so traces/logs are not pointed at it. */}}
+            - name: OTEL_EXPORTER_OTLP_METRICS_ENDPOINT
+              value: "http://{{ include "chart.resourceName" . }}-otel-collector:4318/v1/metrics"
+            {{- else }}
+            - name: OTEL_EXPORTER_OTLP_METRICS_ENDPOINT
+              value: "{{ .Values.node.metrics.otelCollectorEndpoint }}/v1/metrics"
+            - name: OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
+              value: "{{ .Values.node.metrics.otelCollectorEndpoint }}/v1/traces"
+            - name: OTEL_EXPORTER_OTLP_LOGS_ENDPOINT
+              value: "{{ .Values.node.metrics.otelCollectorEndpoint }}/v1/logs"
+            {{- end }}
+            {{- end }}
           resources:
             {{- if .Values.prover.agent.resources }}
             {{- toYaml .Values.prover.agent.resources | nindent 12 }}
diff --git a/charts/aztec-node/templates/statefulset-prover-broker.yaml b/charts/aztec-node/templates/statefulset-prover-broker.yaml
index d82770b..876e69f 100644
--- a/charts/aztec-node/templates/statefulset-prover-broker.yaml
+++ b/charts/aztec-node/templates/statefulset-prover-broker.yaml
@@ -112,6 +112,23 @@ spec:
             {{- end }}
             - name: DATA_DIRECTORY
               value: {{ .Values.node.storage.dataDirectory | quote }}
+            {{- /* Telemetry env: set when the in-chart collector is deployed or an external OTLP endpoint is configured. */}}
+            {{- if or (eq (include "chart.monitoringEnabled" .) "true") .Values.node.metrics.otelCollectorEndpoint }}
+            - name: OTEL_EXCLUDE_METRICS
+              value: {{ .Values.node.metrics.otelExcludeMetrics | quote }}
+            {{- if eq (include "chart.monitoringEnabled" .) "true" }}
+            {{- /* The in-chart collector defines only a metrics pipeline, so traces/logs are not pointed at it. */}}
+            - name: OTEL_EXPORTER_OTLP_METRICS_ENDPOINT
+              value: "http://{{ include "chart.resourceName" . }}-otel-collector:4318/v1/metrics"
+            {{- else }}
+            - name: OTEL_EXPORTER_OTLP_METRICS_ENDPOINT
+              value: "{{ .Values.node.metrics.otelCollectorEndpoint }}/v1/metrics"
+            - name: OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
+              value: "{{ .Values.node.metrics.otelCollectorEndpoint }}/v1/traces"
+            - name: OTEL_EXPORTER_OTLP_LOGS_ENDPOINT
+              value: "{{ .Values.node.metrics.otelCollectorEndpoint }}/v1/logs"
+            {{- end }}
+            {{- end }}
           ports:
             - containerPort: {{ .Values.service.httpPort }}
               name: http-rpc
diff --git a/charts/aztec-node/templates/statefulset-prover-node.yaml b/charts/aztec-node/templates/statefulset-prover-node.yaml
index 736903f..2f0b5ae 100644
--- a/charts/aztec-node/templates/statefulset-prover-node.yaml
+++ b/charts/aztec-node/templates/statefulset-prover-node.yaml
@@ -142,6 +142,23 @@ spec:
               value: {{ .Values.node.storage.dataStoreMapSize | quote }}
             - name: WS_DB_MAP_SIZE_KB
               value: {{ .Values.node.storage.worldStateMapSize | quote }}
+            {{- /* Telemetry env: set when the in-chart collector is deployed or an external OTLP endpoint is configured. */}}
+            {{- if or (eq (include "chart.monitoringEnabled" .) "true") .Values.node.metrics.otelCollectorEndpoint }}
+            - name: OTEL_EXCLUDE_METRICS
+              value: {{ .Values.node.metrics.otelExcludeMetrics | quote }}
+            {{- if eq (include "chart.monitoringEnabled" .) "true" }}
+            {{- /* The in-chart collector defines only a metrics pipeline, so traces/logs are not pointed at it. */}}
+            - name: OTEL_EXPORTER_OTLP_METRICS_ENDPOINT
+              value: "http://{{ include "chart.resourceName" . }}-otel-collector:4318/v1/metrics"
+            {{- else }}
+            - name: OTEL_EXPORTER_OTLP_METRICS_ENDPOINT
+              value: "{{ .Values.node.metrics.otelCollectorEndpoint }}/v1/metrics"
+            - name: OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
+              value: "{{ .Values.node.metrics.otelCollectorEndpoint }}/v1/traces"
+            - name: OTEL_EXPORTER_OTLP_LOGS_ENDPOINT
+              value: "{{ .Values.node.metrics.otelCollectorEndpoint }}/v1/logs"
+            {{- end }}
+            {{- end }}
           ports:
             - containerPort: {{ .Values.service.httpPort }}
               name: http-rpc
diff --git a/charts/aztec-node/templates/statefulset.yaml b/charts/aztec-node/templates/statefulset.yaml
index 79fe205..01e1daf 100644
--- a/charts/aztec-node/templates/statefulset.yaml
+++ b/charts/aztec-node/templates/statefulset.yaml
@@ -185,9 +185,15 @@ spec:
             {{- end }}
             - name: USE_GCLOUD_LOGGING
               value: {{ .Values.node.metrics.useGcloudLogging | quote }}
-            {{- if .Values.node.metrics.otelCollectorEndpoint }}
+            {{- /* Telemetry env: set when the in-chart collector is deployed or an external OTLP endpoint is configured. */}}
+            {{- if or (eq (include "chart.monitoringEnabled" .) "true") .Values.node.metrics.otelCollectorEndpoint }}
             - name: OTEL_EXCLUDE_METRICS
               value: {{ .Values.node.metrics.otelExcludeMetrics | quote }}
+            {{- if eq (include "chart.monitoringEnabled" .) "true" }}
+            {{- /* The in-chart collector defines only a metrics pipeline, so traces/logs are not pointed at it. */}}
+            - name: OTEL_EXPORTER_OTLP_METRICS_ENDPOINT
+              value: "http://{{ include "chart.resourceName" . }}-otel-collector:4318/v1/metrics"
+            {{- else }}
             - name: OTEL_EXPORTER_OTLP_METRICS_ENDPOINT
               value: "{{ .Values.node.metrics.otelCollectorEndpoint }}/v1/metrics"
             - name: OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
@@ -195,6 +201,7 @@ spec:
             - name: OTEL_EXPORTER_OTLP_LOGS_ENDPOINT
               value: "{{ .Values.node.metrics.otelCollectorEndpoint }}/v1/logs"
             {{- end }}
+            {{- end }}
             {{- if .Values.node.coinbase }}
             - name: COINBASE
               value: {{ .Values.node.coinbase | quote }}
diff --git a/charts/aztec-node/templates/validate.yaml b/charts/aztec-node/templates/validate.yaml
new file mode 100644
index 0000000..4a9b89e
--- /dev/null
+++ b/charts/aztec-node/templates/validate.yaml
@@ -0,0 +1,24 @@
+{{- /* Fail at render time when central monitoring is enabled without the settings remote_write needs. */ -}}
+{{- if .Values.centralMonitoring.enabled }}
+{{- if not .Values.centralMonitoring.token }}
+{{- fail ".Values.centralMonitoring.token is required when centralMonitoring is enabled" }}
+{{- end }}
+{{- if not .Values.centralMonitoring.promEndpoint }}
+{{- fail ".Values.centralMonitoring.promEndpoint is required when centralMonitoring is enabled" }}
+{{- end }}
+{{- end }}
+
+{{/*
+Helper to determine if monitoring stack should be deployed.
+Monitoring is enabled if either:
+- monitoring.enabled is true
+- centralMonitoring.enabled is true (auto-enables monitoring)
+Always renders exactly "true" or "false", so callers can compare the result with eq.
+*/}}
+{{- define "chart.monitoringEnabled" -}}
+{{- if or .Values.monitoring.enabled .Values.centralMonitoring.enabled -}}
+true
+{{- else -}}
+false
+{{- end -}}
+{{- end -}}
diff --git a/charts/aztec-node/values.yaml b/charts/aztec-node/values.yaml
index a9621dc..a7e8f5d 100644
--- a/charts/aztec-node/values.yaml
+++ b/charts/aztec-node/values.yaml
@@ -328,3 +328,26 @@ serviceAccount:
   name: ""
   # -- Annotations for the service account
   annotations: {}
+
+# =====================================================================
+# MONITORING
+# =====================================================================
+
+# -- Local Monitoring Stack
+# Deploys OTEL collector + Prometheus for metrics collection
+monitoring:
+  # -- Specifies whether local monitoring should be enabled
+  # Deploys OTEL collector (receives OTLP, exposes Prometheus metrics on :8889)
+  # and Prometheus (scrapes OTEL collector on :8889, exposes :9090)
+  enabled: false
+
+# -- Central Monitoring (Obol)
+# Configures Prometheus remote_write to central Obol endpoint
+# Automatically enables monitoring stack if not already enabled
+centralMonitoring:
+  # -- Specifies whether central monitoring should be enabled
+  enabled: false
+  # -- https endpoint to obol central prometheus
+  promEndpoint: "https://vm.monitoring.gcp.obol.tech/write"
+  # -- The authentication token to the central prometheus
+  token: ""