diff --git a/docker/Makefile b/docker/Makefile index a454442827..70ee46bcaf 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -1,4 +1,4 @@ -.PHONY: up down wait-compose-ready restart logs postgres build-hyperlane-cli build-celestia-devnet-images push-celestia-devnet-images +.PHONY: up down wait-compose-ready restart logs postgres enable-chaos-std enable-chaos-brutal restart-toxiproxy build-hyperlane-cli build-celestia-devnet-images push-celestia-devnet-images PROJECT_ROOT := $(shell git rev-parse --show-toplevel) DOCKER_COMPOSE_DIR := $(PROJECT_ROOT)/docker @@ -47,20 +47,25 @@ postgres: @echo "Starting fresh PostgreSQL environment..." docker compose -f $(DOCKER_COMPOSE_DIR)/docker-compose.postgres.yml up +CHAOS_DIR := $(PROJECT_ROOT)/scripts/chaos +DOCKER_CHAOS_ENV := LISTEN_ADDR=0.0.0.0 POSTGRES_UPSTREAM=host.docker.internal:5432 + +# Steady DA latency on the primary rollup (toxi_scenario P5). enable-chaos-std: - ./toxiproxy/remove_toxics.sh - ./toxiproxy/enable_toxics.sh - ./toxiproxy/status_chaos.sh + $(CHAOS_DIR)/toxi_scenario.sh clear primary + $(CHAOS_DIR)/toxi_scenario.sh scenario P5 primary + $(CHAOS_DIR)/toxi_scenario.sh list +# Compound failure: DA latency + postgres resets (toxi_scenario P6). enable-chaos-brutal: - ./toxiproxy/remove_toxics.sh - TIMEOUT_RATIO=0.9 LATENCY_RATIO=0.1 LIMIT_DATA_RATIO=0.4 ./toxiproxy/enable_toxics.sh - ./toxiproxy/status_chaos.sh + $(CHAOS_DIR)/toxi_scenario.sh clear primary + $(CHAOS_DIR)/toxi_scenario.sh scenario P6 primary + $(CHAOS_DIR)/toxi_scenario.sh list restart-toxiproxy: docker compose restart toxiproxy sleep 5 - TOXIPROXY_HOST="localhost" ./toxiproxy/configure.sh + $(DOCKER_CHAOS_ENV) $(CHAOS_DIR)/toxi_apply_config.sh # Needs docker logged in # echo $GITHUB_TOKEN | docker login ghcr.io -u USERNAME --password-stdin diff --git a/docker/README.md b/docker/README.md index 2b460670d2..706349001a 100644 --- a/docker/README.md +++ b/docker/README.md @@ -63,46 +63,52 @@ updated during consecutive runs. 
## Chaos Engineering -[Toxiproxy](https://github.com/Shopify/toxiproxy) enables chaos engineering by simulating network failures and instabilities. -Use it to test how the rollup behaves when the connection to celestia-node is unreliable. +[Toxiproxy](https://github.com/Shopify/toxiproxy) sits between the rollup and its +upstreams (Postgres + Celestia DA RPC/gRPC) so we can inject latency, timeouts, and +connection resets. The toxiproxy scripts live in [`../scripts/chaos/`](../scripts/chaos/) +and work for both baremetal and docker — baremetal is the default; docker mode is +selected via env vars. ### Setup -1. Uncomment the toxiproxy service in [`docker-compose.yml`](./docker-compose.yml) -2. Configure your rollup to connect to port `26659` (proxied) instead of `26658` (direct) +1. Uncomment the `toxiproxy` service in [`docker-compose.yml`](./docker-compose.yml). +2. Start it: `docker compose up -d toxiproxy`. +3. Populate the seven proxies from the host shell: + ```bash + LISTEN_ADDR=0.0.0.0 POSTGRES_UPSTREAM=host.docker.internal:5432 \ + ../scripts/chaos/toxi_apply_config.sh + ``` +4. Point the rollup at the proxied ports (`5433` for postgres, `26678` for celestia + RPC, `9091` for celestia gRPC). ### Usage -The proxy starts without any network toxics enabled. 
Use the provided scripts to control network conditions: +Apply chaos via the scenario CLI (or the `make` shortcuts below): ```bash -# Enable standard toxics (light network issues) -docker/toxiproxy/enable_standard_toxics.sh +# Steady DA latency on the primary rollup +make enable-chaos-std -# Enable brutal toxics (severe network issues) -docker/toxiproxy/enable_brutal_toxics.sh +# Compound failure: DA latency + postgres connection resets +make enable-chaos-brutal -# Remove all toxics (restore normal network) -docker/toxiproxy/remove_toxics.sh - -# Check current toxic status -docker/toxiproxy/status_chaos.sh +# Or run scenarios directly: +../scripts/chaos/toxi_scenario.sh scenario P5 primary +../scripts/chaos/toxi_scenario.sh clear all +../scripts/chaos/toxi_scenario.sh list ``` -Available toxic types include latency, timeouts, connection resets, and bandwidth limiting. -This allows you to test rollup resilience under various network failure scenarios. +See `../scripts/chaos/toxi_scenario.sh --help` for the full list of named toxics +(`rpc-latency`, `rpc-timeout`, `pg-reset`, `pg-latency`) and scenarios (`P1`–`P7`, +`R1`–`R2`). ### Troubleshooting -**Toxiproxy crashes when adding toxics:** -- This happens when trying to add toxics to a proxy with active connections -- Solution: Restart toxiproxy and try again: - ```bash - docker compose restart toxiproxy - # Wait a few seconds, then try adding toxics again - ``` - -**Best practices:** -- Add toxics immediately after starting toxiproxy, before connections are established -- Use the remove script to clean up toxics before stopping services -- Monitor toxiproxy logs for crash indicators: `docker compose logs toxiproxy` +**Toxiproxy crashes when adding toxics to a proxy with active connections:** +```bash +make restart-toxiproxy +``` +This restarts the container and re-populates all seven proxies in one shot. + +Add toxics immediately after starting toxiproxy, before connections are established. 
+Tail logs with `docker compose logs toxiproxy`. diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 863f18a00b..79a65c676d 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -1,21 +1,24 @@ include: - docker-compose.celestia.yml -# Uncomment this if you want to test network delays/errors +# Uncomment to test network delays/errors against the rollup (see ../scripts/chaos/README.md). +# After `docker compose up -d toxiproxy`, populate the proxies from the host shell: +# +# LISTEN_ADDR=0.0.0.0 POSTGRES_UPSTREAM=host.docker.internal:5432 \ +# ../scripts/chaos/toxi_apply_config.sh +# +# Then apply scenarios with ../scripts/chaos/toxi_scenario.sh. #services: # toxiproxy: -# image: shopify/toxiproxy:2.1.4 +# image: ghcr.io/shopify/toxiproxy:2.12.0 # hostname: toxiproxy # environment: # LOG_LEVEL: "debug" -# depends_on: -# - sequencer-0 # ports: -# - "127.0.0.1:26659:26659" -# - "127.0.0.1:8474:8474" -# toxiproxy-config: -# image: curlimages/curl:8.9.1 -# depends_on: -# - toxiproxy -# volumes: -# - ./toxiproxy:/opt/toxiproxy -# command: [ "/opt/toxiproxy/configure.sh" ] +# - "127.0.0.1:8474:8474" # admin API +# - "127.0.0.1:5433:5433" # postgres_1 +# - "127.0.0.1:5434:5434" # postgres_2 +# - "127.0.0.1:5435:5435" # postgres_3 +# - "127.0.0.1:26678:26678" # celestia_rpc_1 +# - "127.0.0.1:26679:26679" # celestia_rpc_2 +# - "127.0.0.1:9091:9091" # celestia_grpc_1 +# - "127.0.0.1:9092:9092" # celestia_grpc_2 diff --git a/docker/toxiproxy/configure.sh b/docker/toxiproxy/configure.sh deleted file mode 100755 index 2727765a6d..0000000000 --- a/docker/toxiproxy/configure.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/sh -set -e - -# This script just enables proxies, but does not install any toxics - -echo "Configuring toxiproxy on standard mode for sequencer-0:\n" -TOXIPROXY_HOST="${TOXIPROXY_HOST:-toxiproxy}" - -echo "Creating proxy..." 
-curl -v --fail -H "Content-Type: application/json" -d '{"name" : "sequencer-0", "listen" : "0.0.0.0:26659", "upstream" : "sequencer-0:26658"}' http://$TOXIPROXY_HOST:8474/proxies - -echo "\n\n===== Final Configuration =====" -curl -s http://$TOXIPROXY_HOST:8474/proxies/sequencer-0 - -echo "\n\n=====\nConfiguration is completed!" \ No newline at end of file diff --git a/docker/toxiproxy/enable_toxics.sh b/docker/toxiproxy/enable_toxics.sh deleted file mode 100755 index 0106d285de..0000000000 --- a/docker/toxiproxy/enable_toxics.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/bin/sh -set -e - -TOXIPROXY_HOST="${TOXIPROXY_HOST:-127.0.0.1}" -PROXY_TARGET="${PROXY_TARGET:-sequencer-0}" -TIMEOUT_RATIO="${TIMEOUT_RATIO:-0.1}" -LATENCY_RATIO="${LATENCY_RATIO:-0.1}" -LIMIT_DATA_RATIO="${LIMIT_DATA_RATIO:-0.3}" - - -echo "⚠️ WARNING: Adding toxics to a proxy with active connections may cause toxiproxy to crash." -echo "🔄 If you encounter crashes, restart the toxiproxy service and try again." -echo "" - -# Check if toxiproxy is accessible -echo "🔍 Checking toxiproxy connectivity..." -if ! curl -s --fail --connect-timeout 5 http://$TOXIPROXY_HOST:8474/proxies >/dev/null 2>&1; then - echo "❌ Cannot connect to toxiproxy at $TOXIPROXY_HOST:8474" - echo "💡 Make sure toxiproxy is running: docker compose up toxiproxy" - exit 1 -fi - -# Check if proxy exists -if ! curl -s --fail http://$TOXIPROXY_HOST:8474/proxies/$PROXY_TARGET >/dev/null 2>&1; then - echo "❌ Proxy '$PROXY_TARGET' not found" - echo "💡 Make sure to run configure.sh first to create the proxy" - exit 1 -fi - -echo "✅ Toxiproxy is accessible and proxy exists" -echo "" - -# Function to add toxic with error handling -add_toxic() { - local toxic_type="$1" - local toxic_data="$2" - - echo "🧪 Adding $toxic_type toxic..." 
- - # Use temporary files to capture curl output and HTTP code separately - local temp_response=$(mktemp) - local temp_stderr=$(mktemp) - - # Make the curl request - local http_code - http_code=$(curl -s -w "%{http_code}" -H "Content-Type: application/json" \ - -d "$toxic_data" \ - -o "$temp_response" \ - http://$TOXIPROXY_HOST:8474/proxies/$PROXY_TARGET/toxics 2>"$temp_stderr") - - local response_body=$(cat "$temp_response") - local stderr_output=$(cat "$temp_stderr") - - if [ "$http_code" -eq 200 ] || [ "$http_code" -eq 201 ]; then - echo "✅ $toxic_type toxic added successfully" - else - echo "❌ Failed to add $toxic_type toxic (HTTP $http_code)" - if [ -n "$response_body" ]; then - echo "🔍 API response: $response_body" - fi - if [ -n "$stderr_output" ]; then - echo "🔍 Curl error: $stderr_output" - fi - echo "💡 Try restarting toxiproxy: make restart-toxiproxy" - - # Clean up temp files - rm -f "$temp_response" "$temp_stderr" - return 1 - fi - - # Clean up temp files - rm -f "$temp_response" "$temp_stderr" -} - -# Add toxics with better error handling -add_toxic "timeout" "{\"type\": \"timeout\", \"toxicity\": $TIMEOUT_RATIO, \"attributes\": {\"timeout\": 30000}}" || exit 1 -sleep 1 -add_toxic "latency" "{\"type\": \"latency\", \"toxicity\": $LATENCY_RATIO, \"attributes\": {\"latency\": 65000}}" || exit 1 -sleep 1 -add_toxic "limit_data" "{\"type\": \"limit_data\", \"toxicity\": $LIMIT_DATA_RATIO, \"attributes\": {\"bytes\": 5000}}" || exit 1 - -echo "" -echo "📋 Final Configuration:" -curl -s "http://$TOXIPROXY_HOST:8474/proxies/$PROXY_TARGET/toxics" - -echo "" -echo "✅ Standard toxics configuration completed!" 
diff --git a/docker/toxiproxy/remove_toxics.sh b/docker/toxiproxy/remove_toxics.sh deleted file mode 100755 index 79d228b005..0000000000 --- a/docker/toxiproxy/remove_toxics.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/sh -set -e - -TOXIPROXY_HOST=${TOXIPROXY_HOST:-127.0.0.1} -PROXY_NAME="sequencer-0" - -echo "🧪 Removing all toxics from proxy: $PROXY_NAME" - -if ! curl -s --fail http://$TOXIPROXY_HOST:8474/proxies/$PROXY_NAME > /dev/null 2>&1; then - echo "❌ Error: Proxy '$PROXY_NAME' does not exist" - exit 1 -fi - -# Get all toxics for the proxy -echo "📊 Fetching current toxics..." -toxics_response=$(curl -s --fail http://$TOXIPROXY_HOST:8474/proxies/$PROXY_NAME/toxics) - -if [ "$toxics_response" = "[]" ]; then - echo "No toxics found for proxy '$PROXY_NAME'" - exit 0 -fi - -# Parse toxic names using jq -toxic_names=$(echo "$toxics_response" | jq -r '.[].name') - -if [ -z "$toxic_names" ]; then - echo "No toxics found for proxy '$PROXY_NAME'" - exit 0 -fi - -echo "Found toxics to remove:" -echo "$toxic_names" - -# Remove each toxic -for toxic_name in $toxic_names; do - echo "Removing toxic: $toxic_name" - if curl -s --fail -X DELETE http://$TOXIPROXY_HOST:8474/proxies/$PROXY_NAME/toxics/$toxic_name > /dev/null; then - echo " ✓ Removed toxic: $toxic_name" - else - echo " ✗ Failed to remove toxic: $toxic_name" - fi -done - -echo "📋 Final Configuration:" -curl -s http://$TOXIPROXY_HOST:8474/proxies/$PROXY_NAME/toxics -echo "\n✅ All toxics have been removed from proxy '$PROXY_NAME'" diff --git a/docker/toxiproxy/status_chaos.sh b/docker/toxiproxy/status_chaos.sh deleted file mode 100755 index f680a6f691..0000000000 --- a/docker/toxiproxy/status_chaos.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/sh - -echo "Checking toxiproxy status..." -echo "" - -# Check if toxiproxy is running -if ! 
curl -s http://localhost:8474/proxies >/dev/null 2>&1; then - echo "❌ Toxiproxy is not reachable at localhost:8474" - exit 1 -fi - -# Get proxy status -echo "📊 Proxy Status:" -proxy_info=$(curl -s http://localhost:8474/proxies/sequencer-0) -echo "$proxy_info" | grep -o '"enabled":[^,]*' | cut -d: -f2 - -# Show active toxics -echo "" -echo "🧪 Active Toxics:" -curl -s http://localhost:8474/proxies/sequencer-0/toxics | \ - jq -r '.[].name' | \ - while read toxic; do - echo " - $toxic" - done - -echo "" -echo "📋 Full Configuration:" -curl -s http://localhost:8474/proxies/sequencer-0 \ No newline at end of file diff --git a/scripts/chaos/clean_rollup_data.sh b/scripts/chaos/clean_rollup_data.sh new file mode 100755 index 0000000000..db6bdf1892 --- /dev/null +++ b/scripts/chaos/clean_rollup_data.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash +# clean_rollup_data.sh — wipe rollup state directories + sequencer postgres schema. +# +# Reads paths from a rollup TOML config: +# storage.path +# da.connection_string (sqlite://...) +# sequencer.preferred.postgres_config.postgres_connection_string +# +# Env: +# MODE=baremetal|docker baremetal (default) runs psql directly; +# docker runs psql inside a compose service. +# PG_DOCKER_SERVICE compose service name when MODE=docker (default: postgres) +# RM_LOG log file to remove (default: /var/log/rollup.log; skipped if missing) +# +# Requires: yq (mikefarah/yq, parses TOML), psql, plus docker (when MODE=docker). 
+
+set -euo pipefail
+
+die() { echo "error: $*" >&2; exit 1; }
+
+if [ $# -ne 1 ]; then
+  echo "Usage: $0 <config.toml>" >&2
+  exit 1
+fi
+
+CONFIG_PATH="$1"
+[ -f "$CONFIG_PATH" ] || die "Config file not found: $CONFIG_PATH"
+
+command -v yq >/dev/null || die "yq required (https://github.com/mikefarah/yq)"
+
+MODE="${MODE:-baremetal}"
+PG_DOCKER_SERVICE="${PG_DOCKER_SERVICE:-postgres}"
+RM_LOG="${RM_LOG:-/var/log/rollup.log}"
+
+# Validate MODE and require only the client this mode runs on the host:
+# baremetal invokes psql directly, whereas docker runs psql inside the
+# $PG_DOCKER_SERVICE compose service via `docker compose exec`, so a
+# host-side psql must not be demanded there.
+case "$MODE" in
+  baremetal) command -v psql >/dev/null || die "psql required" ;;
+  docker) command -v docker >/dev/null || die "docker required for MODE=docker" ;;
+  *) die "MODE must be baremetal or docker (got '$MODE')" ;;
+esac
+
+STORAGE_PATH=$(yq -p toml -oy '.storage.path' "$CONFIG_PATH")
+DA_CONN=$(yq -p toml -oy '.da.connection_string' "$CONFIG_PATH")
+SEQUENCER_ADDR=$(yq -p toml -oy '.sequencer.preferred.postgres_config.postgres_connection_string' "$CONFIG_PATH")
+
+# Extract sqlite file path from connection string like "sqlite:///mnt/da/demo_mock_da.sqlite?mode=rwc"
+DA_PATH="${DA_CONN#sqlite://}"
+DA_PATH="${DA_PATH%%\?*}"
+
+echo "Cleaning rollup dbs (MODE=$MODE)"
+echo " storage.path = $STORAGE_PATH"
+echo " mock_da path = $DA_PATH"
+echo " sequencer postgres = $SEQUENCER_ADDR"
+
+if [ -n "$STORAGE_PATH" ] && [ "$STORAGE_PATH" != "null" ]; then
+  rm -rf "${STORAGE_PATH:?}"/*
+fi
+
+if [ -n "$DA_PATH" ] && [ "$DA_PATH" != "null" ]; then
+  # Explicit triplet — `rm -rf "$DA_PATH"*` would also nuke same-prefix neighbours.
+ rm -f "$DA_PATH" "${DA_PATH}-shm" "${DA_PATH}-wal" +fi + +if [ -f "$RM_LOG" ]; then + rm -f "$RM_LOG" +fi + +echo "Cleaning sequencer's postgresql" +psql_cmd=(psql -v ON_ERROR_STOP=1 "$SEQUENCER_ADDR") +if [ "$MODE" = "docker" ]; then + psql_cmd=(docker compose exec -T "$PG_DOCKER_SERVICE" psql -v ON_ERROR_STOP=1 "$SEQUENCER_ADDR") +fi + +"${psql_cmd[@]}" <&2; usage ;; + esac +done + +command -v curl >/dev/null || { echo "curl required" >&2; exit 1; } + +stop_service() { + if $FORCE_KILL; then + echo "$(date): Force-killing service" + eval "$KILL_CMD" || true + sleep 1 + else + echo "$(date): Stopping service gracefully" + eval "$STOP_CMD" + fi +} + +while true; do + stop_service + + sleep_sec=$((RANDOM % 5 + 3)) + echo "$(date): Sleeping $sleep_sec seconds" + sleep "$sleep_sec" + + echo "$(date): Starting service" + eval "$START_CMD" + + echo "$(date): Waiting for ready (timeout: ${READY_TIMEOUT}s)" + start_time=$(date +%s) + while true; do + if curl --fail -s -o /dev/null --max-time 5 "$READY_URL"; then + echo "$(date): Ready" + break + fi + if (( $(date +%s) - start_time >= READY_TIMEOUT )); then + echo "$(date): Timeout waiting for ready" >&2 + exit 1 + fi + sleep 1 + done + + sleep_min=$((RANDOM % 6 + 5)) + echo "$(date): Sleeping $sleep_min minutes" + sleep "${sleep_min}m" +done diff --git a/scripts/chaos/periodic_sigkill_on_log_message.sh b/scripts/chaos/periodic_sigkill_on_log_message.sh new file mode 100755 index 0000000000..69a77c9fa9 --- /dev/null +++ b/scripts/chaos/periodic_sigkill_on_log_message.sh @@ -0,0 +1,120 @@ +#!/usr/bin/env bash +# periodic_sigkill_on_log_message.sh — SIGKILL the rollup process when a log line matches. +# +# Loop: tail -f LOG_FILE, grep for PATTERN, optional random delay, KILL_CMD, +# wait for restart via START_CMD + READY_URL, sleep, repeat. +# +# A log file path is REQUIRED (-l). 
For docker, redirect compose logs into a file: +# docker compose logs -f rollup > /tmp/rollup.log & +# ./periodic_sigkill_on_log_message.sh -l /tmp/rollup.log -p "Commiting a group" +# +# Env (all overridable; defaults are baremetal+systemd): +# KILL_CMD SIGKILL default: bash -c 'pid=$(systemctl show -p MainPID --value rollup); +# [ "$pid" -gt 0 ] && kill -9 "$pid" || true' +# START_CMD start default: systemctl start rollup +# READY_URL readiness probe default: http://127.0.0.1:12346/sequencer/ready +# READY_TIMEOUT seconds default: 120 +# LOG_WAIT_TIMEOUT seconds default: 300 + +set -euo pipefail + +# shellcheck disable=SC2016 # the inner $pid expansion happens in the bash -c subshell, not here +KILL_CMD="${KILL_CMD:-bash -c 'pid=\$(systemctl show -p MainPID --value rollup); [ \"\$pid\" -gt 0 ] && kill -9 \"\$pid\" || true'}" +START_CMD="${START_CMD:-systemctl start rollup}" +READY_URL="${READY_URL:-http://127.0.0.1:12346/sequencer/ready}" +READY_TIMEOUT="${READY_TIMEOUT:-120}" +LOG_WAIT_TIMEOUT="${LOG_WAIT_TIMEOUT:-300}" + +LOG_FILE="" +LOG_PATTERN="Commiting a group" +DELAY_MIN_MS=0 +DELAY_MAX_MS=0 + +usage() { + cat <&2 +Usage: $0 -l|--log FILE [-p|--pattern PATTERN] [-d|--delay MIN_MS MAX_MS] + -l, --log FILE Log file to watch (REQUIRED) + -p, --pattern PATTERN Log pattern to match (default: $LOG_PATTERN) + -d, --delay MIN MAX Sleep random ms in [MIN,MAX] after match before kill + +Override restart behavior via KILL_CMD / START_CMD / READY_URL env vars. 
+EOF
+  exit 1
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    -d|--delay)
+      DELAY_MIN_MS="$2"; DELAY_MAX_MS="$3"
+      # Reject bad ranges up front: the modulo in kill_on_log_pattern needs MAX - MIN + 1 >= 1.
+      (( DELAY_MIN_MS >= 0 && DELAY_MIN_MS <= DELAY_MAX_MS )) || { echo "error: --delay needs 0 <= MIN_MS <= MAX_MS" >&2; exit 1; }
+      shift 3 ;;
+    -l|--log)
+      LOG_FILE="$2"
+      shift 2
+      ;;
+    -p|--pattern)
+      LOG_PATTERN="$2"
+      shift 2
+      ;;
+    -h|--help) usage ;;
+    *) echo "Unknown option: $1" >&2; usage ;;
+  esac
+done
+
+[ -n "$LOG_FILE" ] || { echo "error: --log is required" >&2; usage; }
+[ -f "$LOG_FILE" ] || { echo "error: log file not found: $LOG_FILE" >&2; exit 1; }
+
+command -v curl >/dev/null || { echo "curl required" >&2; exit 1; }
+
+kill_on_log_pattern() {
+  echo "$(date): Waiting for pattern: '$LOG_PATTERN' (timeout: ${LOG_WAIT_TIMEOUT}s)"
+  # File and pattern travel as positional args, never spliced into the script text, so quotes in either are safe.
+  if ! timeout "$LOG_WAIT_TIMEOUT" bash -c 'tail -n 0 -f "$1" | grep -m 1 -q -- "$2"' _ "$LOG_FILE" "$LOG_PATTERN"; then
+    echo "$(date): Timeout waiting for log pattern" >&2
+    exit 1
+  fi
+
+  echo "$(date): Pattern matched"
+
+  if [[ "$DELAY_MAX_MS" -gt 0 ]]; then
+    delay_ms=$((RANDOM % (DELAY_MAX_MS - DELAY_MIN_MS + 1) + DELAY_MIN_MS))
+    # `sleep` accepts decimals in coreutils; fractional ms is overkill here.
+    delay_sec="$((delay_ms / 1000)).$(printf '%03d' "$((delay_ms % 1000))")"
+    echo "$(date): Delaying ${delay_ms}ms"
+    sleep "$delay_sec"
+  fi
+
+  echo "$(date): Killing rollup (SIGKILL)"
+  eval "$KILL_CMD" || true
+  sleep 1
+}
+
+while true; do
+  kill_on_log_pattern
+
+  sleep_sec=$((RANDOM % 5 + 3))
+  echo "$(date): Sleeping $sleep_sec seconds"
+  sleep "$sleep_sec"
+
+  echo "$(date): Starting rollup"
+  eval "$START_CMD"
+
+  echo "$(date): Waiting for ready (timeout: ${READY_TIMEOUT}s)"
+  start_time=$(date +%s)
+  while true; do
+    if curl --fail -s -o /dev/null --max-time 5 "$READY_URL"; then
+      echo "$(date): Ready"
+      break
+    fi
+    if (( $(date +%s) - start_time >= READY_TIMEOUT )); then
+      echo "$(date): Timeout waiting for ready" >&2
+      exit 1
+    fi
+    sleep 1
+  done
+
+  sleep_min=$((RANDOM % 6 + 5))
+  echo "$(date): Sleeping $sleep_min minutes"
+  sleep "${sleep_min}m"
+done
diff --git a/scripts/chaos/run_generator_continuously.sh b/scripts/chaos/run_generator_continuously.sh
new file mode 100755
index 0000000000..546a33d795
--- /dev/null
+++ b/scripts/chaos/run_generator_continuously.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+# run_generator_continuously.sh — keep the soak-test generator alive in a loop.
+#
+# Builds sov-soak-testing/generator (no-op if cached), then loops:
+#   spawn generator → on crash, restart with a fresh salt; on clean exit, stop.
+#
+# Env:
+#   LOG_DIR   where to write generator-*.log   default: cwd
+#   LOG_KEEP  how many log files to retain     default: 10
+
+set -euo pipefail
+
+if [ $# -lt 1 ]; then
+  echo "Usage: $0 <api_url> [runtime=demo-celestia] [num_workers=20]" >&2
+  exit 1
+fi
+
+API_URL="$1"
+RUNTIME="${2:-demo-celestia}"
+NUM_WORKERS="${3:-20}"
+LOG_DIR="${LOG_DIR:-.}"
+LOG_KEEP="${LOG_KEEP:-10}"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." 
&& pwd)"
+GENERATOR="$PROJECT_ROOT/target/release/generator"
+
+mkdir -p "$LOG_DIR"
+
+export NO_COLOR=1
+
+echo "[$(date -Iseconds)] Building generator (no-op if up-to-date)..."
+(cd "$PROJECT_ROOT" && cargo build --release -p sov-soak-testing --bin generator)
+
+prune_logs() {
+  # Keep the newest $LOG_KEEP logs: `ls -t` is newest-first, `tail -n +N` skips the first N-1.
+  # Must end in `if`, not `[ ... ] && cmd`: a false test would make the function return 1 and `set -e` would kill the loop.
+  local extra
+  # shellcheck disable=SC2012 # ls -t is the simplest way to sort by mtime here
+  extra="$(ls -t "$LOG_DIR"/generator-"${RUNTIME}"-*.log 2>/dev/null | tail -n "+$((LOG_KEEP + 1))" || true)"
+  if [ -n "$extra" ]; then echo "$extra" | xargs rm -f --; fi
+}
+
+ITER=0
+while true; do
+  ITER=$((ITER + 1))
+  SALT=$(date +%s)
+  LOG_FILE="$LOG_DIR/generator-${RUNTIME}-${ITER}.log"
+  echo "[$(date -Iseconds)] Starting generator (iter=$ITER, salt=$SALT) -> $LOG_FILE"
+  if "$GENERATOR" \
+    --runtime="$RUNTIME" \
+    --api-url="$API_URL" \
+    --num-workers="$NUM_WORKERS" \
+    --salt="$SALT" \
+    --validity-profile=clean \
+    --tx-type=bank \
+    --restart-after-seconds=21600 \
+    >"$LOG_FILE" 2>&1; then
+    echo "[$(date -Iseconds)] Generator exited cleanly, stopping loop"
+    break
+  fi
+  echo "[$(date -Iseconds)] Generator crashed (see $LOG_FILE), restarting with new salt..."
+  prune_logs
+done
diff --git a/scripts/chaos/toxi_apply_config.sh b/scripts/chaos/toxi_apply_config.sh
new file mode 100755
index 0000000000..bacc17b079
--- /dev/null
+++ b/scripts/chaos/toxi_apply_config.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+# toxi_apply_config.sh — populate the sovereign-soak toxiproxy proxies.
+# +# Creates (or replaces) seven proxies via Toxiproxy admin API POST /populate: +# +# postgres_1 :5433 → ${POSTGRES_UPSTREAM} (primary rollup state DB) +# postgres_2 :5434 → ${POSTGRES_UPSTREAM} (replica rollup state DB) +# postgres_3 :5435 → ${POSTGRES_UPSTREAM} (auxiliary slot) +# celestia_rpc_1 :26678 → ${CELESTIA_RPC_UPSTREAM} (primary) +# celestia_rpc_2 :26679 → ${CELESTIA_RPC_UPSTREAM} (replica) +# celestia_grpc_1 :9091 → ${CELESTIA_GRPC_UPSTREAM} (primary) +# celestia_grpc_2 :9092 → ${CELESTIA_GRPC_UPSTREAM} (replica) +# +# Defaults are baremetal (loopback). For docker, override: +# +# LISTEN_ADDR=0.0.0.0 \ +# POSTGRES_UPSTREAM=host.docker.internal:5432 \ +# ./toxi_apply_config.sh +# +# Env vars (all optional): +# TOXIPROXY_HOST admin API host:port default 127.0.0.1:8474 +# LISTEN_ADDR listen interface default 127.0.0.1 +# POSTGRES_UPSTREAM host:port for postgres default 127.0.0.1:5432 +# CELESTIA_RPC_UPSTREAM host:port for cel RPC default da-private.celestia-mocha.com:26658 +# CELESTIA_GRPC_UPSTREAM host:port for cel gRPC default rpc-private.celestia-mocha.com:9090 +# POSTGRES_{1,2,3}_PORT, CELESTIA_RPC_{1,2}_PORT, CELESTIA_GRPC_{1,2}_PORT +# per-proxy listen port overrides (defaults above) + +set -euo pipefail + +TOXIPROXY_HOST="${TOXIPROXY_HOST:-127.0.0.1:8474}" +LISTEN_ADDR="${LISTEN_ADDR:-127.0.0.1}" +POSTGRES_UPSTREAM="${POSTGRES_UPSTREAM:-127.0.0.1:5432}" +CELESTIA_RPC_UPSTREAM="${CELESTIA_RPC_UPSTREAM:-da-private.celestia-mocha.com:26658}" +CELESTIA_GRPC_UPSTREAM="${CELESTIA_GRPC_UPSTREAM:-rpc-private.celestia-mocha.com:9090}" + +POSTGRES_1_PORT="${POSTGRES_1_PORT:-5433}" +POSTGRES_2_PORT="${POSTGRES_2_PORT:-5434}" +POSTGRES_3_PORT="${POSTGRES_3_PORT:-5435}" +CELESTIA_RPC_1_PORT="${CELESTIA_RPC_1_PORT:-26678}" +CELESTIA_RPC_2_PORT="${CELESTIA_RPC_2_PORT:-26679}" +CELESTIA_GRPC_1_PORT="${CELESTIA_GRPC_1_PORT:-9091}" +CELESTIA_GRPC_2_PORT="${CELESTIA_GRPC_2_PORT:-9092}" + +die() { echo "error: $*" >&2; exit 1; } + +for bin in curl jq; do + command -v 
"${bin}" >/dev/null || die "${bin} not found in PATH" +done + +URL="http://${TOXIPROXY_HOST}" + +curl -fsS --connect-timeout 5 "${URL}/proxies" >/dev/null \ + || die "toxiproxy admin API unreachable at ${URL} (start toxiproxy-server first)" + +proxy() { + local name="$1" port="$2" upstream="$3" + jq -n \ + --arg name "${name}" \ + --arg listen "${LISTEN_ADDR}:${port}" \ + --arg upstream "${upstream}" \ + '{name: $name, listen: $listen, upstream: $upstream, enabled: true}' +} + +payload="$(jq -s '.' <( + proxy postgres_1 "${POSTGRES_1_PORT}" "${POSTGRES_UPSTREAM}" + proxy postgres_2 "${POSTGRES_2_PORT}" "${POSTGRES_UPSTREAM}" + proxy postgres_3 "${POSTGRES_3_PORT}" "${POSTGRES_UPSTREAM}" + proxy celestia_rpc_1 "${CELESTIA_RPC_1_PORT}" "${CELESTIA_RPC_UPSTREAM}" + proxy celestia_rpc_2 "${CELESTIA_RPC_2_PORT}" "${CELESTIA_RPC_UPSTREAM}" + proxy celestia_grpc_1 "${CELESTIA_GRPC_1_PORT}" "${CELESTIA_GRPC_UPSTREAM}" + proxy celestia_grpc_2 "${CELESTIA_GRPC_2_PORT}" "${CELESTIA_GRPC_UPSTREAM}" +))" + +echo "Populating toxiproxy at ${URL} ..." +curl -fsS -H 'Content-Type: application/json' -X POST \ + -d "${payload}" "${URL}/populate" >/dev/null + +echo "Done. Current proxies:" +curl -fsS "${URL}/proxies" \ + | jq -r 'to_entries | sort_by(.key)[] | " \(.key)\t\(.value.listen)\t→ \(.value.upstream)"' diff --git a/scripts/chaos/toxi_scenario.sh b/scripts/chaos/toxi_scenario.sh new file mode 100755 index 0000000000..d92c002cb0 --- /dev/null +++ b/scripts/chaos/toxi_scenario.sh @@ -0,0 +1,261 @@ +#!/usr/bin/env bash +# toxi_scenario.sh — toxic + scenario CLI for the sovereign-soak toxiproxy setup. 
+#
+# Targets:
+#   primary   → *_1 proxies (default)
+#   replica   → *_2 proxies
+#   both      → *_1 and *_2
+#   tertiary  → postgres_3 (postgres class only — no celestia _3 proxies exist)
+#   all       → every proxy (clear only)
+#
+# Usage:
+#   toxi_scenario.sh list
+#   toxi_scenario.sh clear [primary|replica|both|tertiary|all]         # default: all
+#   toxi_scenario.sh toxic <name> [primary|replica|both|tertiary]      # default: primary
+#   toxi_scenario.sh scenario <id> [primary|replica|both|tertiary]     # default: primary
+#
+# tertiary is only valid for pg-* toxics and pg-only scenarios (P1/P3/P4).
+# Reaching for tertiary on an rpc/grpc class is a hard error.
+#
+# Named toxics:
+#   rpc-latency  celestia_rpc latency 300ms ±200ms, toxicity 1.00
+#   rpc-timeout  celestia_rpc timeout 40000ms, toxicity 0.20
+#   pg-reset     postgres reset_peer 0ms, toxicity 0.10
+#   pg-latency   postgres latency 50ms ±20ms, toxicity 1.00
+#
+# Scenarios (P1..P7 single-rollup; R1..R2 replica-only):
+#   P1 postgres reset_peer 0ms, toxicity 0.10
+#   P2 celestia_rpc timeout 8000ms, toxicity 0.15
+#   P3 postgres timeout 15000ms, toxicity 1.00
+#   P4 postgres latency 100ms ±25ms, toxicity 1.00
+#   P5 celestia_rpc latency 6000ms ±1000ms, toxicity 1.00
+#   P6 celestia_rpc latency 6000ms ±1000ms (1.00) + postgres reset_peer 0ms (0.10)
+#   P7 celestia_grpc reset_peer 0ms, toxicity 0.50
+#   R1 postgres_2 latency 8000ms ±500ms, toxicity 1.00 (target: replica)
+#   R2 postgres_2 reset_peer 0ms, toxicity 0.20 (target: replica)
+#
+# Apply semantics: `scenario` clears the proxies it touches (for the chosen target)
+# before applying, so calling the same scenario twice never stacks toxics. `toxic`
+# adds to the existing toxic set without clearing — run `clear` first for a clean slate.
+
+set -euo pipefail
+
+CLI="${TOXIPROXY_CLI:-toxiproxy-cli}"
+HOST="${TOXIPROXY_HOST:-127.0.0.1:8474}"
+export TOXIPROXY_URL="http://${HOST}"
+
+die() { echo "error: $*" >&2; exit 2; }
+
+for bin in "${CLI}" curl jq; do
+  command -v "${bin}" >/dev/null || die "${bin} not found in PATH"
+done
+
+# ----- proxy name expansion --------------------------------------------------
+
+# class_proxies <class> <target> → prints proxy names, one per line (dies on a bad target)
+class_proxies() {
+  local class="$1" target="$2"
+  case "${target}" in
+    primary) printf '%s_1\n' "${class}" ;;
+    replica) printf '%s_2\n' "${class}" ;;
+    both) printf '%s_1\n%s_2\n' "${class}" "${class}" ;;
+    tertiary)
+      [[ "${class}" == "postgres" ]] \
+        || die "tertiary target only valid for postgres class (got '${class}')"
+      printf 'postgres_3\n'
+      ;;
+    *) die "bad target '${target}' (expected primary|replica|both|tertiary)" ;;
+  esac
+}
+
+# all_proxies_for_target <target> → every proxy for the target, space-separated (used by `clear`)
+all_proxies_for_target() {
+  local target="$1"
+  case "${target}" in
+    primary) echo "postgres_1 celestia_rpc_1 celestia_grpc_1" ;;
+    replica) echo "postgres_2 celestia_rpc_2 celestia_grpc_2" ;;
+    both) echo "postgres_1 celestia_rpc_1 celestia_grpc_1 postgres_2 celestia_rpc_2 celestia_grpc_2" ;;
+    tertiary) echo "postgres_3" ;;
+    all) echo "postgres_1 postgres_2 postgres_3 celestia_rpc_1 celestia_rpc_2 celestia_grpc_1 celestia_grpc_2" ;;
+    *) die "bad target '${target}' (expected primary|replica|both|tertiary|all)" ;;
+  esac
+}
+
+# ----- primitives ------------------------------------------------------------
+
+clear_proxy() {
+  local proxy="$1"
+  # Delete every toxic on <proxy>. Best-effort: if the toxic list cannot be fetched or parsed, return 0 without deleting.
+  local names
+  names="$(curl -fsS "${TOXIPROXY_URL}/proxies/${proxy}/toxics" | jq -r '.[].name')" || return 0
+  local t
+  for t in ${names}; do
+    curl -fsS -X DELETE "${TOXIPROXY_URL}/proxies/${proxy}/toxics/${t}" >/dev/null
+  done
+}
+
+# add_toxic <proxy> <name> <type> <toxicity> [attr=value ...]
+add_toxic() {
+  local proxy="$1" name="$2" type="$3" toxicity="$4"; shift 4
+  local args=(--type "${type}" --toxicName "${name}" --toxicity "${toxicity}")
+  local kv
+  for kv in "$@"; do args+=(--attribute "${kv}"); done
+  "${CLI}" toxic add "${args[@]}" "${proxy}"
+}
+
+# ----- named toxics (applied to all proxies of the given class+target) ------
+
+apply_named_toxic() {
+  local name="$1" target="${2:-primary}"
+  local class
+  case "${name}" in
+    rpc-latency|rpc-timeout) class="celestia_rpc" ;;
+    pg-reset|pg-latency) class="postgres" ;;
+    *) die "unknown toxic '${name}' (expected rpc-latency|rpc-timeout|pg-reset|pg-latency)" ;;
+  esac
+
+  local proxies; proxies="$(class_proxies "${class}" "${target}")"
+  local p
+  for p in ${proxies}; do
+    case "${name}" in
+      rpc-latency) add_toxic "${p}" "${name}" latency 1.0 latency=300 jitter=200 ;;
+      rpc-timeout) add_toxic "${p}" "${name}" timeout 0.2 timeout=40000 ;;
+      pg-reset) add_toxic "${p}" "${name}" reset_peer 0.1 timeout=0 ;;
+      pg-latency) add_toxic "${p}" "${name}" latency 1.0 latency=50 jitter=20 ;;
+    esac
+  done
+}
+
+# ----- scenarios -------------------------------------------------------------
+
+scenario_P1() { # postgres reset_peer 0ms toxicity 0.10
+  local target="$1" p
+  for p in $(class_proxies postgres "${target}"); do
+    clear_proxy "${p}"
+    add_toxic "${p}" "P1" reset_peer 0.10 timeout=0
+  done
+}
+
+scenario_P2() { # celestia_rpc timeout 8000ms toxicity 0.15
+  local target="$1" p
+  for p in $(class_proxies celestia_rpc "${target}"); do
+    clear_proxy "${p}"
+    add_toxic "${p}" "P2" timeout 0.15 timeout=8000
+  done
+}
+
+scenario_P3() { # postgres timeout 15000ms toxicity 1.00
+  local target="$1" p
+  for p in $(class_proxies postgres "${target}"); do
+    clear_proxy "${p}"
+    add_toxic "${p}" "P3" timeout 1.0 timeout=15000
+  done
+}
+
+scenario_P4() { # postgres latency 100ms ±25ms toxicity 1.00
+  local target="$1" p
+  for p in $(class_proxies postgres "${target}"); do
+    clear_proxy "${p}"
+    add_toxic "${p}" "P4" latency 1.0 latency=100 jitter=25
+  done
+}
+
+scenario_P5() { # celestia_rpc latency 6000ms ±1000ms toxicity 1.00
+  local target="$1" p
+  for p in $(class_proxies celestia_rpc "${target}"); do
+    clear_proxy "${p}"
+    add_toxic "${p}" "P5" latency 1.0 latency=6000 jitter=1000
+  done
+}
+
+scenario_P6() { # P5 rpc latency + P1 pg reset
+  local target="$1" p
+  for p in $(class_proxies celestia_rpc "${target}"); do
+    clear_proxy "${p}"
+    add_toxic "${p}" "P6_rpc_latency" latency 1.0 latency=6000 jitter=1000
+  done
+  for p in $(class_proxies postgres "${target}"); do
+    clear_proxy "${p}"
+    add_toxic "${p}" "P6_pg_reset" reset_peer 0.10 timeout=0
+  done
+}
+
+scenario_P7() { # celestia_grpc reset_peer 0ms toxicity 0.50
+  local target="$1" p
+  for p in $(class_proxies celestia_grpc "${target}"); do
+    clear_proxy "${p}"
+    add_toxic "${p}" "P7" reset_peer 0.50 timeout=0
+  done
+}
+
+scenario_R1() { # replica postgres latency 8000ms ±500ms toxicity 1.00
+  clear_proxy postgres_2
+  add_toxic postgres_2 "R1" latency 1.0 latency=8000 jitter=500
+}
+
+scenario_R2() { # replica postgres reset_peer 0ms toxicity 0.20
+  clear_proxy postgres_2
+  add_toxic postgres_2 "R2" reset_peer 0.20 timeout=0
+}
+
+# ----- dispatch --------------------------------------------------------------
+
+cmd_list() {
+  "${CLI}" list
+  echo
+  local p
+  for p in postgres_1 postgres_2 postgres_3 celestia_rpc_1 celestia_rpc_2 celestia_grpc_1 celestia_grpc_2; do
+    echo "---- ${p} ----"
+    "${CLI}" inspect "${p}" || true
+  done
+}
+
+cmd_clear() {
+  local target="${1:-all}" p proxies
+  proxies="$(all_proxies_for_target "${target}")" # own statement: a bad target's die must abort here, not just end a $(...) subshell
+  for p in ${proxies}; do
+    clear_proxy "${p}"
+  done
+  echo "cleared toxics on target: ${target}"
+}
+
+cmd_toxic() {
+  [[ $# -ge 1 ]] || die "usage: toxi_scenario.sh toxic <name> [primary|replica|both|tertiary]"
+  apply_named_toxic "$1" "${2:-primary}"
+  echo "applied toxic '$1' on target: ${2:-primary}"
+}
+
+cmd_scenario() {
+  [[ $# -ge 1 ]] || die "usage: toxi_scenario.sh scenario <id> [primary|replica|both|tertiary]"
+  local id="$1" target="${2:-primary}"
+  case "${id}" in
+    P[1-7])
+      # Validate the target in this shell: die inside the scenarios' `for p in $(class_proxies ...)` only ends the subshell.
+      class_proxies postgres "${target}" >/dev/null
+      [[ "${target}" != "tertiary" || "${id}" == P[134] ]] \
+        || die "scenario ${id} touches a celestia class — tertiary is postgres-only"
+      "scenario_${id}" "${target}" ;;
+    R1|R2)
+      [[ -z "${2:-}" || "${2}" == "replica" ]] \
+        || die "scenario ${id} is replica-only; do not pass '${2}'"
+      target="replica" # R* scenarios act on postgres_2 only; report the target actually touched
+      "scenario_${id}" ;;
+    *) die "unknown scenario '${id}' (expected P1..P7 or R1..R2)" ;;
+  esac
+  echo "applied scenario ${id} on target: ${target}"
+}
+
+main() {
+  [[ $# -ge 1 ]] || { awk 'NR>1 && /^[^#]/{exit} NR>1{print}' "$0"; exit 1; }
+  local cmd="$1"; shift
+  case "${cmd}" in
+    list) cmd_list "$@" ;;
+    clear) cmd_clear "$@" ;;
+    toxic) cmd_toxic "$@" ;;
+    scenario) cmd_scenario "$@" ;;
+    -h|--help|help) awk 'NR>1 && /^[^#]/{exit} NR>1{print}' "$0" ;;
+    *) die "unknown command '${cmd}'" ;;
+  esac
+}
+
+main "$@"