Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion include/iocore/hostdb/HostDBProcessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -582,7 +582,7 @@ struct ResolveInfo {
bool mark_active_server_up();

/// Select / resolve to the next RR entry for the record.
bool select_next_rr();
bool select_next_rr(ts_time now, ts_seconds fail_window);

bool is_srv() const;
};
Expand Down
4 changes: 3 additions & 1 deletion include/proxy/http/HttpTransact.h
Original file line number Diff line number Diff line change
Expand Up @@ -1034,7 +1034,7 @@ class HttpTransact
static void handle_response_from_parent_plugin(State *s);
static void handle_response_from_server(State *s);
static void delete_server_rr_entry(State *s, int max_retries);
static void retry_server_connection_not_open(State *s, ServerState_t conn_state, unsigned max_retries);
static void retry_server_connection_not_open(State *s, unsigned max_retries);
static void error_log_connection_failure(State *s, ServerState_t conn_state);
static void handle_server_connection_not_open(State *s);
static void handle_forward_server_connection_open(State *s);
Expand Down Expand Up @@ -1078,6 +1078,8 @@ class HttpTransact
static bool handle_trace_and_options_requests(State *s, HTTPHdr *incoming_hdr);
static void bootstrap_state_variables_from_request(State *s, HTTPHdr *incoming_request);

static uint8_t origin_server_connect_attempts_max_retries(State *s);

// WARNING: this function may be called multiple times for the same transaction.
//
static void initialize_state_variables_from_request(State *s, HTTPHdr *obsolete_incoming_request);
Expand Down
14 changes: 10 additions & 4 deletions src/iocore/hostdb/HostDB.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1706,13 +1706,19 @@ ResolveInfo::set_active(HostDBInfo *info)
}

bool
ResolveInfo::select_next_rr()
ResolveInfo::select_next_rr(ts_time now, ts_seconds fail_window)
{
if (active) {
if (auto rr_info{this->record->rr_info()}; rr_info.count() > 1) {
unsigned limit = active - rr_info.data(), idx = (limit + 1) % rr_info.count();
while ((idx = (idx + 1) % rr_info.count()) != limit && !rr_info[idx].is_up()) {}
active = &rr_info[idx];
unsigned limit = active - rr_info.data();
size_t idx = (limit + 1) % rr_info.count();
for (; idx != limit; idx = (idx + 1) % rr_info.count()) {
if (!rr_info[idx].is_down(now, fail_window)) {
active = &rr_info[idx];
break;
}
}

return idx != limit; // if the active record was actually changed.
}
}
Expand Down
4 changes: 4 additions & 0 deletions src/proxy/http/HttpConfig.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1348,6 +1348,10 @@ HttpConfig::reconfigure()
"will never redispatch to another server",
m_master.oride.connect_attempts_rr_retries, params->oride.connect_attempts_max_retries);
}
if (m_master.oride.connect_attempts_rr_retries > 0 && params->oride.connect_attempts_max_retries_down_server == 0) {
Warning("connect_attempts_max_retries_down_server=0 with round-robin enabled skips probing recovering (SUSPECT) origins; "
"set connect_attempts_max_retries_down_server >= 1 is recommended");
Comment on lines +1352 to +1353
Copy link

Copilot AI Apr 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This Warning message is misleading: connect_attempts_max_retries_down_server=0 doesn’t inherently “skip probing” SUSPECT origins; it mainly reduces the retry budget (potentially to just a single probe attempt). Also the phrasing is grammatically off (“set … is recommended”) and it would be clearer to reference the full record name (proxy.config.http.connect_attempts_max_retries_down_server). Please update the message (and/or the logic) so it accurately reflects the actual behavior.

Suggested change
Warning("connect_attempts_max_retries_down_server=0 with round-robin enabled skips probing recovering (SUSPECT) origins; "
"set connect_attempts_max_retries_down_server >= 1 is recommended");
Warning("proxy.config.http.connect_attempts_max_retries_down_server=0 with round-robin enabled leaves no retry budget "
"for down or recovering (SUSPECT) origins beyond the initial attempt; setting "
"proxy.config.http.connect_attempts_max_retries_down_server >= 1 is recommended");

Copilot uses AI. Check for mistakes.
}
params->oride.connect_attempts_retry_backoff_base = m_master.oride.connect_attempts_retry_backoff_base;

params->oride.connect_attempts_rr_retries = m_master.oride.connect_attempts_rr_retries;
Expand Down
67 changes: 32 additions & 35 deletions src/proxy/http/HttpSM.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

#include "proxy/http/HttpConfig.h"
#include "tscore/ink_hrtime.h"
#include "tscore/ink_time.h"
#include "tsutil/Metrics.h"
#include "tsutil/ts_bw_format.h"
#include "proxy/ProxyTransaction.h"
Expand Down Expand Up @@ -4731,9 +4732,12 @@ HttpSM::do_hostdb_update_if_necessary()
t_state.dns_info.active->http_version = t_state.updated_server_version;
}

char addrbuf[INET6_ADDRPORTSTRLEN];
SMDbg(dbg_ctl_http, "update hostdb info: %s", ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf, sizeof(addrbuf)));

// Check to see if we need to report or clear a connection failure
if (track_connect_fail()) {
this->mark_host_failure(&t_state.dns_info, ts_clock::from_time_t(t_state.client_request_time));
this->mark_host_failure(&t_state.dns_info, ts_clock::now());
} else {
if (t_state.dns_info.mark_active_server_up()) {
char addrbuf[INET6_ADDRPORTSTRLEN];
Expand All @@ -4748,8 +4752,6 @@ HttpSM::do_hostdb_update_if_necessary()
}
}

char addrbuf[INET6_ADDRPORTSTRLEN];
SMDbg(dbg_ctl_http, "server info = %s", ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf, sizeof(addrbuf)));
return;
}

Expand Down Expand Up @@ -5521,12 +5523,6 @@ HttpSM::do_http_server_open(bool raw, bool only_direct)
return;
}
}
if (HttpTransact::is_server_negative_cached(&t_state) == true &&
t_state.txn_conf->connect_attempts_max_retries_down_server <= 0) {
SMDbg(dbg_ctl_http_seq, "Not connecting to the server because it is marked down.");
call_transact_and_set_next_state(HttpTransact::OriginDown);
return;
}

// Check for self loop.
if (!_ua.get_txn()->is_outbound_transparent() && HttpTransact::will_this_request_self_loop(&t_state)) {
Expand Down Expand Up @@ -5972,34 +5968,35 @@ HttpSM::do_transform_open()
void
HttpSM::mark_host_failure(ResolveInfo *info, ts_time time_down)
{
char addrbuf[INET6_ADDRPORTSTRLEN];
ink_assert(time_down != TS_TIME_ZERO);

if (info->active) {
if (time_down != TS_TIME_ZERO) {
ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf, sizeof(addrbuf));
// Increment the fail_count
if (auto [down, fail_count] = info->active->increment_fail_count(time_down, t_state.txn_conf->connect_attempts_rr_retries,
t_state.txn_conf->down_server_timeout);
down) {
char *url_str = t_state.hdr_info.client_request.url_string_get_ref(nullptr);
std::string_view host_name{t_state.unmapped_url.host_get()};
swoc::bwprint(error_bw_buffer, "CONNECT : {::s} connecting to {} for host='{}' url='{}' fail_count='{}' marking down",
swoc::bwf::Errno(t_state.current.server->connect_result), t_state.current.server->dst_addr, host_name,
swoc::bwf::FirstOf(url_str, "<none>"), fail_count);
Log::error("%s", error_bw_buffer.c_str());
SMDbg(dbg_ctl_http, "hostdb update marking IP: %s as down", addrbuf);
ATS_PROBE2(hostdb_mark_ip_as_down, sm_id, addrbuf);
} else {
ATS_PROBE3(hostdb_inc_ip_failcount, sm_id, addrbuf, fail_count);
SMDbg(dbg_ctl_http, "hostdb increment IP failcount %s to %d", addrbuf, fail_count);
}
} else { // Clear the failure
info->active->mark_up();
}
if (info->active == nullptr) {
return;
}

char addrbuf[INET6_ADDRPORTSTRLEN];
ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf, sizeof(addrbuf));

uint8_t max_connect_retries = HttpTransact::origin_server_connect_attempts_max_retries(&t_state);
ts_seconds fail_window = t_state.txn_conf->down_server_timeout;

// Mark the host DOWN only after every attempt has failed. `max_connect_retries` counts only "retries", so the total attempt
// budget is `max_connect_retries + 1` (the initial connect plus each retry).
auto [down, fail_count] = info->active->increment_fail_count(time_down, max_connect_retries + 1, fail_window);

Comment on lines +5980 to +5986
Copy link

Copilot AI Apr 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

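max_connect_retries is a uint8_t and is used as max_connect_retries + 1 for the attempt budget. The addition itself is performed in int (integer promotion), so with max_connect_retries == 255 the expression evaluates to 256 rather than wrapping; however, if increment_fail_count() takes its threshold as an 8-bit value (its signature is not visible in this diff), that 256 narrows back to 0 at the call, and max_connect_retries may itself already have wrapped due to narrowing when it was assigned. Either case would make the down-threshold computation incorrect. Consider computing the attempt budget in a wider integer type and saturating/clamping to UINT8_MAX before passing it to increment_fail_count().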

Copilot uses AI. Check for mistakes.
if (down) {
char *url_str = t_state.hdr_info.client_request.url_string_get_ref(nullptr);
std::string_view host_name{t_state.unmapped_url.host_get()};
swoc::bwprint(error_bw_buffer, "CONNECT : {::s} connecting to {} for host='{}' url='{}' fail_count='{}' marking down",
swoc::bwf::Errno(t_state.current.server->connect_result), t_state.current.server->dst_addr, host_name,
swoc::bwf::FirstOf(url_str, "<none>"), fail_count);
Log::error("%s", error_bw_buffer.c_str());
SMDbg(dbg_ctl_http, "hostdb update marking IP: %s as down", addrbuf);
ATS_PROBE2(hostdb_mark_ip_as_down, sm_id, addrbuf);
} else {
ATS_PROBE3(hostdb_inc_ip_failcount, sm_id, addrbuf, fail_count);
SMDbg(dbg_ctl_http, "hostdb increment IP failcount %s to %d", addrbuf, fail_count);
}
#ifdef DEBUG
ink_assert(std::chrono::system_clock::now() + t_state.txn_conf->down_server_timeout > time_down);
#endif
}

void
Expand Down
Loading