-
Notifications
You must be signed in to change notification settings - Fork 859
fix: stop retrying 429s caused by Fabric capacity limits #2539
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -7,6 +7,7 @@ import com.microsoft.azure.synapse.ml.logging.SynapseMLLogging | |||||||
| import org.apache.commons.io.IOUtils | ||||||||
| import org.apache.http.client.config.RequestConfig | ||||||||
| import org.apache.http.client.methods.{CloseableHttpResponse, HttpPost, HttpRequestBase} | ||||||||
| import org.apache.http.entity.BufferedHttpEntity | ||||||||
| import org.apache.http.impl.client.{CloseableHttpClient, HttpClientBuilder} | ||||||||
| import org.apache.http.impl.conn.PoolingHttpClientConnectionManager | ||||||||
| import org.apache.spark.injections.UDFUtils | ||||||||
|
|
@@ -104,17 +105,32 @@ object HandlingUtils extends SparkLogging { | |||||||
| case 201 => true | ||||||||
| case 202 => true | ||||||||
| case 429 => | ||||||||
| Option(response.getFirstHeader("Retry-After")) | ||||||||
| .foreach { h => | ||||||||
| logInfo(s"waiting ${h.getValue} on ${ | ||||||||
| request match { | ||||||||
| case p: HttpPost => p.getURI + " " + | ||||||||
| Try(IOUtils.toString(p.getEntity.getContent, "UTF-8")).getOrElse("") | ||||||||
| case _ => request.getURI | ||||||||
| } | ||||||||
| }") | ||||||||
| } | ||||||||
| false | ||||||||
| // Buffer the response entity so the body can be inspected | ||||||||
| // and still returned to the caller if we don't retry | ||||||||
| if (response.getEntity != null) { | ||||||||
| response.setEntity(new BufferedHttpEntity(response.getEntity)) | ||||||||
| } | ||||||||
| val bodyStr = Option(response.getEntity) | ||||||||
| .flatMap(e => Try(IOUtils.toString(e.getContent, "UTF-8")).toOption) | ||||||||
| .getOrElse("") | ||||||||
|
||||||||
| if (bodyStr.contains("CapacityLimitExceeded")) { | ||||||||
| // Fabric capacity-exceeded 429s are NOT transient rate limits — | ||||||||
| // retrying will not help and causes hangs | ||||||||
| logWarning(s"Capacity limit exceeded (non-retryable 429) on ${request.getURI}: $bodyStr") | ||||||||
|
||||||||
| logWarning(s"Capacity limit exceeded (non-retryable 429) on ${request.getURI}: $bodyStr") | |
| logWarning(s"Capacity limit exceeded (non-retryable 429, code=CapacityLimitExceeded) " + | |
| s"on ${request.getURI}") |
Copilot
AI
Apr 3, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
bodyStr.contains("CapacityLimitExceeded") is a brittle detector and could produce false positives/negatives (e.g., the substring appearing in a message, different JSON shapes, casing). Since the codebase already uses JSON parsing (e.g., spray-json in core), consider parsing the response JSON and checking error.code == "CapacityLimitExceeded" explicitly before deciding not to retry.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -273,6 +273,98 @@ class VerifySendWithRetries extends TestBase { | |
| } | ||
| } | ||
|
|
||
| test("429 with CapacityLimitExceeded body is not retried") { | ||
| val port = getFreePort | ||
| val requestCount = new AtomicInteger(0) | ||
| val capacityBody = | ||
| """{"error":{"code":"CapacityLimitExceeded","message":"Serverless capacity limit exceeded"}}""" | ||
| val server = startServer(port) { exchange => | ||
| val n = requestCount.incrementAndGet() | ||
| if (n == 1) { | ||
| respond(exchange, 429, capacityBody) | ||
| } else { | ||
| respond(exchange, 200, """{"ok":true}""") | ||
| } | ||
| } | ||
| try { | ||
| val client = HttpClients.createDefault() | ||
| val request = new HttpGet(s"http://localhost:$port/test") | ||
| val start = System.currentTimeMillis() | ||
| val response = HandlingUtils.sendWithRetries( | ||
| client, request, Array(100, 100, 100)) | ||
| val elapsed = System.currentTimeMillis() - start | ||
| val code = response.getStatusLine.getStatusCode | ||
| response.close() | ||
| client.close() | ||
|
|
||
| assert(code === 429, "Capacity-exceeded 429 should be returned immediately, not retried") | ||
| assert(requestCount.get() === 1, "Should not retry on CapacityLimitExceeded") | ||
| assert(elapsed < 1000, s"Should return immediately, took ${elapsed}ms") | ||
|
||
| } finally { | ||
| server.stop(0) | ||
| } | ||
| } | ||
|
|
||
| test("429 with CapacityLimitExceeded ignores Retry-After header") { | ||
| val port = getFreePort | ||
| val requestCount = new AtomicInteger(0) | ||
| val capacityBody = | ||
| """{"error":{"code":"CapacityLimitExceeded","message":"Serverless capacity limit exceeded"}}""" | ||
| val server = startServer(port) { exchange => | ||
| val n = requestCount.incrementAndGet() | ||
| if (n == 1) { | ||
| respond(exchange, 429, capacityBody, headers = Map("Retry-After" -> "5")) | ||
| } else { | ||
| respond(exchange, 200, """{"ok":true}""") | ||
| } | ||
| } | ||
| try { | ||
| val client = HttpClients.createDefault() | ||
| val request = new HttpGet(s"http://localhost:$port/test") | ||
| val start = System.currentTimeMillis() | ||
| val response = HandlingUtils.sendWithRetries( | ||
| client, request, Array(100, 100, 100)) | ||
| val elapsed = System.currentTimeMillis() - start | ||
| val code = response.getStatusLine.getStatusCode | ||
| response.close() | ||
| client.close() | ||
|
|
||
| assert(code === 429, "Capacity-exceeded should not retry even with Retry-After") | ||
| assert(requestCount.get() === 1, "Should not retry on CapacityLimitExceeded") | ||
| assert(elapsed < 1000, s"Should ignore Retry-After and return immediately, took ${elapsed}ms") | ||
| } finally { | ||
| server.stop(0) | ||
| } | ||
| } | ||
|
|
||
| test("429 with non-capacity error body still retries normally") { | ||
| val port = getFreePort | ||
| val requestCount = new AtomicInteger(0) | ||
| val rateLimitBody = """{"error":{"code":"RateLimitExceeded","message":"Too many requests"}}""" | ||
| val server = startServer(port) { exchange => | ||
| val n = requestCount.incrementAndGet() | ||
| if (n <= 2) { | ||
| respond(exchange, 429, rateLimitBody) | ||
| } else { | ||
| respond(exchange, 200, """{"ok":true}""") | ||
| } | ||
| } | ||
| try { | ||
| val client = HttpClients.createDefault() | ||
| val request = new HttpGet(s"http://localhost:$port/test") | ||
| val response = HandlingUtils.sendWithRetries( | ||
| client, request, Array(100, 100, 100)) | ||
| val code = response.getStatusLine.getStatusCode | ||
| response.close() | ||
| client.close() | ||
|
|
||
| assert(code === 200, "Non-capacity 429 should still retry and eventually succeed") | ||
| assert(requestCount.get() === 3, "Should have retried past the rate-limit 429s") | ||
| } finally { | ||
| server.stop(0) | ||
| } | ||
| } | ||
|
|
||
| test("429 with Retry-After 0 means retry immediately") { | ||
| val port = getFreePort | ||
| val requestCount = new AtomicInteger(0) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`BufferedHttpEntity` will buffer the entire 429 response payload in memory. If a service returns a large body (or misbehaves), this can create avoidable memory pressure during retries. Consider guarding this with a size cap (e.g., based on `Content-Length`) or reading only a bounded prefix sufficient to detect the error code, and skip buffering/inspection when the entity is too large.