util: Extract helper function for backoff of retries on error response.

Extracts the message_fetch and get_events algorithms for calculating
exponential backoff into a shared helper function.

These match the algorithm that was designed for the Python API, except
that we use a ratio of 2 rather than sqrt(2) in the message_fetch code
path.
Author: Lauryn Menard
Date: 2025-01-29 21:00:36 +01:00
Committed by: Tim Abbott
Parent: 79224b0149
Commit: 078ce89380
4 changed files with 84 additions and 40 deletions
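For reference, the standalone snippet below (not part of the commit) illustrates the nominal delay ranges produced by the two backoff scales in the new util.get_retry_backoff_seconds helper, ignoring the retry-after header. The half jitter keeps each delay between half the scale and the full scale.

// Illustration only: nominal half-jitter delay ranges for the two scales
// used by get_retry_backoff_seconds (ratio 2 for message_fetch, ratio
// sqrt(2) for get_events), before the retry-after header is considered.
for (const attempt of [1, 2, 3, 4, 5, 100]) {
    const tight = Math.min(2 ** attempt, 32); // message_fetch: capped at 32s
    const loose = Math.min(2 ** ((attempt + 1) / 2), 90); // get_events: capped at 90s
    console.log(
        `attempt ${attempt}: message_fetch ${tight / 2}-${tight}s, get_events ${loose / 2}-${loose}s`,
    );
}

Running it shows message_fetch reaching its 16-32s cap by attempt 5, while get_events climbs more slowly toward its 45-90s cap.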


@@ -28,6 +28,7 @@ import {narrow_term_schema} from "./state_data.ts";
 import * as stream_data from "./stream_data.ts";
 import * as stream_list from "./stream_list.ts";
 import * as ui_report from "./ui_report.ts";
+import * as util from "./util.ts";
 
 const response_schema = z.object({
     anchor: z.number(),
@@ -444,28 +445,7 @@ export function load_messages(opts: MessageFetchOptions, attempt = 1): void {
             ui_report.show_error($("#connection-error"));
 
-            // We need to respect the server's rate-limiting headers, but beyond
-            // that, we also want to avoid contributing to a thundering herd if
-            // the server is giving us 500s/502s.
-            //
-            // So we do the maximum of the retry-after header and an exponential
-            // backoff with ratio 2 and half jitter. Starts at 1-2s and ends at
-            // 16-32s after 5 failures.
-            const backoff_scale = Math.min(2 ** attempt, 32);
-            const backoff_delay_secs = ((1 + Math.random()) / 2) * backoff_scale;
-            let rate_limit_delay_secs = 0;
-            const rate_limited_error_schema = z.object({
-                "retry-after": z.number(),
-                code: z.literal("RATE_LIMIT_HIT"),
-            });
-            const parsed = rate_limited_error_schema.safeParse(xhr.responseJSON);
-            if (xhr.status === 429 && parsed?.success && parsed?.data) {
-                // Add a bit of jitter to the required delay suggested by the
-                // server, because we may be racing with other copies of the web
-                // app.
-                rate_limit_delay_secs = parsed.data["retry-after"] + Math.random() * 0.5;
-            }
-            const delay_secs = Math.max(backoff_delay_secs, rate_limit_delay_secs);
+            const delay_secs = util.get_retry_backoff_seconds(xhr, attempt, true);
             setTimeout(() => {
                 load_messages(opts, attempt + 1);
             }, delay_secs * 1000);


@@ -13,6 +13,7 @@ import * as sent_messages from "./sent_messages.ts";
 import * as server_events_dispatch from "./server_events_dispatch.js";
 import {server_message_schema} from "./server_message.ts";
 import * as ui_report from "./ui_report.ts";
+import * as util from "./util.ts";
 import * as watchdog from "./watchdog.ts";
 
 // Docs: https://zulip.readthedocs.io/en/latest/subsystems/events-system.html
@@ -243,24 +244,7 @@ function get_events({dont_block = false} = {}) {
                 blueslip.error("Failed to handle get_events error", undefined, error);
             }
 
-            // We need to respect the server's rate-limiting headers, but beyond
-            // that, we also want to avoid contributing to a thundering herd if
-            // the server is giving us 500s/502s.
-            //
-            // So we do the maximum of the retry-after header and an exponential
-            // backoff with ratio sqrt(2) and half jitter. Starts at 1-2s and ends at
-            // 45-90s after enough failures.
-            const backoff_scale = Math.min(2 ** ((get_events_failures + 1) / 2), 90);
-            const backoff_delay_secs = ((1 + Math.random()) / 2) * backoff_scale;
-            let rate_limit_delay_secs = 0;
-            if (xhr.status === 429 && xhr.responseJSON?.code === "RATE_LIMIT_HIT") {
-                // Add a bit of jitter to the required delay suggested
-                // by the server, because we may be racing with other
-                // copies of the web app.
-                rate_limit_delay_secs = xhr.responseJSON["retry-after"] + Math.random() * 0.5;
-            }
-            const retry_delay_secs = Math.max(backoff_delay_secs, rate_limit_delay_secs);
+            const retry_delay_secs = util.get_retry_backoff_seconds(xhr, get_events_failures);
             get_events_timeout = setTimeout(get_events, retry_delay_secs * 1000);
         },
     });


@@ -1,5 +1,6 @@
 import Handlebars from "handlebars/runtime.js";
 import _ from "lodash";
+import {z} from "zod";
 
 import * as blueslip from "./blueslip.ts";
 import {$t} from "./i18n.ts";
@@ -545,3 +546,39 @@ export function get_final_topic_display_name(topic_name: string): string {
     }
     return topic_name;
 }
+
+export function get_retry_backoff_seconds(
+    xhr: JQuery.jqXHR<unknown>,
+    attempts: number,
+    tighter_backoff = false,
+): number {
+    // We need to respect the server's rate-limiting headers, but beyond
+    // that, we also want to avoid contributing to a thundering herd if
+    // the server is giving us 500/502 responses.
+    //
+    // We do the maximum of the retry-after header and an exponential
+    // backoff.
+    let backoff_scale: number;
+    if (tighter_backoff) {
+        // Starts at 1-2s and ends at 16-32s after enough failures.
+        backoff_scale = Math.min(2 ** attempts, 32);
+    } else {
+        // Starts at 1-2s and ends at 45-90s after enough failures.
+        backoff_scale = Math.min(2 ** ((attempts + 1) / 2), 90);
+    }
+    // Add a bit of jitter to the backoff scale.
+    const backoff_delay_secs = ((1 + Math.random()) / 2) * backoff_scale;
+    let rate_limit_delay_secs = 0;
+    const rate_limited_error_schema = z.object({
+        "retry-after": z.number(),
+        code: z.literal("RATE_LIMIT_HIT"),
+    });
+    const parsed = rate_limited_error_schema.safeParse(xhr.responseJSON);
+    if (xhr.status === 429 && parsed?.success && parsed?.data) {
+        // Add a bit of jitter to the required delay suggested by the
+        // server, because we may be racing with other copies of the web
+        // app.
+        rate_limit_delay_secs = parsed.data["retry-after"] + Math.random() * 0.5;
+    }
+    return Math.max(backoff_delay_secs, rate_limit_delay_secs);
+}


@@ -510,3 +510,46 @@ run_test("get_final_topic_display_name", ({override}) => {
     override(realm, "realm_empty_topic_display_name", "random topic name");
     assert.deepEqual(util.get_final_topic_display_name(""), "random topic name");
 });
+
+run_test("get_retry_backoff_seconds", () => {
+    const xhr_500_error = {
+        status: 500,
+    };
+
+    // Shorter backoff scale
+    // First retry should be between 1-2 seconds.
+    let backoff = util.get_retry_backoff_seconds(xhr_500_error, 1, true);
+    assert.ok(backoff >= 1);
+    assert.ok(backoff < 3);
+    // 100th retry should be between 16-32 seconds.
+    backoff = util.get_retry_backoff_seconds(xhr_500_error, 100, true);
+    assert.ok(backoff >= 16);
+    assert.ok(backoff <= 32);
+
+    // Longer backoff scale
+    // First retry should be between 1-2 seconds.
+    backoff = util.get_retry_backoff_seconds(xhr_500_error, 1);
+    assert.ok(backoff >= 1);
+    assert.ok(backoff <= 3);
+    // 100th retry should be between 45-90 seconds.
+    backoff = util.get_retry_backoff_seconds(xhr_500_error, 100);
+    assert.ok(backoff >= 45);
+    assert.ok(backoff <= 90);
+
+    const xhr_rate_limit_error = {
+        status: 429,
+        responseJSON: {
+            code: "RATE_LIMIT_HIT",
+            msg: "API usage exceeded rate limit",
+            result: "error",
+            "retry-after": 28.706807374954224,
+        },
+    };
+    // First retry should be greater than the retry-after value.
+    backoff = util.get_retry_backoff_seconds(xhr_rate_limit_error, 1);
+    assert.ok(backoff >= 28.706807374954224);
+    // 100th retry should be between 45-90 seconds.
+    backoff = util.get_retry_backoff_seconds(xhr_rate_limit_error, 100);
+    assert.ok(backoff >= 45);
+    assert.ok(backoff <= 90);
+});