mirror of
https://github.com/zulip/zulip.git
synced 2025-11-13 18:36:36 +00:00
util: Extract helper function for backoff of retries on error response.
Extracts the message_fetch and get_events algorithms for calculating the exponential backoff into a shared helper function. These match the algorithms that were designed for the Python API, except that we use a ratio of 2 rather than sqrt(2) in the message_fetch code path.
This commit is contained in:
committed by
Tim Abbott
parent
79224b0149
commit
078ce89380
@@ -28,6 +28,7 @@ import {narrow_term_schema} from "./state_data.ts";
|
|||||||
import * as stream_data from "./stream_data.ts";
|
import * as stream_data from "./stream_data.ts";
|
||||||
import * as stream_list from "./stream_list.ts";
|
import * as stream_list from "./stream_list.ts";
|
||||||
import * as ui_report from "./ui_report.ts";
|
import * as ui_report from "./ui_report.ts";
|
||||||
|
import * as util from "./util.ts";
|
||||||
|
|
||||||
const response_schema = z.object({
|
const response_schema = z.object({
|
||||||
anchor: z.number(),
|
anchor: z.number(),
|
||||||
@@ -444,28 +445,7 @@ export function load_messages(opts: MessageFetchOptions, attempt = 1): void {
|
|||||||
|
|
||||||
ui_report.show_error($("#connection-error"));
|
ui_report.show_error($("#connection-error"));
|
||||||
|
|
||||||
// We need to respect the server's rate-limiting headers, but beyond
|
const delay_secs = util.get_retry_backoff_seconds(xhr, attempt, true);
|
||||||
// that, we also want to avoid contributing to a thundering herd if
|
|
||||||
// the server is giving us 500s/502s.
|
|
||||||
//
|
|
||||||
// So we do the maximum of the retry-after header and an exponential
|
|
||||||
// backoff with ratio 2 and half jitter. Starts at 1-2s and ends at
|
|
||||||
// 16-32s after 5 failures.
|
|
||||||
const backoff_scale = Math.min(2 ** attempt, 32);
|
|
||||||
const backoff_delay_secs = ((1 + Math.random()) / 2) * backoff_scale;
|
|
||||||
let rate_limit_delay_secs = 0;
|
|
||||||
const rate_limited_error_schema = z.object({
|
|
||||||
"retry-after": z.number(),
|
|
||||||
code: z.literal("RATE_LIMIT_HIT"),
|
|
||||||
});
|
|
||||||
const parsed = rate_limited_error_schema.safeParse(xhr.responseJSON);
|
|
||||||
if (xhr.status === 429 && parsed?.success && parsed?.data) {
|
|
||||||
// Add a bit of jitter to the required delay suggested by the
|
|
||||||
// server, because we may be racing with other copies of the web
|
|
||||||
// app.
|
|
||||||
rate_limit_delay_secs = parsed.data["retry-after"] + Math.random() * 0.5;
|
|
||||||
}
|
|
||||||
const delay_secs = Math.max(backoff_delay_secs, rate_limit_delay_secs);
|
|
||||||
setTimeout(() => {
|
setTimeout(() => {
|
||||||
load_messages(opts, attempt + 1);
|
load_messages(opts, attempt + 1);
|
||||||
}, delay_secs * 1000);
|
}, delay_secs * 1000);
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ import * as sent_messages from "./sent_messages.ts";
|
|||||||
import * as server_events_dispatch from "./server_events_dispatch.js";
|
import * as server_events_dispatch from "./server_events_dispatch.js";
|
||||||
import {server_message_schema} from "./server_message.ts";
|
import {server_message_schema} from "./server_message.ts";
|
||||||
import * as ui_report from "./ui_report.ts";
|
import * as ui_report from "./ui_report.ts";
|
||||||
|
import * as util from "./util.ts";
|
||||||
import * as watchdog from "./watchdog.ts";
|
import * as watchdog from "./watchdog.ts";
|
||||||
|
|
||||||
// Docs: https://zulip.readthedocs.io/en/latest/subsystems/events-system.html
|
// Docs: https://zulip.readthedocs.io/en/latest/subsystems/events-system.html
|
||||||
@@ -243,24 +244,7 @@ function get_events({dont_block = false} = {}) {
|
|||||||
blueslip.error("Failed to handle get_events error", undefined, error);
|
blueslip.error("Failed to handle get_events error", undefined, error);
|
||||||
}
|
}
|
||||||
|
|
||||||
// We need to respect the server's rate-limiting headers, but beyond
|
const retry_delay_secs = util.get_retry_backoff_seconds(xhr, get_events_failures);
|
||||||
// that, we also want to avoid contributing to a thundering herd if
|
|
||||||
// the server is giving us 500s/502s.
|
|
||||||
//
|
|
||||||
// So we do the maximum of the retry-after header and an exponential
|
|
||||||
// backoff with ratio sqrt(2) and half jitter. Starts at 1-2s and ends at
|
|
||||||
// 45-90s after enough failures.
|
|
||||||
const backoff_scale = Math.min(2 ** ((get_events_failures + 1) / 2), 90);
|
|
||||||
const backoff_delay_secs = ((1 + Math.random()) / 2) * backoff_scale;
|
|
||||||
let rate_limit_delay_secs = 0;
|
|
||||||
if (xhr.status === 429 && xhr.responseJSON?.code === "RATE_LIMIT_HIT") {
|
|
||||||
// Add a bit of jitter to the required delay suggested
|
|
||||||
// by the server, because we may be racing with other
|
|
||||||
// copies of the web app.
|
|
||||||
rate_limit_delay_secs = xhr.responseJSON["retry-after"] + Math.random() * 0.5;
|
|
||||||
}
|
|
||||||
|
|
||||||
const retry_delay_secs = Math.max(backoff_delay_secs, rate_limit_delay_secs);
|
|
||||||
get_events_timeout = setTimeout(get_events, retry_delay_secs * 1000);
|
get_events_timeout = setTimeout(get_events, retry_delay_secs * 1000);
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
import Handlebars from "handlebars/runtime.js";
|
import Handlebars from "handlebars/runtime.js";
|
||||||
import _ from "lodash";
|
import _ from "lodash";
|
||||||
|
import {z} from "zod";
|
||||||
|
|
||||||
import * as blueslip from "./blueslip.ts";
|
import * as blueslip from "./blueslip.ts";
|
||||||
import {$t} from "./i18n.ts";
|
import {$t} from "./i18n.ts";
|
||||||
@@ -545,3 +546,39 @@ export function get_final_topic_display_name(topic_name: string): string {
|
|||||||
}
|
}
|
||||||
return topic_name;
|
return topic_name;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export function get_retry_backoff_seconds(
|
||||||
|
xhr: JQuery.jqXHR<unknown>,
|
||||||
|
attempts: number,
|
||||||
|
tighter_backoff = false,
|
||||||
|
): number {
|
||||||
|
// We need to respect the server's rate-limiting headers, but beyond
|
||||||
|
// that, we also want to avoid contributing to a thundering herd if
|
||||||
|
// the server is giving us 500/502 responses.
|
||||||
|
//
|
||||||
|
// We do the maximum of the retry-after header and an exponential
|
||||||
|
// backoff.
|
||||||
|
let backoff_scale: number;
|
||||||
|
if (tighter_backoff) {
|
||||||
|
// Starts at 1-2s and ends at 16-32s after enough failures.
|
||||||
|
backoff_scale = Math.min(2 ** attempts, 32);
|
||||||
|
} else {
|
||||||
|
// Starts at 1-2s and ends at 45-90s after enough failures.
|
||||||
|
backoff_scale = Math.min(2 ** ((attempts + 1) / 2), 90);
|
||||||
|
}
|
||||||
|
// Add a bit jitter to backoff scale.
|
||||||
|
const backoff_delay_secs = ((1 + Math.random()) / 2) * backoff_scale;
|
||||||
|
let rate_limit_delay_secs = 0;
|
||||||
|
const rate_limited_error_schema = z.object({
|
||||||
|
"retry-after": z.number(),
|
||||||
|
code: z.literal("RATE_LIMIT_HIT"),
|
||||||
|
});
|
||||||
|
const parsed = rate_limited_error_schema.safeParse(xhr.responseJSON);
|
||||||
|
if (xhr.status === 429 && parsed?.success && parsed?.data) {
|
||||||
|
// Add a bit of jitter to the required delay suggested by the
|
||||||
|
// server, because we may be racing with other copies of the web
|
||||||
|
// app.
|
||||||
|
rate_limit_delay_secs = parsed.data["retry-after"] + Math.random() * 0.5;
|
||||||
|
}
|
||||||
|
return Math.max(backoff_delay_secs, rate_limit_delay_secs);
|
||||||
|
}
|
||||||
|
|||||||
@@ -510,3 +510,46 @@ run_test("get_final_topic_display_name", ({override}) => {
|
|||||||
override(realm, "realm_empty_topic_display_name", "random topic name");
|
override(realm, "realm_empty_topic_display_name", "random topic name");
|
||||||
assert.deepEqual(util.get_final_topic_display_name(""), "random topic name");
|
assert.deepEqual(util.get_final_topic_display_name(""), "random topic name");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
run_test("get_retry_backoff_seconds", () => {
    // A plain server error carries no rate-limiting payload, so only the
    // exponential-backoff component determines the delay.
    const xhr_500_error = {
        status: 500,
    };

    // Tighter backoff curve (ratio 2, capped at 32s).
    // Retry 1 should land in roughly the 1-2 second range.
    let backoff = util.get_retry_backoff_seconds(xhr_500_error, 1, true);
    assert.ok(backoff >= 1);
    assert.ok(backoff < 3);
    // After many failures the delay saturates at 16-32 seconds.
    backoff = util.get_retry_backoff_seconds(xhr_500_error, 100, true);
    assert.ok(backoff >= 16);
    assert.ok(backoff <= 32);

    // Default backoff curve (ratio sqrt(2), capped at 90s).
    // Retry 1 should land in roughly the 1-2 second range.
    backoff = util.get_retry_backoff_seconds(xhr_500_error, 1);
    assert.ok(backoff >= 1);
    assert.ok(backoff <= 3);
    // After many failures the delay saturates at 45-90 seconds.
    backoff = util.get_retry_backoff_seconds(xhr_500_error, 100);
    assert.ok(backoff >= 45);
    assert.ok(backoff <= 90);

    // A 429 response with a RATE_LIMIT_HIT payload supplies a server-side
    // minimum delay via the retry-after field.
    const xhr_rate_limit_error = {
        status: 429,
        responseJSON: {
            code: "RATE_LIMIT_HIT",
            msg: "API usage exceeded rate limit",
            result: "error",
            "retry-after": 28.706807374954224,
        },
    };
    // On retry 1 the server's retry-after dominates the backoff component.
    backoff = util.get_retry_backoff_seconds(xhr_rate_limit_error, 1);
    assert.ok(backoff >= 28.706807374954224);
    // After many failures the exponential component dominates: 45-90 seconds.
    backoff = util.get_retry_backoff_seconds(xhr_rate_limit_error, 100);
    assert.ok(backoff >= 45);
    assert.ok(backoff <= 90);
});
|
||||||
|
|||||||
Reference in New Issue
Block a user