util: Extract helper function for backoff of retries on error response.

Extracts the message_fetch and get_events algorithms for calculating
exponential backoff into a shared helper function.

These match the algorithm that was designed for the Python API, except
that we use a ratio of 2 rather than sqrt(2) in the message_fetch code
path.
Author: Lauryn Menard
Date: 2025-01-29 21:00:36 +01:00
Committed by: Tim Abbott
Parent: 79224b0149
Commit: 078ce89380
4 changed files with 84 additions and 40 deletions
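For reference, the standalone snippet below (not part of the commit) illustrates the nominal delay ranges produced by the two backoff scales in the new util.get_retry_backoff_seconds helper, ignoring the retry-after header. The half jitter keeps each delay between half the scale and the full scale.

// Illustration only: nominal half-jitter delay ranges for the two scales
// used by get_retry_backoff_seconds (ratio 2 for message_fetch, ratio
// sqrt(2) for get_events), before the retry-after header is considered.
for (const attempt of [1, 2, 3, 4, 5, 100]) {
    const tight = Math.min(2 ** attempt, 32); // message_fetch: capped at 32s
    const loose = Math.min(2 ** ((attempt + 1) / 2), 90); // get_events: capped at 90s
    console.log(
        `attempt ${attempt}: message_fetch ${tight / 2}-${tight}s, get_events ${loose / 2}-${loose}s`,
    );
}

Running it shows message_fetch reaching its 16-32s cap by attempt 5, while get_events climbs more slowly toward its 45-90s cap.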


@@ -28,6 +28,7 @@ import {narrow_term_schema} from "./state_data.ts";
 import * as stream_data from "./stream_data.ts";
 import * as stream_list from "./stream_list.ts";
 import * as ui_report from "./ui_report.ts";
+import * as util from "./util.ts";
 
 const response_schema = z.object({
     anchor: z.number(),
@@ -444,28 +445,7 @@ export function load_messages(opts: MessageFetchOptions, attempt = 1): void {
             ui_report.show_error($("#connection-error"));
 
-            // We need to respect the server's rate-limiting headers, but beyond
-            // that, we also want to avoid contributing to a thundering herd if
-            // the server is giving us 500s/502s.
-            //
-            // So we do the maximum of the retry-after header and an exponential
-            // backoff with ratio 2 and half jitter. Starts at 1-2s and ends at
-            // 16-32s after 5 failures.
-            const backoff_scale = Math.min(2 ** attempt, 32);
-            const backoff_delay_secs = ((1 + Math.random()) / 2) * backoff_scale;
-            let rate_limit_delay_secs = 0;
-            const rate_limited_error_schema = z.object({
-                "retry-after": z.number(),
-                code: z.literal("RATE_LIMIT_HIT"),
-            });
-            const parsed = rate_limited_error_schema.safeParse(xhr.responseJSON);
-            if (xhr.status === 429 && parsed?.success && parsed?.data) {
-                // Add a bit of jitter to the required delay suggested by the
-                // server, because we may be racing with other copies of the web
-                // app.
-                rate_limit_delay_secs = parsed.data["retry-after"] + Math.random() * 0.5;
-            }
-            const delay_secs = Math.max(backoff_delay_secs, rate_limit_delay_secs);
+            const delay_secs = util.get_retry_backoff_seconds(xhr, attempt, true);
             setTimeout(() => {
                 load_messages(opts, attempt + 1);
             }, delay_secs * 1000);


@@ -13,6 +13,7 @@ import * as sent_messages from "./sent_messages.ts";
 import * as server_events_dispatch from "./server_events_dispatch.js";
 import {server_message_schema} from "./server_message.ts";
 import * as ui_report from "./ui_report.ts";
+import * as util from "./util.ts";
 import * as watchdog from "./watchdog.ts";
 
 // Docs: https://zulip.readthedocs.io/en/latest/subsystems/events-system.html
@@ -243,24 +244,7 @@ function get_events({dont_block = false} = {}) {
                 blueslip.error("Failed to handle get_events error", undefined, error);
             }
 
-            // We need to respect the server's rate-limiting headers, but beyond
-            // that, we also want to avoid contributing to a thundering herd if
-            // the server is giving us 500s/502s.
-            //
-            // So we do the maximum of the retry-after header and an exponential
-            // backoff with ratio sqrt(2) and half jitter. Starts at 1-2s and ends at
-            // 45-90s after enough failures.
-            const backoff_scale = Math.min(2 ** ((get_events_failures + 1) / 2), 90);
-            const backoff_delay_secs = ((1 + Math.random()) / 2) * backoff_scale;
-            let rate_limit_delay_secs = 0;
-            if (xhr.status === 429 && xhr.responseJSON?.code === "RATE_LIMIT_HIT") {
-                // Add a bit of jitter to the required delay suggested
-                // by the server, because we may be racing with other
-                // copies of the web app.
-                rate_limit_delay_secs = xhr.responseJSON["retry-after"] + Math.random() * 0.5;
-            }
-            const retry_delay_secs = Math.max(backoff_delay_secs, rate_limit_delay_secs);
+            const retry_delay_secs = util.get_retry_backoff_seconds(xhr, get_events_failures);
             get_events_timeout = setTimeout(get_events, retry_delay_secs * 1000);
         },
     });


@@ -1,5 +1,6 @@
 import Handlebars from "handlebars/runtime.js";
 import _ from "lodash";
+import {z} from "zod";
 
 import * as blueslip from "./blueslip.ts";
 import {$t} from "./i18n.ts";
@@ -545,3 +546,39 @@ export function get_final_topic_display_name(topic_name: string): string {
     }
     return topic_name;
 }
+
+export function get_retry_backoff_seconds(
+    xhr: JQuery.jqXHR<unknown>,
+    attempts: number,
+    tighter_backoff = false,
+): number {
+    // We need to respect the server's rate-limiting headers, but beyond
+    // that, we also want to avoid contributing to a thundering herd if
+    // the server is giving us 500/502 responses.
+    //
+    // We do the maximum of the retry-after header and an exponential
+    // backoff.
+    let backoff_scale: number;
+    if (tighter_backoff) {
+        // Starts at 1-2s and ends at 16-32s after enough failures.
+        backoff_scale = Math.min(2 ** attempts, 32);
+    } else {
+        // Starts at 1-2s and ends at 45-90s after enough failures.
+        backoff_scale = Math.min(2 ** ((attempts + 1) / 2), 90);
+    }
+    // Add a bit of jitter to the backoff scale.
+    const backoff_delay_secs = ((1 + Math.random()) / 2) * backoff_scale;
+    let rate_limit_delay_secs = 0;
+    const rate_limited_error_schema = z.object({
+        "retry-after": z.number(),
+        code: z.literal("RATE_LIMIT_HIT"),
+    });
+    const parsed = rate_limited_error_schema.safeParse(xhr.responseJSON);
+    if (xhr.status === 429 && parsed?.success && parsed?.data) {
+        // Add a bit of jitter to the required delay suggested by the
+        // server, because we may be racing with other copies of the web
+        // app.
+        rate_limit_delay_secs = parsed.data["retry-after"] + Math.random() * 0.5;
+    }
+    return Math.max(backoff_delay_secs, rate_limit_delay_secs);
+}


@@ -510,3 +510,46 @@ run_test("get_final_topic_display_name", ({override}) => {
     override(realm, "realm_empty_topic_display_name", "random topic name");
     assert.deepEqual(util.get_final_topic_display_name(""), "random topic name");
 });
+
+run_test("get_retry_backoff_seconds", () => {
+    const xhr_500_error = {
+        status: 500,
+    };
+
+    // Shorter backoff scale
+    // First retry should be between 1-2 seconds.
+    let backoff = util.get_retry_backoff_seconds(xhr_500_error, 1, true);
+    assert.ok(backoff >= 1);
+    assert.ok(backoff < 3);
+    // 100th retry should be between 16-32 seconds.
+    backoff = util.get_retry_backoff_seconds(xhr_500_error, 100, true);
+    assert.ok(backoff >= 16);
+    assert.ok(backoff <= 32);
+
+    // Longer backoff scale
+    // First retry should be between 1-2 seconds.
+    backoff = util.get_retry_backoff_seconds(xhr_500_error, 1);
+    assert.ok(backoff >= 1);
+    assert.ok(backoff <= 3);
+    // 100th retry should be between 45-90 seconds.
+    backoff = util.get_retry_backoff_seconds(xhr_500_error, 100);
+    assert.ok(backoff >= 45);
+    assert.ok(backoff <= 90);
+
+    const xhr_rate_limit_error = {
+        status: 429,
+        responseJSON: {
+            code: "RATE_LIMIT_HIT",
+            msg: "API usage exceeded rate limit",
+            result: "error",
+            "retry-after": 28.706807374954224,
+        },
+    };
+    // First retry should be greater than the retry-after value.
+    backoff = util.get_retry_backoff_seconds(xhr_rate_limit_error, 1);
+    assert.ok(backoff >= 28.706807374954224);
+    // 100th retry should be between 45-90 seconds.
+    backoff = util.get_retry_backoff_seconds(xhr_rate_limit_error, 100);
+    assert.ok(backoff >= 45);
+    assert.ok(backoff <= 90);
+});