mirror of
https://github.com/zulip/zulip.git
synced 2025-11-13 18:36:36 +00:00
util: Extract helper function for backoff of retries on error response.
Extracts the message_fetch and get_events algorithms for calculating the exponential backoff into a shared helper function. These match the algorithms that were designed for the Python API, except that we use a ratio of 2 rather than sqrt(2) in the message_fetch code path.
This commit is contained in:
committed by
Tim Abbott
parent
79224b0149
commit
078ce89380
@@ -28,6 +28,7 @@ import {narrow_term_schema} from "./state_data.ts";
|
|||||||
import * as stream_data from "./stream_data.ts";
|
import * as stream_data from "./stream_data.ts";
|
||||||
import * as stream_list from "./stream_list.ts";
|
import * as stream_list from "./stream_list.ts";
|
||||||
import * as ui_report from "./ui_report.ts";
|
import * as ui_report from "./ui_report.ts";
|
||||||
|
import * as util from "./util.ts";
|
||||||
|
|
||||||
const response_schema = z.object({
|
const response_schema = z.object({
|
||||||
anchor: z.number(),
|
anchor: z.number(),
|
||||||
@@ -444,28 +445,7 @@ export function load_messages(opts: MessageFetchOptions, attempt = 1): void {
|
|||||||
|
|
||||||
ui_report.show_error($("#connection-error"));
|
ui_report.show_error($("#connection-error"));
|
||||||
|
|
||||||
// We need to respect the server's rate-limiting headers, but beyond
|
const delay_secs = util.get_retry_backoff_seconds(xhr, attempt, true);
|
||||||
// that, we also want to avoid contributing to a thundering herd if
|
|
||||||
// the server is giving us 500s/502s.
|
|
||||||
//
|
|
||||||
// So we do the maximum of the retry-after header and an exponential
|
|
||||||
// backoff with ratio 2 and half jitter. Starts at 1-2s and ends at
|
|
||||||
// 16-32s after 5 failures.
|
|
||||||
const backoff_scale = Math.min(2 ** attempt, 32);
|
|
||||||
const backoff_delay_secs = ((1 + Math.random()) / 2) * backoff_scale;
|
|
||||||
let rate_limit_delay_secs = 0;
|
|
||||||
const rate_limited_error_schema = z.object({
|
|
||||||
"retry-after": z.number(),
|
|
||||||
code: z.literal("RATE_LIMIT_HIT"),
|
|
||||||
});
|
|
||||||
const parsed = rate_limited_error_schema.safeParse(xhr.responseJSON);
|
|
||||||
if (xhr.status === 429 && parsed?.success && parsed?.data) {
|
|
||||||
// Add a bit of jitter to the required delay suggested by the
|
|
||||||
// server, because we may be racing with other copies of the web
|
|
||||||
// app.
|
|
||||||
rate_limit_delay_secs = parsed.data["retry-after"] + Math.random() * 0.5;
|
|
||||||
}
|
|
||||||
const delay_secs = Math.max(backoff_delay_secs, rate_limit_delay_secs);
|
|
||||||
setTimeout(() => {
|
setTimeout(() => {
|
||||||
load_messages(opts, attempt + 1);
|
load_messages(opts, attempt + 1);
|
||||||
}, delay_secs * 1000);
|
}, delay_secs * 1000);
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ import * as sent_messages from "./sent_messages.ts";
|
|||||||
import * as server_events_dispatch from "./server_events_dispatch.js";
|
import * as server_events_dispatch from "./server_events_dispatch.js";
|
||||||
import {server_message_schema} from "./server_message.ts";
|
import {server_message_schema} from "./server_message.ts";
|
||||||
import * as ui_report from "./ui_report.ts";
|
import * as ui_report from "./ui_report.ts";
|
||||||
|
import * as util from "./util.ts";
|
||||||
import * as watchdog from "./watchdog.ts";
|
import * as watchdog from "./watchdog.ts";
|
||||||
|
|
||||||
// Docs: https://zulip.readthedocs.io/en/latest/subsystems/events-system.html
|
// Docs: https://zulip.readthedocs.io/en/latest/subsystems/events-system.html
|
||||||
@@ -243,24 +244,7 @@ function get_events({dont_block = false} = {}) {
|
|||||||
blueslip.error("Failed to handle get_events error", undefined, error);
|
blueslip.error("Failed to handle get_events error", undefined, error);
|
||||||
}
|
}
|
||||||
|
|
||||||
// We need to respect the server's rate-limiting headers, but beyond
|
const retry_delay_secs = util.get_retry_backoff_seconds(xhr, get_events_failures);
|
||||||
// that, we also want to avoid contributing to a thundering herd if
|
|
||||||
// the server is giving us 500s/502s.
|
|
||||||
//
|
|
||||||
// So we do the maximum of the retry-after header and an exponential
|
|
||||||
// backoff with ratio sqrt(2) and half jitter. Starts at 1-2s and ends at
|
|
||||||
// 45-90s after enough failures.
|
|
||||||
const backoff_scale = Math.min(2 ** ((get_events_failures + 1) / 2), 90);
|
|
||||||
const backoff_delay_secs = ((1 + Math.random()) / 2) * backoff_scale;
|
|
||||||
let rate_limit_delay_secs = 0;
|
|
||||||
if (xhr.status === 429 && xhr.responseJSON?.code === "RATE_LIMIT_HIT") {
|
|
||||||
// Add a bit of jitter to the required delay suggested
|
|
||||||
// by the server, because we may be racing with other
|
|
||||||
// copies of the web app.
|
|
||||||
rate_limit_delay_secs = xhr.responseJSON["retry-after"] + Math.random() * 0.5;
|
|
||||||
}
|
|
||||||
|
|
||||||
const retry_delay_secs = Math.max(backoff_delay_secs, rate_limit_delay_secs);
|
|
||||||
get_events_timeout = setTimeout(get_events, retry_delay_secs * 1000);
|
get_events_timeout = setTimeout(get_events, retry_delay_secs * 1000);
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
import Handlebars from "handlebars/runtime.js";
|
import Handlebars from "handlebars/runtime.js";
|
||||||
import _ from "lodash";
|
import _ from "lodash";
|
||||||
|
import {z} from "zod";
|
||||||
|
|
||||||
import * as blueslip from "./blueslip.ts";
|
import * as blueslip from "./blueslip.ts";
|
||||||
import {$t} from "./i18n.ts";
|
import {$t} from "./i18n.ts";
|
||||||
@@ -545,3 +546,39 @@ export function get_final_topic_display_name(topic_name: string): string {
|
|||||||
}
|
}
|
||||||
return topic_name;
|
return topic_name;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export function get_retry_backoff_seconds(
|
||||||
|
xhr: JQuery.jqXHR<unknown>,
|
||||||
|
attempts: number,
|
||||||
|
tighter_backoff = false,
|
||||||
|
): number {
|
||||||
|
// We need to respect the server's rate-limiting headers, but beyond
|
||||||
|
// that, we also want to avoid contributing to a thundering herd if
|
||||||
|
// the server is giving us 500/502 responses.
|
||||||
|
//
|
||||||
|
// We do the maximum of the retry-after header and an exponential
|
||||||
|
// backoff.
|
||||||
|
let backoff_scale: number;
|
||||||
|
if (tighter_backoff) {
|
||||||
|
// Starts at 1-2s and ends at 16-32s after enough failures.
|
||||||
|
backoff_scale = Math.min(2 ** attempts, 32);
|
||||||
|
} else {
|
||||||
|
// Starts at 1-2s and ends at 45-90s after enough failures.
|
||||||
|
backoff_scale = Math.min(2 ** ((attempts + 1) / 2), 90);
|
||||||
|
}
|
||||||
|
// Add a bit jitter to backoff scale.
|
||||||
|
const backoff_delay_secs = ((1 + Math.random()) / 2) * backoff_scale;
|
||||||
|
let rate_limit_delay_secs = 0;
|
||||||
|
const rate_limited_error_schema = z.object({
|
||||||
|
"retry-after": z.number(),
|
||||||
|
code: z.literal("RATE_LIMIT_HIT"),
|
||||||
|
});
|
||||||
|
const parsed = rate_limited_error_schema.safeParse(xhr.responseJSON);
|
||||||
|
if (xhr.status === 429 && parsed?.success && parsed?.data) {
|
||||||
|
// Add a bit of jitter to the required delay suggested by the
|
||||||
|
// server, because we may be racing with other copies of the web
|
||||||
|
// app.
|
||||||
|
rate_limit_delay_secs = parsed.data["retry-after"] + Math.random() * 0.5;
|
||||||
|
}
|
||||||
|
return Math.max(backoff_delay_secs, rate_limit_delay_secs);
|
||||||
|
}
|
||||||
|
|||||||
@@ -510,3 +510,46 @@ run_test("get_final_topic_display_name", ({override}) => {
|
|||||||
override(realm, "realm_empty_topic_display_name", "random topic name");
|
override(realm, "realm_empty_topic_display_name", "random topic name");
|
||||||
assert.deepEqual(util.get_final_topic_display_name(""), "random topic name");
|
assert.deepEqual(util.get_final_topic_display_name(""), "random topic name");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
run_test("get_retry_backoff_seconds", () => {
    // A plain server error carries no rate-limiting payload, so only the
    // exponential-backoff component determines the delay.
    const xhr_500_error = {
        status: 500,
    };

    // Tighter backoff curve (ratio 2, capped at 32s).
    // Retry 1 should land in roughly the 1-2 second range.
    let backoff = util.get_retry_backoff_seconds(xhr_500_error, 1, true);
    assert.ok(backoff >= 1);
    assert.ok(backoff < 3);
    // After many failures the delay saturates at 16-32 seconds.
    backoff = util.get_retry_backoff_seconds(xhr_500_error, 100, true);
    assert.ok(backoff >= 16);
    assert.ok(backoff <= 32);

    // Default backoff curve (ratio sqrt(2), capped at 90s).
    // Retry 1 should land in roughly the 1-2 second range.
    backoff = util.get_retry_backoff_seconds(xhr_500_error, 1);
    assert.ok(backoff >= 1);
    assert.ok(backoff <= 3);
    // After many failures the delay saturates at 45-90 seconds.
    backoff = util.get_retry_backoff_seconds(xhr_500_error, 100);
    assert.ok(backoff >= 45);
    assert.ok(backoff <= 90);

    // A 429 response with a RATE_LIMIT_HIT payload supplies a server-side
    // minimum delay via the retry-after field.
    const xhr_rate_limit_error = {
        status: 429,
        responseJSON: {
            code: "RATE_LIMIT_HIT",
            msg: "API usage exceeded rate limit",
            result: "error",
            "retry-after": 28.706807374954224,
        },
    };
    // On retry 1 the server's retry-after dominates the backoff component.
    backoff = util.get_retry_backoff_seconds(xhr_rate_limit_error, 1);
    assert.ok(backoff >= 28.706807374954224);
    // After many failures the exponential component dominates: 45-90 seconds.
    backoff = util.get_retry_backoff_seconds(xhr_rate_limit_error, 100);
    assert.ok(backoff >= 45);
    assert.ok(backoff <= 90);
});
|
||||||
|
|||||||
Reference in New Issue
Block a user