sharding: Swap new config into place during restart-server.

This allows restart-server, before moving the new config into place,
to perform a diff and only restart the affected Tornado ports.
This commit is contained in:
Alex Vandiver
2025-10-07 19:16:51 +00:00
committed by Tim Abbott
parent e18b0fcd16
commit 01664a1a10
3 changed files with 51 additions and 6 deletions

View File

@@ -696,6 +696,11 @@ def start_arg_parser(action: str, add_help: bool = False) -> argparse.ArgumentPa
action="store_true",
help=f"Only {action} Django (not Tornado or workers)",
)
which_services.add_argument(
"--tornado-reshard",
action="store_true",
help="Restart changed Tornado shards",
)
if action == "restart":
parser.add_argument(
"--less-graceful",

View File

@@ -12,9 +12,6 @@ if ! [ -e /etc/zulip/nginx_sharding_map.conf.tmp ] || ! [ -e /etc/zulip/sharding
exit 1
fi
mv /etc/zulip/nginx_sharding_map.conf.tmp /etc/zulip/nginx_sharding_map.conf
mv /etc/zulip/sharding.json.tmp /etc/zulip/sharding.json
# In the ordering of operations below, the crucial detail is that
# Django, Tornado, and workers need to be restarted before reloading
# nginx. Django and Tornado have in-memory maps of which realm belongs
@@ -28,5 +25,5 @@ mv /etc/zulip/sharding.json.tmp /etc/zulip/sharding.json
# clients getting into reload loops ending in crashing on 500 response
# while Django is restarting. For this reason it's important to
# reload nginx only after Django and Tornado.
"$(dirname "$0")/restart-server" --skip-client-reloads
"$(dirname "$0")/restart-server" --skip-client-reloads --tornado-reshard
service nginx reload

View File

@@ -1,5 +1,6 @@
#!/usr/bin/env python3
import contextlib
import json
import logging
import os
import pwd
@@ -51,11 +52,18 @@ elif username != "zulip":
logging.error("Must be run as user 'zulip'.")
sys.exit(1)
if os.environ.get("RUNNING_UNDER_CRON") and os.path.exists(LOCK_DIR):
logging.info("Skipping cron-triggered restart during deploy.")
sys.exit(1)
if args.tornado_reshard:
tornado_reshard_files = ["/etc/zulip/sharding.json", "/etc/zulip/nginx_sharding_map.conf"]
if not all(os.path.exists(f"{path}.tmp") for path in tornado_reshard_files):
logging.info(
"No resharding changes to apply! Edit zulip.conf and run refresh-sharding-and-restart"
)
sys.exit(1)
if not args.skip_checks:
logging.info("Running syntax and database checks")
subprocess.check_call(["./manage.py", "check", "--database", "default"])
@@ -174,6 +182,36 @@ def restart_or_start(service: str) -> None:
subprocess.check_call(["supervisorctl", our_verb, service])
def ports_as_set(val: int | list[int]) -> frozenset[int]:
return frozenset(val) if isinstance(val, list) else frozenset([val])
def update_tornado_sharding() -> list[int]:
    """Swap the pending sharding config into place and report affected ports.

    Loads the live /etc/zulip/sharding.json and the pending
    /etc/zulip/sharding.json.tmp, computes which Tornado ports differ
    between them, and then renames the pending sharding.json.tmp and
    nginx_sharding_map.conf.tmp files over their live counterparts.

    Returns:
        The affected Tornado ports in ascending order, so that the
        caller's log output and restart sequence are deterministic.

    Raises:
        OSError: if a sharding file cannot be opened or renamed.
        json.JSONDecodeError: if either sharding file is not valid JSON.
        KeyError: if either file lacks "shard_map" or "shard_regexes".
    """
    with open("/etc/zulip/sharding.json") as old_shard_fh:
        old_sharding = json.load(old_shard_fh)
    with open("/etc/zulip/sharding.json.tmp") as new_shard_fh:
        new_sharding = json.load(new_shard_fh)

    affected_tornados: set[int] = set()

    # Walk every realm named in either map; a realm absent from one side
    # is treated as having no ports on that side.
    for realm in set().union(old_sharding["shard_map"], new_sharding["shard_map"]):
        old_ports = ports_as_set(old_sharding["shard_map"].get(realm, []))
        new_ports = ports_as_set(new_sharding["shard_map"].get(realm, []))
        # Only the ports this realm gained or lost count as affected.
        affected_tornados |= old_ports ^ new_ports

    old_regex_set = {
        (regex, ports_as_set(ports)) for (regex, ports) in old_sharding["shard_regexes"]
    }
    new_regex_set = {
        (regex, ports_as_set(ports)) for (regex, ports) in new_sharding["shard_regexes"]
    }
    # A (regex, ports) entry present on only one side contributes all of
    # its ports.
    for _regex, ports in old_regex_set ^ new_regex_set:
        affected_tornados |= ports

    # Move the new files into place only after the diff has been computed,
    # since the comparison needs the old sharding.json contents.
    tornado_reshard_files = ["/etc/zulip/sharding.json", "/etc/zulip/nginx_sharding_map.conf"]
    for path in tornado_reshard_files:
        os.rename(f"{path}.tmp", path)

    return sorted(affected_tornados)
if action == "restart" and len(workers) > 0:
if args.less_graceful:
# The less graceful form stops every worker now; we start them
@@ -190,6 +228,11 @@ if action == "restart" and len(workers) > 0:
restart_or_start(worker)
if has_application_server():
if args.tornado_reshard:
affected_tornado_ports = update_tornado_sharding()
logging.info("Tornado ports affected by this resharding: %s", affected_tornado_ports)
else:
affected_tornado_ports = tornado_ports
if not args.only_django:
# Next, we restart the Tornado processes sequentially, in order to
# minimize downtime of the tornado service caused by too many
@@ -200,7 +243,7 @@ if has_application_server():
# this before restarting Django, in case there are new event types
# which it will need to know how to deal with.
if len(tornado_ports) > 1:
for p in tornado_ports:
for p in affected_tornado_ports:
# Restart Tornado processes individually for a better rate of
# restarts. This also avoids behavior with restarting a whole
# supervisord group where if any individual process is slow to