diff --git a/scripts/lib/zulip_tools.py b/scripts/lib/zulip_tools.py index 292eaa4c2e..3a2dae971f 100755 --- a/scripts/lib/zulip_tools.py +++ b/scripts/lib/zulip_tools.py @@ -696,6 +696,11 @@ def start_arg_parser(action: str, add_help: bool = False) -> argparse.ArgumentPa action="store_true", help=f"Only {action} Django (not Tornado or workers)", ) + which_services.add_argument( + "--tornado-reshard", + action="store_true", + help="Restart changed Tornado shards", + ) if action == "restart": parser.add_argument( "--less-graceful", diff --git a/scripts/refresh-sharding-and-restart b/scripts/refresh-sharding-and-restart index 26c7e9af80..4d29c706a7 100755 --- a/scripts/refresh-sharding-and-restart +++ b/scripts/refresh-sharding-and-restart @@ -12,9 +12,6 @@ if ! [ -e /etc/zulip/nginx_sharding_map.conf.tmp ] || ! [ -e /etc/zulip/sharding exit 1 fi -mv /etc/zulip/nginx_sharding_map.conf.tmp /etc/zulip/nginx_sharding_map.conf -mv /etc/zulip/sharding.json.tmp /etc/zulip/sharding.json - # In the ordering of operations below, the crucial detail is that # Django, Tornado, and workers need to be restarted before reloading # nginx. Django and Tornado have in-memory maps of which realm belongs @@ -28,5 +25,5 @@ mv /etc/zulip/sharding.json.tmp /etc/zulip/sharding.json # clients getting into reload loops ending in crashing on 500 response # while Django is restarting. For this reason it's important to # reload nginx only after Django and Tornado. -"$(dirname "$0")/restart-server" --skip-client-reloads +"$(dirname "$0")/restart-server" --skip-client-reloads --tornado-reshard service nginx reload diff --git a/scripts/restart-server b/scripts/restart-server index 45760e0763..f36dce046b 100755 --- a/scripts/restart-server +++ b/scripts/restart-server @@ -1,5 +1,6 @@ #!/usr/bin/env python3 import contextlib +import json import logging import os import pwd @@ -51,11 +52,18 @@ elif username != "zulip": logging.error("Must be run as user 'zulip'.") sys.exit(1) - if os.environ.get("RUNNING_UNDER_CRON") and os.path.exists(LOCK_DIR): logging.info("Skipping cron-triggered restart during deploy.") sys.exit(1) +if args.tornado_reshard: + tornado_reshard_files = ["/etc/zulip/sharding.json", "/etc/zulip/nginx_sharding_map.conf"] + if not all(os.path.exists(f"{path}.tmp") for path in tornado_reshard_files): + logging.info( + "No resharding changes to apply! Edit zulip.conf and run refresh-sharding-and-restart" + ) + sys.exit(1) + if not args.skip_checks: logging.info("Running syntax and database checks") subprocess.check_call(["./manage.py", "check", "--database", "default"]) @@ -174,6 +182,36 @@ def restart_or_start(service: str) -> None: subprocess.check_call(["supervisorctl", our_verb, service]) +def ports_as_set(val: int | list[int]) -> frozenset[int]: + return frozenset(val) if isinstance(val, list) else frozenset([val]) + + +def update_tornado_sharding() -> list[int]: + with open("/etc/zulip/sharding.json") as old_shard_fh: + old_sharding = json.load(old_shard_fh) + with open("/etc/zulip/sharding.json.tmp") as new_shard_fh: + new_sharding = json.load(new_shard_fh) + affected_tornados: set[int] = set() + for realm in set().union(old_sharding["shard_map"], new_sharding["shard_map"]): + old_ports = ports_as_set(old_sharding["shard_map"].get(realm, [])) + new_ports = ports_as_set(new_sharding["shard_map"].get(realm, [])) + affected_tornados |= old_ports ^ new_ports + old_regex_set = { + (regex, ports_as_set(ports)) for (regex, ports) in old_sharding["shard_regexes"] + } + new_regex_set = { + (regex, ports_as_set(ports)) for (regex, ports) in new_sharding["shard_regexes"] + } + for regex, ports in old_regex_set ^ new_regex_set: + affected_tornados |= ports + + tornado_reshard_files = ["/etc/zulip/sharding.json", "/etc/zulip/nginx_sharding_map.conf"] + for path in tornado_reshard_files: + os.rename(f"{path}.tmp", path) + + return list(affected_tornados) + + if action == "restart" and len(workers) > 0: if args.less_graceful: # The less graceful form stops every worker now; we start them @@ -190,6 +228,11 @@ if action == "restart" and len(workers) > 0: restart_or_start(worker) if has_application_server(): + if args.tornado_reshard: + affected_tornado_ports = update_tornado_sharding() + logging.info("Tornado ports affected by this resharding: %s", affected_tornado_ports) + else: + affected_tornado_ports = tornado_ports if not args.only_django: # Next, we restart the Tornado processes sequentially, in order to # minimize downtime of the tornado service caused by too many @@ -200,7 +243,7 @@ if has_application_server(): # this before restarting Django, in case there are new event types # which it will need to know how to deal with. if len(tornado_ports) > 1: - for p in tornado_ports: + for p in affected_tornado_ports: # Restart Tornado processes individually for a better rate of # restarts. This also avoids behavior with restarting a whole # supervisord group where if any individual process is slow to