mirror of
https://github.com/zulip/zulip.git
synced 2025-10-23 04:52:12 +00:00
sharding: Swap new config into place during restart-server.
This allows restart-server, before moving the new config into place, to perform a diff and only restart the affected Tornado ports.
This commit is contained in:
committed by
Tim Abbott
parent
e18b0fcd16
commit
01664a1a10
@@ -696,6 +696,11 @@ def start_arg_parser(action: str, add_help: bool = False) -> argparse.ArgumentPa
|
||||
action="store_true",
|
||||
help=f"Only {action} Django (not Tornado or workers)",
|
||||
)
|
||||
which_services.add_argument(
|
||||
"--tornado-reshard",
|
||||
action="store_true",
|
||||
help="Restart changed Tornado shards",
|
||||
)
|
||||
if action == "restart":
|
||||
parser.add_argument(
|
||||
"--less-graceful",
|
||||
|
@@ -12,9 +12,6 @@ if ! [ -e /etc/zulip/nginx_sharding_map.conf.tmp ] || ! [ -e /etc/zulip/sharding
|
||||
exit 1
|
||||
fi
|
||||
|
||||
mv /etc/zulip/nginx_sharding_map.conf.tmp /etc/zulip/nginx_sharding_map.conf
|
||||
mv /etc/zulip/sharding.json.tmp /etc/zulip/sharding.json
|
||||
|
||||
# In the ordering of operations below, the crucial detail is that
|
||||
# Django, Tornado, and workers need to be restarted before reloading
|
||||
# nginx. Django and Tornado have in-memory maps of which realm belongs
|
||||
@@ -28,5 +25,5 @@ mv /etc/zulip/sharding.json.tmp /etc/zulip/sharding.json
|
||||
# clients getting into reload loops ending in crashing on 500 response
|
||||
# while Django is restarting. For this reason it's important to
|
||||
# reload nginx only after Django and Tornado.
|
||||
"$(dirname "$0")/restart-server" --skip-client-reloads
|
||||
"$(dirname "$0")/restart-server" --skip-client-reloads --tornado-reshard
|
||||
service nginx reload
|
||||
|
@@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
import contextlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import pwd
|
||||
@@ -51,11 +52,18 @@ elif username != "zulip":
|
||||
logging.error("Must be run as user 'zulip'.")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# A restart requested from cron is abandoned when LOCK_DIR exists; per the
# log message below, that situation is treated as a deploy in progress.
if os.environ.get("RUNNING_UNDER_CRON"):
    if os.path.exists(LOCK_DIR):
        logging.info("Skipping cron-triggered restart during deploy.")
        sys.exit(1)
|
||||
|
||||
# --tornado-reshard only makes sense when a pending ".tmp" copy exists for
# every sharding config file; otherwise there is nothing to swap in, so exit.
if args.tornado_reshard:
    tornado_reshard_files = [
        "/etc/zulip/sharding.json",
        "/etc/zulip/nginx_sharding_map.conf",
    ]
    if any(not os.path.exists(path + ".tmp") for path in tornado_reshard_files):
        logging.info(
            "No resharding changes to apply! Edit zulip.conf and run refresh-sharding-and-restart"
        )
        sys.exit(1)
|
||||
|
||||
if not args.skip_checks:
|
||||
logging.info("Running syntax and database checks")
|
||||
subprocess.check_call(["./manage.py", "check", "--database", "default"])
|
||||
@@ -174,6 +182,36 @@ def restart_or_start(service: str) -> None:
|
||||
subprocess.check_call(["supervisorctl", our_verb, service])
|
||||
|
||||
|
||||
def ports_as_set(val: int | list[int]) -> frozenset[int]:
|
||||
return frozenset(val) if isinstance(val, list) else frozenset([val])
|
||||
|
||||
|
||||
def update_tornado_sharding() -> list[int]:
    """Swap the pending sharding config into place and report changed ports.

    Loads the live /etc/zulip/sharding.json and the pending
    /etc/zulip/sharding.json.tmp, works out which Tornado ports have a
    different realm or regex assignment between the two, and then renames
    the ".tmp" versions of sharding.json and nginx_sharding_map.conf over
    the live files.

    Returns:
        The affected Tornado ports in ascending order, so that callers log
        and restart them in a deterministic sequence.

    Raises:
        OSError: if a config file cannot be opened or renamed.
        ValueError: if either JSON file fails to parse.
        KeyError: if "shard_map" or "shard_regexes" is missing from a file.
    """
    with open("/etc/zulip/sharding.json") as old_shard_fh:
        old_sharding = json.load(old_shard_fh)
    with open("/etc/zulip/sharding.json.tmp") as new_shard_fh:
        new_sharding = json.load(new_shard_fh)

    affected_tornados: set[int] = set()

    # Per-realm assignments: a port is affected when it gained or lost a
    # realm, i.e. it is in exactly one of the old and new port sets.
    for realm in set().union(old_sharding["shard_map"], new_sharding["shard_map"]):
        old_ports = ports_as_set(old_sharding["shard_map"].get(realm, []))
        new_ports = ports_as_set(new_sharding["shard_map"].get(realm, []))
        affected_tornados |= old_ports ^ new_ports

    # Regex assignments: each (regex, ports) pair is compared as a unit, so
    # a regex whose ports changed shows up once with its old ports and once
    # with its new ports, and all of those ports are marked as affected.
    old_regex_set = {
        (regex, ports_as_set(ports)) for (regex, ports) in old_sharding["shard_regexes"]
    }
    new_regex_set = {
        (regex, ports_as_set(ports)) for (regex, ports) in new_sharding["shard_regexes"]
    }
    for _regex, ports in old_regex_set ^ new_regex_set:
        affected_tornados |= ports

    # Only after the diff is computed do we move the new config into place.
    tornado_reshard_files = ["/etc/zulip/sharding.json", "/etc/zulip/nginx_sharding_map.conf"]
    for path in tornado_reshard_files:
        os.rename(f"{path}.tmp", path)

    # Set iteration order is arbitrary; sort for a stable restart order.
    return sorted(affected_tornados)
|
||||
|
||||
|
||||
if action == "restart" and len(workers) > 0:
|
||||
if args.less_graceful:
|
||||
# The less graceful form stops every worker now; we start them
|
||||
@@ -190,6 +228,11 @@ if action == "restart" and len(workers) > 0:
|
||||
restart_or_start(worker)
|
||||
|
||||
if has_application_server():
|
||||
if args.tornado_reshard:
|
||||
affected_tornado_ports = update_tornado_sharding()
|
||||
logging.info("Tornado ports affected by this resharding: %s", affected_tornado_ports)
|
||||
else:
|
||||
affected_tornado_ports = tornado_ports
|
||||
if not args.only_django:
|
||||
# Next, we restart the Tornado processes sequentially, in order to
|
||||
# minimize downtime of the tornado service caused by too many
|
||||
@@ -200,7 +243,7 @@ if has_application_server():
|
||||
# this before restarting Django, in case there are new event types
|
||||
# which it will need to know how to deal with.
|
||||
if len(tornado_ports) > 1:
|
||||
for p in tornado_ports:
|
||||
for p in affected_tornado_ports:
|
||||
# Restart Tornado processes individually for a better rate of
|
||||
# restarts. This also avoids behavior with restarting a whole
|
||||
# supervisord group where if any individual process is slow to
|
||||
|
Reference in New Issue
Block a user