#!/usr/bin/env python3
#
# This script contains the actual logic for upgrading from an old
# version of Zulip to the new version. upgrade-zulip-stage-2 is
# always run from the new version of Zulip, so any bug fixes take
# effect on the very next upgrade.
import argparse
import glob
import hashlib
import logging
import os
import re
import subprocess
import sys
import time
from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    from typing import NoReturn

os.environ["PYTHONUNBUFFERED"] = "y"

# Force a known locale. Some packages on PyPI fail to install in some locales.
os.environ.update(
    {
        "LC_ALL": "C.UTF-8",
        "LANG": "C.UTF-8",
        "LANGUAGE": "C.UTF-8",
    }
)

# Make the repository root importable so that `scripts.lib` resolves.
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from scripts.lib.zulip_tools import (
    assert_running_as_root,
    get_config,
    get_config_file,
    listening_publicly,
    parse_os_release,
    run_psql_as_postgres,
    start_arg_parser,
    su_to_zulip,
)

assert_running_as_root()

# Set a known, reliable PATH
os.environ["PATH"] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"

# Log timestamps in UTC, prefixed with the name of this script.
logging.Formatter.converter = time.gmtime
logging.basicConfig(format="%(asctime)s upgrade-zulip-stage-2: %(message)s", level=logging.INFO)


def error_desupported_os(vendor: str, os_version: str) -> "NoReturn":
    """Report that this OS release is no longer supported, then exit.

    Logs the offending vendor/version pair at CRITICAL level, follows it
    with a pointer to the OS upgrade documentation, and terminates the
    process with exit status 1.
    """
    # Link to documentation for how to correctly upgrade the OS.
    upgrade_hint = (
        "Sorry! The support for your OS has been discontinued.\n"
        "Please upgrade your OS to a supported release first.\n"
        "See https://zulip.readthedocs.io/en/latest/production/"
        "upgrade-or-modify.html#upgrading-the-operating-system"
    )
    logging.critical("Unsupported platform: %s %s", vendor, os_version)
    logging.info(upgrade_hint)
    sys.exit(1)


# Do not upgrade on unsupported OS versions.
UNSUPPORTED_DISTROS = [
    ("ubuntu", "14.04"),
    ("ubuntu", "16.04"),
    ("ubuntu", "18.04"),
    ("debian", "9"),
]
distro_info = parse_os_release()
vendor, os_version = distro_info["ID"], distro_info["VERSION_ID"]
if (vendor, os_version) in UNSUPPORTED_DISTROS:
    error_desupported_os(vendor, os_version)

# make sure we have appropriate file permissions
os.umask(0o022)

# The restart-related options come from the shared parser; the
# upgrade-specific options are layered on top of it.
restart_parser = start_arg_parser(action="restart", add_help=False)
parser = argparse.ArgumentParser(parents=[restart_parser])
parser.add_argument("deploy_path", metavar="deploy_path", help="Path to deployment directory")
for switch, description in (
    ("--skip-puppet", "Skip doing puppet/apt upgrades."),
    ("--skip-migrations", "Skip doing migrations."),
    ("--skip-downgrade-check", "Skip the safety check to prevent database downgrades."),
    ("--from-git", "Upgrading from git, so run update-prod-static."),
    ("--ignore-static-assets", "Do not attempt to copy/manage static assets."),
    ("--skip-purge-old-deployments", "Skip purging old deployments."),
    ("--audit-fts-indexes", "Audit and fix full text search indexes."),
):
    parser.add_argument(switch, action="store_true", help=description)
args = parser.parse_args()

# Both restart-tuning flags are only meaningful together with --skip-puppet.
if not args.skip_puppet:
    if args.less_graceful:
        logging.warning(
            "Ignored --less-graceful; all upgrades without --skip-puppet are ungraceful."
        )
    if args.skip_tornado:
        logging.error(
            "Cannot skip tornado restart unless we are skipping puppet!  Omit --skip-tornado, or add --skip-puppet."
        )
        sys.exit(1)

deploy_path = args.deploy_path
os.chdir(deploy_path)

config_file = get_config_file()

IS_SERVER_UP = True

# Check if rabbitmq port 25672 is listening on anything except 127.0.0.1
rabbitmq_dist_listen = listening_publicly(25672)
# Check the erlang magic cookie size
cookie_size: Optional[int] = None
if not os.path.exists("/var/lib/rabbitmq/.erlang.cookie"):
    logging.info("No RabbitMQ erlang cookie found, not auditing RabbitMQ security.")
else:
    with open("/var/lib/rabbitmq/.erlang.cookie") as cookie_fh:
        cookie_size = len(cookie_fh.readline())

if args.skip_puppet and rabbitmq_dist_listen:
    logging.error(
        "RabbitMQ is publicly-accessible on %s; this is a security vulnerability!",
        ", ".join(rabbitmq_dist_listen),
    )
    # See the below comment -- this is used as a lightweight
    # signal for a cookie made with Erlang's bad randomizer.
    insecure_cookie = cookie_size == 20
    if insecure_cookie:
        logging.error(
            "RabbitMQ erlang cookie is insecure; this is a critical security vulnerability!"
        )
    logging.error(
        "To fix the above security %s, re-run the upgrade without --skip-puppet "
        "(which may be set in /etc/zulip/zulip.conf), in order to restart the "
        "necessary services.  Running zulip-puppet-apply by itself is not sufficient.",
        "issues" if insecure_cookie else "issue",
    )
    sys.exit(1)


def shutdown_server() -> None:
    """Stop the Zulip server unless this script has already stopped it.

    Runs ./scripts/stop-server (with su_to_zulip as the pre-exec hook)
    and records the result in the module-level IS_SERVER_UP flag, so
    repeated calls are no-ops.
    """
    global IS_SERVER_UP

    if not IS_SERVER_UP:
        return

    logging.info("Stopping Zulip...")
    subprocess.check_call(["./scripts/stop-server"], preexec_fn=su_to_zulip)
    IS_SERVER_UP = False


# postgresql.version is required for database servers, but wasn't
# previously; fill it in based on what the OS provides.
if os.path.exists("/etc/init.d/postgresql"):
    postgresql_version = get_config(config_file, "postgresql", "version")
    if not postgresql_version:
        # No version recorded yet: use the default for this OS release
        # and persist it to /etc/zulip/zulip.conf.
        default_postgresql_version = {
            ("debian", "10"): "11",
            ("ubuntu", "20.04"): "12",
            ("centos", "7"): "11",
        }
        if (vendor, os_version) in default_postgresql_version:
            postgresql_version = default_postgresql_version[(vendor, os_version)]
        else:
            error_desupported_os(vendor, os_version)
        subprocess.check_call(
            [
                "crudini",
                "--set",
                "/etc/zulip/zulip.conf",
                "postgresql",
                "version",
                postgresql_version,
            ]
        )

    if glob.glob("/usr/share/postgresql/*/extension/tsearch_extras.control"):
        # Remove legacy tsearch_extras package references
        run_psql_as_postgres(
            config_file=config_file,
            sql_query="DROP EXTENSION IF EXISTS tsearch_extras;",
        )
        subprocess.check_call(["apt-get", "remove", "-y", "postgresql-*-tsearch-extras"])

if not args.skip_puppet:
    logging.info("Upgrading system packages...")
    subprocess.check_call(["apt-get", "update"])
    subprocess.check_call(["apt-get", "-y", "--allow-downgrades", "upgrade"])

# To bootstrap zulip-puppet-apply, we need to install the system yaml
# package; new installs get this, but old installs may not have it.
if not os.path.exists("/usr/share/doc/python3-yaml"):
    logging.info("Installing system YAML package, for puppet...")
    # Pass -y like every other apt-get package operation in this script,
    # so the install cannot stop to ask for confirmation when it needs
    # to pull in additional dependencies during an unattended upgrade.
    subprocess.check_call(["apt-get", "install", "-y", "python3-yaml"])

if not os.path.exists(os.path.join(deploy_path, "zproject/prod_settings.py")):
    # This is normally done in unpack-zulip, but for upgrading from
    # zulip<1.4.0, we need to do it.  See discussion in commit 586b23637.
    os.symlink("/etc/zulip/settings.py", os.path.join(deploy_path, "zproject/prod_settings.py"))

# Now we should have an environment set up where we can run our tools;
# first, creating the production venv.
subprocess.check_call(
    [os.path.join(deploy_path, "scripts", "lib", "create-production-venv"), deploy_path]
)

# Check to make sure that this upgrade is not actually a database
# downgrade.
scripts_lib = os.path.join(deploy_path, "scripts", "lib")

if not args.skip_downgrade_check:
    subprocess.check_call(
        [os.path.join(scripts_lib, "check-database-compatibility.py")],
        preexec_fn=su_to_zulip,
    )

# Make sure the right version of node is installed
for installer in ("install-node", "install-yarn"):
    subprocess.check_call([os.path.join(scripts_lib, installer)])

# Generate any new secrets that were added in the new version required.
# TODO: Do caching to only run this when it has changed.
generate_secrets = os.path.join(deploy_path, "scripts", "setup", "generate_secrets.py")
subprocess.check_call([generate_secrets, "--production"])

# Unpleasant migration: Remove any legacy deployed copies of
# images-google-64 from before we renamed that emojiset to
# "googleblob":
emoji_path = "/home/zulip/prod-static/generated/emoji/images-google-64/1f32d.png"
if os.path.exists(emoji_path):
    with open(emoji_path, "rb") as f:
        emoji_sha = hashlib.sha1(f.read()).hexdigest()
    # Only delete the directory when the sample image matches this
    # exact SHA-1 digest.
    if emoji_sha == "47033121dc20b376e0f86f4916969872ad22a293":
        import shutil

        shutil.rmtree("/home/zulip/prod-static/generated/emoji/images-google-64")

# And then, building/installing the static assets.
#
# For the OS version upgrade use case (--ignore-static-assets), the
# static assets are already in place, and we don't need to do
# anything.  Further, neither of the options below will work for all
# installations, because if we installed from Git, `prod-static/serve`
# may be empty so we can't do the non-Git thing, whereas if we
# installed from a tarball, we won't have a `tools/` directory and
# thus cannot run `tools/update-prod-static`.
if not args.ignore_static_assets:
    if args.from_git:
        # Because `upgrade-zulip-from-git` needs to build static assets, it
        # is at risk of being OOM killed on systems with limited free RAM.
        page_size = os.sysconf("SC_PAGE_SIZE")
        page_count = os.sysconf("SC_PHYS_PAGES")
        mem_gib = (page_size * page_count) / (1024.0**3)  # e.g. 3.74

        # Ideally, we'd have 2 thresholds here, depending on whether the
        # system is running queue workers multithreaded or multiprocess.
        # See puppet/zulip/manifests/app_frontend_base.pp for background.
        if mem_gib < 4.2:
            logging.info("Shutting down server to ensure sufficient free RAM for webpack.")
            shutdown_server()

        # Note: The fact that this is before we apply Puppet changes means
        # that we don't support adding new Puppet dependencies of
        # update-prod-static with the Git upgrade process.  But it'll fail
        # safely; this seems like a worthwhile tradeoff to minimize downtime.
        logging.info("Building static assets...")
        try:
            subprocess.check_call(["./tools/update-prod-static"], preexec_fn=su_to_zulip)
        except subprocess.CalledProcessError:
            logging.error("Failed to build static assets.")
            if IS_SERVER_UP:
                logging.error("Usually the cause is insufficient free RAM to run webpack.")
                logging.error(
                    "Try stopping the Zulip server (scripts/stop-server) and trying again."
                )
            sys.exit(1)

        logging.info("Caching Zulip Git version...")
        subprocess.check_call(["./tools/cache-zulip-git-version"], preexec_fn=su_to_zulip)
    else:
        # Since this doesn't do any actual work, it's likely safe to have
        # this run before we apply Puppet changes (saving a bit of downtime).
        logging.info("Installing static assets...")
        static_source = os.path.join(deploy_path, "prod-static/serve")
        subprocess.check_call(
            ["cp", "-rT", static_source, "/home/zulip/prod-static"],
            preexec_fn=su_to_zulip,
        )

# `showmigrations` lines for the not-yet-applied migrations that make
# us run `create_large_indexes` ahead of the downtime.
usermessage_index_migrations = [
    "[ ] 0082_index_starred_user_messages",
    "[ ] 0083_index_mentioned_user_messages",
    "[ ] 0095_index_unread_user_messages",
    "[ ] 0098_index_has_alert_word_user_messages",
    "[ ] 0099_index_wildcard_mentioned_user_messages",
    "[ ] 0177_user_message_add_and_index_is_private_flag",
    "[ ] 0180_usermessage_add_active_mobile_push_notification",
]

# Our next optimization is to check whether any migrations are needed
# before we start the critical section of the restart.  This saves
# about 1s of downtime in a no-op upgrade.
migrations_needed = False
if not args.skip_migrations:
    logging.info("Checking for needed migrations")
    migrations_output = subprocess.check_output(
        ["./manage.py", "showmigrations"], preexec_fn=su_to_zulip, text=True
    )
    need_create_large_indexes = False
    # Any output line beginning with "[ ]" is treated as a pending
    # migration; the ones listed in usermessage_index_migrations
    # additionally trigger create_large_indexes below.
    for ln in migrations_output.split("\n"):
        line_str = ln.strip()
        if line_str.startswith("[ ]"):
            migrations_needed = True
            if line_str in usermessage_index_migrations:
                need_create_large_indexes = True
    if need_create_large_indexes:
        # This runs before shutdown_server() is called below, i.e.
        # outside the downtime window.
        logging.info("Creating some expensive indexes before starting downtime.")
        subprocess.check_call(["./manage.py", "create_large_indexes"], preexec_fn=su_to_zulip)

if (not args.skip_puppet or migrations_needed) and IS_SERVER_UP:
    # By default, we shut down the service to apply migrations and
    # Puppet changes, to minimize risk of issues due to inconsistent
    # state.
    shutdown_server()

if rabbitmq_dist_listen:
    # shutdown_server() does nothing if the server was already stopped.
    shutdown_server()
    logging.info("Shutting down rabbitmq to adjust its ports...")
    subprocess.check_call(["/usr/sbin/service", "rabbitmq-server", "stop"])

if cookie_size is not None and cookie_size == 20:
    # Checking for a 20-character cookie is used as a signal that it
    # was generated by Erlang's insecure randomizer, which only
    # provides between 20 and 36 bits of entropy; were it 20
    # characters long by a good randomizer, it would be 96 bits and
    # more than sufficient.  We generate, using good randomness, a
    # 255-character cookie, the max allowed length.
    logging.info("Generating a secure erlang cookie...")
    subprocess.check_call(["./scripts/setup/generate-rabbitmq-cookie"])

# Adjust Puppet class names for the manifest renames in the 4.0 release
class_renames = {
    "zulip::app_frontend": "zulip::profile::app_frontend",
    "zulip::dockervoyager": "zulip::profile::docker",
    "zulip::memcached": "zulip::profile::memcached",
    "zulip::postgres_appdb_tuned": "zulip::profile::postgresql",
    "zulip::postgres_backups": "zulip::postgresql_backups",
    "zulip::rabbit": "zulip::profile::rabbitmq",
    "zulip::voyager": "zulip::profile::standalone",
}
# The configured value is a comma-separated list.  "zulip::base" is
# dropped from it, and every other class is mapped through
# class_renames (classes without an entry are kept unchanged).
classes = re.split(r"\s*,\s*", get_config(config_file, "machine", "puppet_classes"))
new_classes = [class_renames.get(c, c) for c in classes if c != "zulip::base"]
if classes != new_classes:
    # Only rewrite the configuration file when something changed.
    logging.info("Adjusting Puppet classes for renames...")
    subprocess.check_call(
        [
            "crudini",
            "--set",
            "/etc/zulip/zulip.conf",
            "machine",
            "puppet_classes",
            ", ".join(new_classes),
        ]
    )

if not args.skip_puppet:
    logging.info("Applying Puppet changes...")
    subprocess.check_call(["./scripts/zulip-puppet-apply", "--force"])
    # Second package upgrade pass, run after Puppet has been applied.
    subprocess.check_call(["apt-get", "-y", "--allow-downgrades", "upgrade"])

if migrations_needed:
    logging.info("Applying database migrations...")
    subprocess.check_call(["./manage.py", "migrate", "--noinput"], preexec_fn=su_to_zulip)

# Run unconditionally, whether or not any migrations were applied.
subprocess.check_call(["./manage.py", "create_realm_internal_bots"], preexec_fn=su_to_zulip)

logging.info("Restarting Zulip...")
if IS_SERVER_UP or not args.skip_puppet:
    # Even if the server wasn't up previously, puppet might have
    # started it if there were supervisord configuration changes, so
    # we need to use restart-server if puppet ran.
    restart_args = ["--fill-cache"]
    # Forward the restart-tuning flags from our own command line.
    if args.skip_tornado:
        restart_args.append("--skip-tornado")
    if args.less_graceful:
        restart_args.append("--less-graceful")
    subprocess.check_call(["./scripts/restart-server", *restart_args], preexec_fn=su_to_zulip)
else:
    # We stopped the server ourselves and puppet did not run, so a
    # plain start is used instead of a restart.
    subprocess.check_call(["./scripts/start-server", "--fill-cache"], preexec_fn=su_to_zulip)

logging.info("Upgrade complete!")

# The remaining steps run after the server has been (re)started.
if args.audit_fts_indexes:
    logging.info("Correcting full-text search indexes for updated dictionary files")
    logging.info("This may take a while but the server should work while it runs.")
    subprocess.check_call(["./manage.py", "audit_fts_indexes"], preexec_fn=su_to_zulip)

if not args.skip_purge_old_deployments:
    logging.info("Purging old deployments...")
    subprocess.check_call(["./scripts/purge-old-deployments"])
else:
    logging.info("Skipping purging old deployments.")

if args.skip_puppet:
    # Puppet was not applied during this upgrade; show what it would
    # have changed via --noop --show_diff.
    logging.info("Showing un-applied Puppet changes:")
    subprocess.check_call(["./scripts/zulip-puppet-apply", "--noop", "--show_diff"])