From 19a11c9556ea86286aea0bd3fff4c8e1ea8bfec2 Mon Sep 17 00:00:00 2001 From: Alex Vandiver Date: Wed, 26 Apr 2023 01:21:36 +0000 Subject: [PATCH] pg_backup_and_purge: Take backups on replicas, if present. Taking backups on the database primary adds additional disk load, which can impact the performance of the application. Switch to taking backups on replicas, if they exist. Some deployments may have multiple replicas, and taking backups on all of them is wasteful and potentially confusing; add a flag to inhibit taking nightly snapshots on the host. If the deployment is a single instance of PostgreSQL, with no replicas, it takes backups as before, modulo the extra flag to allow skipping taking them. --- docs/production/deployment.md | 8 ++++ .../check_postgresql_backup | 40 +++++++++++++++---- .../files/postgresql/pg_backup_and_purge | 31 +++++++++++--- 3 files changed, 66 insertions(+), 13 deletions(-) diff --git a/docs/production/deployment.md b/docs/production/deployment.md index a036990c02..37b5f2ba68 100644 --- a/docs/production/deployment.md +++ b/docs/production/deployment.md @@ -836,6 +836,14 @@ based on the `pg_hba.conf` file; if you are using password authentication, you can set a `postgresql_replication_password` secret for authentication. +#### `skip_backups` + +If set to as true value, inhibits the nightly [`wal-g` backups][wal-g] which +would be taken on all non-replicated hosts and [all warm standby +replicas](#postgresql-warm-standby). This is generally only set if you have +multiple warm standby replicas, in order to avoid taking multiple backups, one +per replica. + #### `ssl_ca_file` Set to the path to the PEM-encoded certificate authority used to diff --git a/puppet/zulip/files/nagios_plugins/zulip_postgresql_backups/check_postgresql_backup b/puppet/zulip/files/nagios_plugins/zulip_postgresql_backups/check_postgresql_backup index 5a14fd7630..2f3e268b5f 100755 --- a/puppet/zulip/files/nagios_plugins/zulip_postgresql_backups/check_postgresql_backup +++ b/puppet/zulip/files/nagios_plugins/zulip_postgresql_backups/check_postgresql_backup @@ -19,13 +19,39 @@ def report(state: str, msg: str) -> None: sys.exit(states[state]) -if ( - subprocess.check_output( - ["psql", "-v", "ON_ERROR_STOP=1", "postgres", "-t", "-c", "SELECT pg_is_in_recovery()"] - ).strip() - != b"f" -): - report("OK", "this is not the primary") +replicas = subprocess.check_output( + [ + "psql", + "-v", + "ON_ERROR_STOP=1", + "postgres", + "-t", + "-c", + "SELECT COUNT(*) FROM pg_stat_replication", + ], + stdin=subprocess.DEVNULL, + text=True, +).strip() +if int(replicas) > 0: + # We are the primary and we have replicas; we expect that backups + # will be taken on one of them. + report("OK", "this is the primary, with backups taken on the replicas") + +skip_backups = subprocess.run( + ["crudini", "--get", "/etc/zulip/zulip.conf", "postgresql", "skip_backups"], + capture_output=True, + text=True, +) +if skip_backups.returncode == 0 and skip_backups.stdout.strip().lower() in [ + 1, + "y", + "t", + "yes", + "true", + "enable", + "enabled", +]: + report("OK", "backups are disabled on this host") try: with open("/var/lib/nagios_state/last_postgresql_backup") as f: diff --git a/puppet/zulip/files/postgresql/pg_backup_and_purge b/puppet/zulip/files/postgresql/pg_backup_and_purge index 9afd77c9bf..d35cc77fd8 100755 --- a/puppet/zulip/files/postgresql/pg_backup_and_purge +++ b/puppet/zulip/files/postgresql/pg_backup_and_purge @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +import configparser import glob import logging import os @@ -14,14 +15,32 @@ logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s") logger = logging.getLogger(__name__) -recovery_val = subprocess.check_output( - ["psql", "-v", "ON_ERROR_STOP=1", "-t", "-c", "SELECT pg_is_in_recovery()"], +config_file = configparser.RawConfigParser() +config_file.read("/etc/zulip/zulip.conf") + + +def get_config( + section: str, + key: str, + default_value: str = "", +) -> str: + if config_file.has_option(section, key): + return config_file.get(section, key) + return default_value + + +replicas = subprocess.check_output( + ["psql", "-v", "ON_ERROR_STOP=1", "-t", "-c", "SELECT COUNT(*) FROM pg_stat_replication"], + stdin=subprocess.DEVNULL, text=True, ).strip() -# Assertion to check that we're extracting the value correctly. -assert recovery_val in ["t", "f"] -if recovery_val == "t": - # Only run if we're the primary +if int(replicas) > 0: + # We are the primary and we have replicas; we expect that backups + # will be taken on one of them. + sys.exit(0) + +skip_backups = get_config("postgresql", "skip_backups", "") +if skip_backups.lower() in [1, "y", "t", "yes", "true", "enable", "enabled"]: sys.exit(0) is_rhel_based = os.path.exists("/etc/redhat-release")