mirror of
				https://github.com/zulip/zulip.git
				synced 2025-11-04 05:53:43 +00:00 
			
		
		
		
	puppet: Remove check_postgresql_backup.
We have replaced this monitoring with the black-box wal-g monitoring, which is more accurate.
This commit is contained in:
		
				
					committed by
					
						
						Tim Abbott
					
				
			
			
				
	
			
			
			
						parent
						
							fef31614d3
						
					
				
				
					commit
					230040caa9
				
			@@ -245,10 +245,8 @@ To restore from a manual backup, the process is basically the reverse of the abo
 | 
				
			|||||||
This restoration process can also be used to migrate a Zulip
 | 
					This restoration process can also be used to migrate a Zulip
 | 
				
			||||||
installation from one server to another.
 | 
					installation from one server to another.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
We recommend running a disaster recovery test after setting up your
 | 
					We recommend running a disaster recovery test after setting up your backups to
 | 
				
			||||||
backups to confirm that your backups are working. You may also want to
 | 
					confirm that your backups are working.
 | 
				
			||||||
monitor that they are up to date using the Nagios plugin at:
 | 
					 | 
				
			||||||
`puppet/zulip/files/nagios_plugins/zulip_postgresql_backups/check_postgresql_backup`.
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
## Data export
 | 
					## Data export
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -528,9 +526,9 @@ it may be minutes before the backup is saved into S3 -- see
 | 
				
			|||||||
If you need always-current backup availability, Zulip also has
 | 
					If you need always-current backup availability, Zulip also has
 | 
				
			||||||
[built-in database replication support](postgresql.md#postgresql-warm-standby).
 | 
					[built-in database replication support](postgresql.md#postgresql-warm-standby).
 | 
				
			||||||
 | 
					
 | 
				
			||||||
You can (and should) monitor that backups are running regularly via
 | 
					You can (and should) monitor that backups are running regularly, for instance
 | 
				
			||||||
the Nagios plugin installed into
 | 
					via the Prometheus exporter found in
 | 
				
			||||||
`/usr/lib/nagios/plugins/zulip_postgresql_backups/check_postgresql_backup`.
 | 
					`puppet/zulip/files/postgresql/wal-g-exporter`.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### Streaming backups to S3
 | 
					### Streaming backups to S3
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -296,7 +296,6 @@ Database monitoring:
 | 
				
			|||||||
- `check_fts_update_log`: Checks whether full-text search updates are
 | 
					- `check_fts_update_log`: Checks whether full-text search updates are
 | 
				
			||||||
  being processed properly or getting backlogged.
 | 
					  being processed properly or getting backlogged.
 | 
				
			||||||
- `check_postgres`: General checks for database health.
 | 
					- `check_postgres`: General checks for database health.
 | 
				
			||||||
- `check_postgresql_backup`: Checks status of PostgreSQL backups.
 | 
					 | 
				
			||||||
- `check_postgresql_replication_lag`: Checks whether PostgreSQL streaming
 | 
					- `check_postgresql_replication_lag`: Checks whether PostgreSQL streaming
 | 
				
			||||||
  replication is up to date.
 | 
					  replication is up to date.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -145,11 +145,6 @@ define command {
 | 
				
			|||||||
        command_line    /usr/lib/nagios/plugins/check_by_ssh -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_postgresql/check_postgresql_replication_lag'
 | 
					        command_line    /usr/lib/nagios/plugins/check_by_ssh -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_postgresql/check_postgresql_replication_lag'
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
define command {
 | 
					 | 
				
			||||||
        command_name    check_postgresql_backup
 | 
					 | 
				
			||||||
        command_line    /usr/lib/nagios/plugins/check_by_ssh -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_postgresql_backups/check_postgresql_backup'
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
define command{
 | 
					define command{
 | 
				
			||||||
       command_name     check_worker_memory
 | 
					       command_name     check_worker_memory
 | 
				
			||||||
       command_line    /usr/lib/nagios/plugins/check_by_ssh -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_worker_memory'
 | 
					       command_line    /usr/lib/nagios/plugins/check_by_ssh -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_worker_memory'
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -273,14 +273,6 @@ define service {
 | 
				
			|||||||
        contact_groups                  admins
 | 
					        contact_groups                  admins
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
define service {
 | 
					 | 
				
			||||||
        use                             generic-service
 | 
					 | 
				
			||||||
        service_description             Check last PostgreSQL backup time
 | 
					 | 
				
			||||||
        hostgroup_name                  postgresql
 | 
					 | 
				
			||||||
        check_command                   check_postgresql_backup
 | 
					 | 
				
			||||||
        contact_groups                  admins
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
#### Redis
 | 
					#### Redis
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,66 +0,0 @@
 | 
				
			|||||||
#!/usr/bin/env python3
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
import subprocess
 | 
					 | 
				
			||||||
import sys
 | 
					 | 
				
			||||||
from datetime import datetime, timedelta, timezone
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
import dateutil.parser
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
states = {
 | 
					 | 
				
			||||||
    "OK": 0,
 | 
					 | 
				
			||||||
    "WARNING": 1,
 | 
					 | 
				
			||||||
    "CRITICAL": 2,
 | 
					 | 
				
			||||||
    "UNKNOWN": 3,
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def report(state: str, msg: str) -> None:
 | 
					 | 
				
			||||||
    print(f"{state}: {msg}")
 | 
					 | 
				
			||||||
    sys.exit(states[state])
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
replicas = subprocess.check_output(
 | 
					 | 
				
			||||||
    [
 | 
					 | 
				
			||||||
        "psql",
 | 
					 | 
				
			||||||
        "-v",
 | 
					 | 
				
			||||||
        "ON_ERROR_STOP=1",
 | 
					 | 
				
			||||||
        "postgres",
 | 
					 | 
				
			||||||
        "-t",
 | 
					 | 
				
			||||||
        "-c",
 | 
					 | 
				
			||||||
        "SELECT COUNT(*) FROM pg_stat_replication",
 | 
					 | 
				
			||||||
    ],
 | 
					 | 
				
			||||||
    stdin=subprocess.DEVNULL,
 | 
					 | 
				
			||||||
    text=True,
 | 
					 | 
				
			||||||
).strip()
 | 
					 | 
				
			||||||
if int(replicas) > 0:
 | 
					 | 
				
			||||||
    # We are the primary and we have replicas; we expect that backups
 | 
					 | 
				
			||||||
    # will be taken on one of them.
 | 
					 | 
				
			||||||
    report("OK", "this is the primary, with backups taken on the replicas")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
skip_backups = subprocess.run(
 | 
					 | 
				
			||||||
    ["crudini", "--get", "/etc/zulip/zulip.conf", "postgresql", "skip_backups"],
 | 
					 | 
				
			||||||
    capture_output=True,
 | 
					 | 
				
			||||||
    text=True,
 | 
					 | 
				
			||||||
    check=False,
 | 
					 | 
				
			||||||
)
 | 
					 | 
				
			||||||
if skip_backups.returncode == 0 and skip_backups.stdout.strip().lower() in [
 | 
					 | 
				
			||||||
    1,
 | 
					 | 
				
			||||||
    "y",
 | 
					 | 
				
			||||||
    "t",
 | 
					 | 
				
			||||||
    "yes",
 | 
					 | 
				
			||||||
    "true",
 | 
					 | 
				
			||||||
    "enable",
 | 
					 | 
				
			||||||
    "enabled",
 | 
					 | 
				
			||||||
]:
 | 
					 | 
				
			||||||
    report("OK", "backups are disabled on this host")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
try:
 | 
					 | 
				
			||||||
    with open("/var/lib/nagios_state/last_postgresql_backup") as f:
 | 
					 | 
				
			||||||
        last_backup = dateutil.parser.parse(f.read())
 | 
					 | 
				
			||||||
except OSError:
 | 
					 | 
				
			||||||
    report("UNKNOWN", "could not determine completion time of last PostgreSQL backup")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
if datetime.now(tz=timezone.utc) - last_backup > timedelta(hours=25):
 | 
					 | 
				
			||||||
    report("CRITICAL", f"last PostgreSQL backup completed more than 25 hours ago: {last_backup}")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
report("OK", f"last PostgreSQL backup completed less than 25 hours ago: {last_backup}")
 | 
					 | 
				
			||||||
@@ -58,11 +58,6 @@ env = os.environ.copy()
 | 
				
			|||||||
env["WALG_UPLOAD_DISK_CONCURRENCY"] = disk_concurrency
 | 
					env["WALG_UPLOAD_DISK_CONCURRENCY"] = disk_concurrency
 | 
				
			||||||
subprocess.check_call(["env-wal-g", "backup-push", pg_data_path], env=env)
 | 
					subprocess.check_call(["env-wal-g", "backup-push", pg_data_path], env=env)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
now = datetime.now(tz=timezone.utc)
 | 
					 | 
				
			||||||
with open("/var/lib/nagios_state/last_postgresql_backup", "w") as f:
 | 
					 | 
				
			||||||
    f.write(now.isoformat())
 | 
					 | 
				
			||||||
    f.write("\n")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
backups = {}
 | 
					backups = {}
 | 
				
			||||||
lines = subprocess.check_output(["env-wal-g", "backup-list"], text=True).split("\n")
 | 
					lines = subprocess.check_output(["env-wal-g", "backup-list"], text=True).split("\n")
 | 
				
			||||||
for line in lines[1:]:
 | 
					for line in lines[1:]:
 | 
				
			||||||
@@ -70,7 +65,7 @@ for line in lines[1:]:
 | 
				
			|||||||
        backup_name, date_str = line.split()[0:2]
 | 
					        backup_name, date_str = line.split()[0:2]
 | 
				
			||||||
        backups[dateutil.parser.parse(date_str)] = backup_name
 | 
					        backups[dateutil.parser.parse(date_str)] = backup_name
 | 
				
			||||||
 | 
					
 | 
				
			||||||
one_month_ago = now - timedelta(days=30)
 | 
					one_month_ago = datetime.now(tz=timezone.utc) - timedelta(days=30)
 | 
				
			||||||
for date in sorted(backups.keys(), reverse=True):
 | 
					for date in sorted(backups.keys(), reverse=True):
 | 
				
			||||||
    if date < one_month_ago:
 | 
					    if date < one_month_ago:
 | 
				
			||||||
        # We pass `FIND_FULL` such that if delta backups are being
 | 
					        # We pass `FIND_FULL` such that if delta backups are being
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -45,14 +45,4 @@ class zulip::postgresql_backups {
 | 
				
			|||||||
      mode   => '0600',
 | 
					      mode   => '0600',
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					 | 
				
			||||||
  file { "${zulip::common::nagios_plugins_dir}/zulip_postgresql_backups":
 | 
					 | 
				
			||||||
    require => Package[$zulip::common::nagios_plugins],
 | 
					 | 
				
			||||||
    recurse => true,
 | 
					 | 
				
			||||||
    purge   => true,
 | 
					 | 
				
			||||||
    owner   => 'root',
 | 
					 | 
				
			||||||
    group   => 'root',
 | 
					 | 
				
			||||||
    mode    => '0755',
 | 
					 | 
				
			||||||
    source  => 'puppet:///modules/zulip/nagios_plugins/zulip_postgresql_backups',
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user