Files
zulip/puppet/zulip-internal/files/nagios_plugins/check_pg_replication_lag
Zev Benjamin dd678465ae [manual] Move puppet modules to the top level
The new puppet.conf file has to be moved into place manually.

(imported from commit 253d9a95386dae8c803a998ce2dc7e8be40c880a)
2013-10-30 15:42:26 -04:00

83 lines
3.2 KiB
Python
Executable File

#!/usr/bin/python
"""
Nagios plugin to check the difference between the primary and
secondary Postgres servers' xlog location.
"""
import subprocess
import re
# Nagios plugin exit codes, keyed by the state name printed in the output.
states = {
    "OK": 0,
    "WARNING": 1,
    "CRITICAL": 2,
    "UNKNOWN": 3,
}


def report(state, msg):
    """Print a ``STATE: message`` status line and terminate the plugin.

    Args:
        state: One of the keys of ``states``.  An unrecognised name is
            still printed as given, but the process exits with the
            UNKNOWN code rather than dying with a KeyError traceback
            (which would exit 1 and be read as WARNING).
        msg: Human-readable detail appended after the state name.

    Raises:
        SystemExit: Always; the exit code is the numeric value of ``state``.
    """
    # Imported locally so this block does not rely on the file's import list.
    import sys

    # Parenthesised single-argument form behaves identically under the
    # Python 2 print statement and the Python 3 print function.
    print("%s: %s" % (state, msg))
    # sys.exit instead of the bare `exit`, which is a helper installed by
    # the `site` module for interactive use and is not always available.
    sys.exit(states.get(state, states["UNKNOWN"]))
def get_loc_over_ssh(host, func):
    """Return the raw psql output of ``SELECT <func>()`` run on ``host`` via ssh.

    The command is executed as the ``zulip`` user on the remote machine.
    stderr is merged into stdout so that, when the command fails, the
    error text is available for the status message.

    Args:
        host: Hostname to ssh to.
        func: Name of the zero-argument SQL function to call.  It is
            interpolated into the remote command line verbatim, so it
            must come from trusted code, never from external input.

    Returns:
        The combined stdout/stderr of the remote psql invocation, as text.

    On any failure to run the command this calls ``report('CRITICAL', ...)``,
    which terminates the process instead of returning.
    """
    command = ['ssh', host, '-l', 'zulip',
               'psql -t -c "SELECT %s()"' % (func,)]
    try:
        # universal_newlines=True yields text rather than bytes on Python 3
        # (and is harmless on Python 2.7), so the result can be matched
        # against str regular expressions.
        return subprocess.check_output(command,
                                       stderr=subprocess.STDOUT,
                                       universal_newlines=True)
    except subprocess.CalledProcessError as e:
        report('CRITICAL', 'ssh failed: %s: %s' % (str(e), e.output))
    except OSError as e:
        # Raised when the ssh binary itself cannot be started (missing,
        # not executable, ...); surface it as a plugin status rather than
        # a traceback.
        report('CRITICAL', 'ssh failed: %s' % (e,))
def loc_to_abs_offset(loc_str):
    """Convert an ``XXXX/YYYY`` xlog location string to an absolute byte offset.

    Both halves are hexadecimal; surrounding whitespace is ignored.

    Args:
        loc_str: Location text such as ``"2/A0"``.

    Returns:
        ``0xFF000000 * <first half> + <second half>`` as an integer.

    Raises:
        ValueError: If ``loc_str`` is not two hex numbers separated by ``/``.

    NOTE(review): the 0xFF000000 multiplier follows the PostgreSQL 9.2
    layout described below -- confirm it matches the server version in use.
    """
    parsed = re.match(r'^\s*([0-9a-fA-F]+)/([0-9a-fA-F]+)\s*$', loc_str)
    if parsed is None:
        raise ValueError("Unknown xlog location format: " + loc_str)

    xlog_id, record_offset = (int(part, 16) for part in parsed.groups())

    # PostgreSQL 9.2's pg_xlog_location_diff computes
    #     XLogFileSize * (xlogid1 - xlogid2) + xrecoff1 - xrecoff2
    # so with the second location taken as zero the absolute offset is
    #     XLogFileSize * xlogid1 + xrecoff1
    #
    # xlog_internal.h defines
    #     XLogSegSize     = (uint32) XLOG_SEG_SIZE
    #     XLogSegsPerFile = ((uint32) 0xffffffff) / XLogSegSize
    #     XLogFileSize    = XLogSegsPerFile * XLogSegSize
    # which, for the usual 16MB XLOG_SEG_SIZE, gives 0xFF000000.
    xlog_file_size = 0xFF000000
    return xlog_file_size * xlog_id + record_offset
# xlog segments are normally 16MB each. These thresholds are pretty arbitrary.
XLOG_SEGMENT_BYTES = 16 * 1024**2
WARNING_LAG_BYTES = XLOG_SEGMENT_BYTES
CRITICAL_LAG_BYTES = 5 * XLOG_SEGMENT_BYTES

# Fetch the locations in this order to make the differences positive
# in the normal case given the delay in getting the values via ssh
secondary_replay_loc = get_loc_over_ssh('postgres-secondary.zulip.net', 'pg_last_xlog_replay_location')
secondary_recv_loc = get_loc_over_ssh('postgres-secondary.zulip.net', 'pg_last_xlog_receive_location')
primary_loc = get_loc_over_ssh('postgres-primary.zulip.net', 'pg_current_xlog_location')

try:
    primary_offset = loc_to_abs_offset(primary_loc)
    secondary_recv_offset = loc_to_abs_offset(secondary_recv_loc)
    secondary_replay_offset = loc_to_abs_offset(secondary_replay_loc)
except ValueError as e:
    # psql printed something that is not an xlog location (blank output,
    # an error or warning mixed into the output, ...).  The lag cannot be
    # determined, so say so explicitly instead of dying with a traceback.
    report('UNKNOWN', str(e))

# How far the secondary trails the primary in received xlog, and how far
# its replay trails what it has received.
recv_diff = primary_offset - secondary_recv_offset
replay_diff = secondary_recv_offset - secondary_replay_offset

# report() exits the process, so the first matching check below wins.
if recv_diff > CRITICAL_LAG_BYTES:
    report('CRITICAL', 'secondary is %d bytes behind on receiving xlog' % (recv_diff,))
if replay_diff > CRITICAL_LAG_BYTES:
    report('CRITICAL', 'secondary is %d bytes behind on applying received xlog' % (replay_diff,))

# A negative difference is unexpected given the fetch order above.  The
# sign is flipped so the message reads "N bytes ahead" with a positive N.
if recv_diff < 0:
    report('CRITICAL', 'secondary is %d bytes ahead on receiving xlog' % (-recv_diff,))
if replay_diff < 0:
    report('CRITICAL', 'secondary is %d bytes ahead on applying received xlog' % (-replay_diff,))

if recv_diff > WARNING_LAG_BYTES:
    report('WARNING', 'secondary is %d bytes behind on receiving xlog' % (recv_diff,))
if replay_diff > WARNING_LAG_BYTES:
    report('WARNING', 'secondary is %d bytes behind on applying received xlog' % (replay_diff,))

report('OK', ('secondary is %d bytes behind on receiving and %d bytes behind on applying xlog'
              % (recv_diff, replay_diff)))