mirror of
				https://github.com/zulip/zulip.git
				synced 2025-11-03 21:43:21 +00:00 
			
		
		
		
	prometheus: Add an exporter for wal-g backup properties.
Since backups may now be taken on arbitrary hosts, we need a blackbox monitor which verifies that _some_ backup was produced. Add a Prometheus exporter which calls `wal-g backup-list` and reports statistics about the backups. This could be extended to include `wal-g wal-verify`, but that requires a connection to the PostgreSQL server.
This commit is contained in:
		
				
					committed by
					
						
						Tim Abbott
					
				
			
			
				
	
			
			
			
						parent
						
							a22168d8b3
						
					
				
				
					commit
					3aba2789d3
				
			
							
								
								
									
										153
									
								
								puppet/zulip/files/postgresql/wal-g-exporter
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										153
									
								
								puppet/zulip/files/postgresql/wal-g-exporter
									
									
									
									
									
										Executable file
									
								
							@@ -0,0 +1,153 @@
 | 
			
		||||
#!/usr/bin/env python3
 | 
			
		||||
import configparser
 | 
			
		||||
import contextlib
 | 
			
		||||
import json
 | 
			
		||||
import logging
 | 
			
		||||
import subprocess
 | 
			
		||||
import sys
 | 
			
		||||
from collections import defaultdict
 | 
			
		||||
from datetime import datetime, timedelta, timezone
 | 
			
		||||
from http.server import BaseHTTPRequestHandler, HTTPServer
 | 
			
		||||
from typing import Dict, List, Mapping, Optional, Protocol
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class GaugeMetric(Protocol):
    """Structural type of a callable which records one value of a gauge.

    The callable takes the numeric value and, optionally, a mapping of
    label names to label values; both arguments are positional-only.
    """

    def __call__(self, value: float, labels: Optional[Mapping[str, str]] = None, /) -> None:
        ...
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class WalGPrometheusServer(BaseHTTPRequestHandler):
    """Serve statistics about wal-g backups in the Prometheus text format.

    Each GET of ``/metrics`` runs ``env-wal-g backup-list --detail --json``
    and reports gauges (all prefixed with ``METRIC_PREFIX``) describing the
    backups it lists.  Any other path gets an empty 404 response.
    """

    METRIC_PREFIX = "wal_g_backup_"

    # Registered metrics, keyed by unprefixed name; each value is the list of
    # "# TYPE" / "# HELP" preamble lines for that metric.
    metrics: Dict[str, List[str]] = {}
    # Rendered sample lines, keyed by metric name and then by label string.
    metric_values: Dict[str, Dict[str, str]] = defaultdict(dict)
    # do_GET rebinds both of the above on the instance before registering
    # any gauge, so every /metrics request starts from an empty registry.

    server_version = "wal-g-prometheus-server/1.0"

    @staticmethod
    def _escape_label_value(value: object) -> str:
        """Return `value` as text which is safe inside a quoted label value.

        The Prometheus text exposition format requires backslash, double
        quote and line feed to be escaped in label values; an unescaped one
        would make the whole scrape unparsable.
        """
        return str(value).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")

    def gauge(
        self, name: str, description: Optional[str] = None, default_value: Optional[float] = None
    ) -> GaugeMetric:
        """Register a gauge and return a callable which records its values.

        Args:
            name: Metric name, without ``METRIC_PREFIX``.
            description: Optional text for the metric's ``# HELP`` line.
            default_value: If not None, recorded immediately as the
                unlabelled value of the metric.

        Returns:
            A callable taking ``(value, labels=None)``; calling it again
            with the same labels overwrites the previously recorded value.

        Raises:
            ValueError: If a metric called `name` is already registered.
        """
        if name in self.metrics:
            raise ValueError(f"Redefinition of {name} metric")
        self.metrics[name] = [f"# TYPE {self.METRIC_PREFIX}{name} gauge"]
        if description is not None:
            self.metrics[name].append(f"# HELP {self.METRIC_PREFIX}{name} {description}")

        def inner(value: float, labels: Optional[Mapping[str, str]] = None) -> None:
            label_str = ""
            if labels:
                rendered = ",".join(
                    f'{k}="{self._escape_label_value(v)}"' for k, v in labels.items()
                )
                label_str = "{" + rendered + "}"
            # Keyed by the label string, so each label set keeps one sample.
            self.metric_values[name][label_str] = f"{self.METRIC_PREFIX}{name}{label_str} {value}"

        if default_value is not None:
            inner(default_value)
        return inner

    def print_metrics(self) -> None:
        """Write every metric which has at least one value to the response.

        Metrics are written in registration order; each one is its preamble
        lines, then its samples, then a blank line.
        """
        lines: List[str] = []
        for metric_name in self.metrics:
            if self.metric_values[metric_name]:
                # Print preamble
                lines += self.metrics[metric_name]
                lines.extend(self.metric_values[metric_name].values())
                lines.append("")
        self.wfile.write("\n".join(lines).encode())

    def do_GET(self) -> None:
        """Handle a GET request; only ``/metrics`` is served.

        The 200 status and headers are sent before wal-g is run, so a
        failure to list backups is reported through the ``ok`` gauge (which
        stays at 0) rather than through the HTTP status.
        """
        if self.path != "/metrics":
            self.send_response(404)
            self.end_headers()
            sys.stderr.flush()
            return

        self.send_response(200)
        self.send_header("Content-type", "text/plain; version=0.0.4")
        self.end_headers()

        # Start from an empty registry, so nothing is carried over from the
        # class-level defaults.
        self.metrics = {}
        self.metric_values = defaultdict(dict)

        # "ok" defaults to 0 and is only set to 1 once everything below has
        # succeeded.
        backup_ok = self.gauge("ok", "If wal-g backup-list was OK", 0)
        backup_count = self.gauge("count", "Number of backups")
        backup_earliest_age_seconds = self.gauge("earliest_age_seconds", "Age of the oldest backup")
        backup_latest_age_seconds = self.gauge(
            "latest_age_seconds", "Age of the most recent backup"
        )
        backup_latest_duration_seconds = self.gauge(
            "latest_duration_seconds", "Duration the most recent backup took, in seconds"
        )
        backup_latest_compressed_size_bytes = self.gauge(
            "latest_compressed_size_bytes", "Size of the most recent backup, in bytes"
        )
        backup_latest_uncompressed_size_bytes = self.gauge(
            "latest_uncompressed_size_bytes",
            "Uncompressed size of the most recent backup, in bytes",
        )
        backup_total_compressed_size_bytes = self.gauge(
            "total_compressed_size_bytes", "Total compressed size of all backups, in bytes"
        )

        now = datetime.now(tz=timezone.utc)
        try:
            config_file = configparser.RawConfigParser()
            config_file.read("/etc/zulip/zulip-secrets.conf")
            bucket = config_file["secrets"]["s3_backups_bucket"]

            backup_list_output = subprocess.check_output(
                ["env-wal-g", "backup-list", "--detail", "--json"],
                text=True,
            )
            data = json.loads(backup_list_output)
            backup_count(len(data), {"bucket": bucket})

            backup_total_compressed_size_bytes(
                sum(e["compressed_size"] for e in data), {"bucket": bucket}
            )

            if data:
                # Newest first: data[0] is the latest backup, data[-1] the
                # earliest.
                data.sort(key=lambda e: e["time"], reverse=True)
                latest = data[0]
                earliest = data[-1]
                labels = {
                    "host": latest["hostname"],
                    "pg_version": str(latest["pg_version"]),
                    "bucket": bucket,
                }
                backup_latest_compressed_size_bytes(latest["compressed_size"], labels)
                backup_latest_uncompressed_size_bytes(latest["uncompressed_size"], labels)

                def t(key: str, e: Dict[str, str] = latest) -> datetime:
                    # Each entry carries the strptime format of its own
                    # timestamps in "date_fmt"; the parsed value is naive,
                    # and is treated as UTC here.
                    return datetime.strptime(e[key], e["date_fmt"]).replace(tzinfo=timezone.utc)

                backup_earliest_age_seconds(
                    (now - t("start_time", earliest)) / timedelta(seconds=1),
                    {
                        "host": earliest["hostname"],
                        "pg_version": str(earliest["pg_version"]),
                        "bucket": bucket,
                    },
                )
                backup_latest_age_seconds((now - t("start_time")) / timedelta(seconds=1), labels)
                backup_latest_duration_seconds(
                    (t("finish_time") - t("start_time")) / timedelta(seconds=1), labels
                )
            backup_ok(1)
        except Exception as e:
            # Deliberately broad: whatever went wrong, log it and still serve
            # the metrics gathered so far, with "ok" left at 0.
            logging.exception(e)
        finally:
            self.print_metrics()
            self.log_message(
                "Served in %.2f seconds",
                (datetime.now(tz=timezone.utc) - now) / timedelta(seconds=1),
            )
            sys.stderr.flush()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Script entry point: serve metrics on localhost until interrupted from the
# keyboard, then close the listening socket.
logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
listen_address = ("127.0.0.1", 9188)
server = HTTPServer(listen_address, WalGPrometheusServer)
logging.info("Starting server...")
with contextlib.suppress(KeyboardInterrupt):
    server.serve_forever()

server.server_close()
logging.info("Stopping server...")
 | 
			
		||||
@@ -6,6 +6,9 @@ class zulip_ops::profile::prometheus_server {
 | 
			
		||||
  include zulip_ops::profile::base
 | 
			
		||||
  include zulip_ops::prometheus::base
 | 
			
		||||
 | 
			
		||||
  # This blackbox monitoring of the backup system runs locally
 | 
			
		||||
  include zulip_ops::prometheus::wal_g
 | 
			
		||||
 | 
			
		||||
  $version = $zulip::common::versions['prometheus']['version']
 | 
			
		||||
  $dir = "/srv/zulip-prometheus-${version}"
 | 
			
		||||
  $bin = "${dir}/prometheus"
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										34
									
								
								puppet/zulip_ops/manifests/prometheus/wal_g.pp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										34
									
								
								puppet/zulip_ops/manifests/prometheus/wal_g.pp
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,34 @@
 | 
			
		||||
# @summary Prometheus monitoring of wal-g backups
#
# Installs the wal-g-exporter script and a supervisor program which runs it.
#
class zulip_ops::prometheus::wal_g {
  include zulip_ops::prometheus::base
  include zulip::supervisor
  include zulip::wal_g

  # The exporter script itself.
  file { '/usr/local/bin/wal-g-exporter':
    ensure  => file,
    source  => 'puppet:///modules/zulip/postgresql/wal-g-exporter',
    owner   => 'zulip',
    group   => 'zulip',
    mode    => '0755',
    require => User[zulip],
  }

  # The first 8 characters of the script's SHA-256 are made part of the
  # supervisor process name, so that a change to the script changes the
  # configuration which `supervisorctl reread` sees.
  $full_exporter_hash = sha256(file('zulip/postgresql/wal-g-exporter'))
  $exporter_hash = $full_exporter_hash[0,8]
  file { "${zulip::common::supervisor_conf_dir}/prometheus_wal_g_exporter.conf":
    ensure  => file,
    content => template('zulip_ops/supervisor/conf.d/prometheus_wal_g_exporter.conf.template.erb'),
    owner   => 'root',
    group   => 'root',
    mode    => '0644',
    require => [
      User[zulip],
      Package[supervisor],
      File['/usr/local/bin/wal-g-exporter'],
    ],
    notify  => Service[supervisor],
  }
}
 | 
			
		||||
@@ -0,0 +1,12 @@
 | 
			
		||||
[program:prometheus_wal_g_exporter]
 | 
			
		||||
# We record the hash of the script so that we can update this file
 | 
			
		||||
# with it, which will make `supervisorctl reread && supervisorctl
 | 
			
		||||
# update` restart this job.
 | 
			
		||||
command=/usr/local/bin/wal-g-exporter
 | 
			
		||||
process_name=wal-g-exporter_<%= @exporter_hash %>
 | 
			
		||||
priority=10
 | 
			
		||||
autostart=true
 | 
			
		||||
autorestart=true
 | 
			
		||||
user=zulip
 | 
			
		||||
redirect_stderr=true
 | 
			
		||||
stdout_logfile=/var/log/zulip/wal_g_exporter.log
 | 
			
		||||
		Reference in New Issue
	
	Block a user