From bd54f0363ea10016cf14d7620db94b9c38d69ba9 Mon Sep 17 00:00:00 2001 From: Alex Vandiver Date: Wed, 26 Mar 2025 13:54:26 +0000 Subject: [PATCH] kandra: Update prometheus configuration. This pulls in the more complete production Prometheus configuration. --- .../kandra/files/prometheus/prometheus.yaml | 29 -- .../manifests/profile/prometheus_server.pp | 16 +- .../prometheus/prometheus.yaml.template.erb | 263 ++++++++++++++++++ 3 files changed, 273 insertions(+), 35 deletions(-) delete mode 100644 puppet/kandra/files/prometheus/prometheus.yaml create mode 100644 puppet/kandra/templates/prometheus/prometheus.yaml.template.erb diff --git a/puppet/kandra/files/prometheus/prometheus.yaml b/puppet/kandra/files/prometheus/prometheus.yaml deleted file mode 100644 index 79f3a37f57..0000000000 --- a/puppet/kandra/files/prometheus/prometheus.yaml +++ /dev/null @@ -1,29 +0,0 @@ -global: - # Set the scrape interval to every 15 seconds. Default is every 1 minute. - scrape_interval: 15s - # Evaluate rules every 15 seconds. The default is every 1 minute. 
- evaluation_interval: 15s - -scrape_configs: - # Self-monitoring - - job_name: "prometheus" - static_configs: - - targets: ["localhost:9090"] - - job_name: "grafana" - static_configs: - - targets: ["localhost:3000"] - - # Fetch from node_exporter on all of the EC2 hosts - - job_name: "node" - ec2_sd_configs: - - region: us-east-1 - port: 9100 - refresh_interval: 1m - filters: - - name: instance-state-name - values: ["running"] - relabel_configs: - - source_labels: ["__meta_ec2_tag_Name"] - target_label: "instance" - - source_labels: ["__meta_ec2_tag_role"] - target_label: "role" diff --git a/puppet/kandra/manifests/profile/prometheus_server.pp b/puppet/kandra/manifests/profile/prometheus_server.pp index 2b785e4db4..99af1b7051 100644 --- a/puppet/kandra/manifests/profile/prometheus_server.pp +++ b/puppet/kandra/manifests/profile/prometheus_server.pp @@ -49,13 +49,17 @@ class kandra::profile::prometheus_server inherits kandra::profile::base { group => 'root', mode => '0755', } + + $czo = zulipconf('prometheus', 'czo', '') + $other_hosts = split(zulipconf('prometheus', 'other_hosts', ''), ',') + $backup_buckets = split(zulipconf('prometheus', 'walg_buckets', ''), ',') file { '/etc/prometheus/prometheus.yaml': - ensure => file, - owner => 'root', - group => 'root', - mode => '0644', - source => 'puppet:///modules/kandra/prometheus/prometheus.yaml', - notify => Service[supervisor], + ensure => file, + owner => 'root', + group => 'root', + mode => '0644', + content => template('kandra/prometheus/prometheus.yaml.template.erb'), + notify => Service[supervisor], } file { "${zulip::common::supervisor_conf_dir}/prometheus.conf": diff --git a/puppet/kandra/templates/prometheus/prometheus.yaml.template.erb b/puppet/kandra/templates/prometheus/prometheus.yaml.template.erb new file mode 100644 index 0000000000..a421892fb1 --- /dev/null +++ b/puppet/kandra/templates/prometheus/prometheus.yaml.template.erb @@ -0,0 +1,263 @@ +global: + # Set the scrape interval to every 15 seconds. 
Default is every 1 minute. + scrape_interval: 15s + # Evaluate rules every 15 seconds. The default is every 1 minute. + evaluation_interval: 15s + +scrape_configs: + - job_name: "prometheus" + static_configs: + - targets: ["localhost:9090"] + + - job_name: "grafana" + static_configs: + - targets: ["localhost:3000"] + + - job_name: "node" + ec2_sd_configs: + - region: us-east-1 + port: 9100 + refresh_interval: 1m + filters: + - name: instance-state-name + values: ["running"] + static_configs: + - targets: ["<%= @czo %>:9100"] + labels: + role: prod + instance: <%= @czo %> +<% unless @other_hosts.empty? -%> + - targets: +<% @other_hosts.each do |host| -%> + - <%= host %>:9100 +<% end -%> +<% end -%> + relabel_configs: + - source_labels: ["__address__"] + regex: "([^.]+).*" + target_label: "instance" + - source_labels: ["__address__"] + regex: "([^.-]+).*" + target_label: "role" + + - source_labels: ["__meta_ec2_tag_Name"] + regex: "(.+)" + target_label: "instance" + - source_labels: ["__meta_ec2_tag_role"] + regex: "(.+)" + target_label: "role" + + - job_name: "camo" + ec2_sd_configs: + - region: us-east-1 + port: 9292 + filters: + - name: "tag:role" + values: ["smokescreen"] + - name: instance-state-name + values: ["running"] + static_configs: + - targets: ["<%= @czo %>:9292"] + labels: + deploy: prod + instance: <%= @czo %> + relabel_configs: + - source_labels: ["__meta_ec2_tag_Name"] + regex: "(.+)" + target_label: "instance" + + - job_name: "smokescreen" + ec2_sd_configs: + - region: us-east-1 + port: 9810 + filters: + - name: "tag:role" + values: ["smokescreen"] + - name: instance-state-name + values: ["running"] + static_configs: + - targets: ["<%= @czo %>:9810"] + labels: + deploy: prod + instance: <%= @czo %> + relabel_configs: + - source_labels: ["__meta_ec2_tag_Name"] + regex: "(.+)" + target_label: "instance" + + - job_name: "uwsgi" + ec2_sd_configs: + - region: us-east-1 + port: 9238 + filters: + - name: "tag:role" + values: ["prod_app_frontend", 
"staging_app_frontend"] + - name: instance-state-name + values: ["running"] + static_configs: + - targets: ["<%= @czo %>:9238"] + labels: + deploy: prod + instance: <%= @czo %> + relabel_configs: + - source_labels: ["__meta_ec2_tag_Name"] + regex: "(.+)" + target_label: "instance" + - source_labels: ["__meta_ec2_tag_role"] + regex: "(prod|staging)_app_frontend" + replacement: "${1}" + target_label: "deploy" + + - job_name: "rabbitmq" + ec2_sd_configs: + - region: us-east-1 + port: 15692 + filters: + - name: "tag:role" + values: ["prod_app_frontend", "staging_app_frontend"] + - name: instance-state-name + values: ["running"] + static_configs: + - targets: ["<%= @czo %>:15692"] + labels: + deploy: prod + instance: <%= @czo %> + relabel_configs: + - target_label: __metrics_path__ + replacement: "/metrics/per-object" + - source_labels: ["__meta_ec2_tag_Name"] + regex: "(.+)" + target_label: "instance" + - source_labels: ["__meta_ec2_tag_role"] + regex: "(prod|staging)_app_frontend" + replacement: "${1}" + target_label: "deploy" + + - job_name: "tornado" + ec2_sd_configs: + - region: us-east-1 + port: 9256 + filters: + - name: "tag:role" + values: ["prod_app_frontend", "staging_app_frontend"] + - name: instance-state-name + values: ["running"] + static_configs: + - targets: ["<%= @czo %>:9256"] + labels: + deploy: prod + instance: <%= @czo %> + relabel_configs: + - source_labels: ["__meta_ec2_tag_Name"] + regex: "(.+)" + target_label: "instance" + - source_labels: ["__meta_ec2_tag_role"] + regex: "(prod|staging)_app_frontend" + replacement: "${1}" + target_label: "deploy" + + - job_name: "redis" + ec2_sd_configs: + - region: us-east-1 + port: 9121 + filters: + - name: "tag:role" + values: ["redis"] + - name: instance-state-name + values: ["running"] + static_configs: + - targets: ["<%= @czo %>:9121"] + labels: + deploy: prod + instance: <%= @czo %> + relabel_configs: + - source_labels: ["__meta_ec2_tag_Name"] + regex: "(.+)" + target_label: "instance" + + - job_name: 
"postgres" + ec2_sd_configs: + - region: us-east-1 + port: 9187 + filters: + - name: "tag:role" + values: ["postgresql"] + - name: instance-state-name + values: ["running"] + static_configs: + - targets: ["<%= @czo %>:9187"] + labels: + deploy: prod + instance: <%= @czo %> + relabel_configs: + - source_labels: ["__meta_ec2_tag_Name"] + regex: "(.+)" + target_label: "instance" + + - job_name: "memcached" + static_configs: + - targets: ["<%= @czo %>:11212"] + labels: + deploy: prod + instance: <%= @czo %> + ec2_sd_configs: + - region: us-east-1 + port: 11212 + filters: + - name: "tag:role" + values: ["prod_app_frontend", "staging_app_frontend"] + - name: instance-state-name + values: ["running"] + relabel_configs: + - source_labels: ["__meta_ec2_tag_Name"] + regex: "(.+)" + target_label: "instance" + - source_labels: ["__meta_ec2_tag_role"] + regex: "(prod|staging)_app_frontend" + replacement: "${1}" + target_label: "deploy" + + - job_name: "tusd" + static_configs: + - targets: ["<%= @czo %>:9900"] + labels: + deploy: prod + instance: <%= @czo %> + ec2_sd_configs: + - region: us-east-1 + port: 9900 + filters: + - name: "tag:role" + values: ["prod_app_frontend", "staging_app_frontend"] + - name: instance-state-name + values: ["running"] + relabel_configs: + - source_labels: ["__meta_ec2_tag_Name"] + regex: "(.+)" + target_label: "instance" + - source_labels: ["__meta_ec2_tag_role"] + regex: "(prod|staging)_app_frontend" + replacement: "${1}" + target_label: "deploy" + + - job_name: "wal-g" + scrape_interval: 5m + scrape_timeout: 20s + static_configs: + - targets: +<% @backup_buckets.each do |bucket| -%> + - <%= bucket %> +<% end -%> + relabel_configs: + - source_labels: [__address__] + target_label: __param_bucket + - source_labels: [__param_bucket] + target_label: bucket + - target_label: __address__ + replacement: localhost:9188 + + - job_name: "vector" + scrape_interval: 30s + scrape_timeout: 3s + static_configs: + - targets: ["localhost:9081"]