mirror of
https://github.com/zulip/zulip.git
synced 2025-11-17 04:12:02 +00:00
puppet: Import pagerduty_nagios.pl into puppet.
(imported from commit 1b91524498372d3e69f07468e4635c4d66c44d85)
This commit is contained in:
280
servers/puppet/modules/humbug/files/pagerduty_nagios.pl
Executable file
280
servers/puppet/modules/humbug/files/pagerduty_nagios.pl
Executable file
@@ -0,0 +1,280 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
|
||||
# Nagios plugin that sends Nagios events to PagerDuty.
|
||||
#
|
||||
# Copyright (c) 2011, PagerDuty, Inc. <info@pagerduty.com>
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
# * Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# * Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
# * Neither the name of PagerDuty Inc nor the
|
||||
# names of its contributors may be used to endorse or promote products
|
||||
# derived from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL PAGERDUTY INC BE LIABLE FOR ANY
|
||||
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
use Pod::Usage;
|
||||
use Getopt::Long;
|
||||
use Sys::Syslog;
|
||||
use HTTP::Request::Common qw(POST);
|
||||
use HTTP::Status qw(is_client_error);
|
||||
use LWP::UserAgent;
|
||||
use File::Path;
|
||||
use Fcntl qw(:flock);
|
||||
|
||||
|
||||
=head1 NAME
|
||||
|
||||
pagerduty_nagios -- Send Nagios events to the PagerDuty alert system
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
pagerduty_nagios enqueue [options]
|
||||
|
||||
pagerduty_nagios flush [options]
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This script passes events from Nagios to the PagerDuty alert system. It's
|
||||
meant to be run as a Nagios notification plugin. For more details, please see
|
||||
the PagerDuty Nagios integration docs at:
|
||||
http://www.pagerduty.com/docs/nagios-integration.
|
||||
|
||||
When called in the "enqueue" mode, the script loads a Nagios notification out
|
||||
of the environment and into the event queue. It then tries to flush the
|
||||
queue by sending any enqueued events to the PagerDuty server. The script is
|
||||
typically invoked in this mode from a Nagios notification handler.
|
||||
|
||||
When called in the "flush" mode, the script simply tries to send any enqueued
|
||||
events to the PagerDuty server. This mode is typically invoked by cron. The
|
||||
purpose of this mode is to retry any events that couldn't be sent to the
|
||||
PagerDuty server for whatever reason when they were initially enqueued.
|
||||
|
||||
=head1 OPTIONS
|
||||
|
||||
--api-base URL
|
||||
The base URL used to communicate with PagerDuty. The default option here
|
||||
should be fine, but adjusting it may make sense if your firewall doesn't
|
||||
pass HTTPS traffic for some reason. See the PagerDuty Nagios integration
|
||||
docs for details.
|
||||
|
||||
--field KEY=VALUE
|
||||
Add this key-value pair to the event being passed to PagerDuty. The script
|
||||
automatically gathers Nagios macros out of the environment, so there's no
|
||||
need to specify these explicitly. This option can be repeated as many
|
||||
times as necessary to pass multiple key-value pairs. This option is only
|
||||
useful when an event is being enqueued.0
|
||||
|
||||
--help
|
||||
Display documentation for the script.
|
||||
|
||||
--queue-dir DIR
|
||||
Path to the directory to use to store the event queue. By default, we use
|
||||
/tmp/pagerduty_nagios.
|
||||
|
||||
--verbose
|
||||
Turn on extra debugging information. Useful for debugging.
|
||||
|
||||
=cut
|
||||
|
||||
# This release tested on:
|
||||
# Debian Sarge (Perl 5.8.4)
|
||||
# Ubuntu 9.04 (Perl 5.10.0)
|
||||
|
||||
my $opt_api_base = "https://events.pagerduty.com/nagios/2010-04-15";
|
||||
my %opt_fields;
|
||||
my $opt_help;
|
||||
my $opt_queue_dir = "/tmp/pagerduty_nagios";
|
||||
my $opt_verbose;
|
||||
|
||||
|
||||
sub get_queue_from_dir {
|
||||
my $dh;
|
||||
|
||||
unless (opendir($dh, $opt_queue_dir)) {
|
||||
syslog(LOG_ERR, "opendir %s failed: %s", $opt_queue_dir, $!);
|
||||
die $!;
|
||||
}
|
||||
|
||||
my @files;
|
||||
while (my $f = readdir($dh)) {
|
||||
next unless $f =~ /^pd_(\d+)_\d+\.txt$/;
|
||||
push @files, [int($1), $f];
|
||||
}
|
||||
|
||||
closedir($dh);
|
||||
|
||||
@files = sort { @{$a}[0] <=> @{$b}[0] } @files;
|
||||
return map { @{$_}[1] } @files;
|
||||
}
|
||||
|
||||
|
||||
sub flush_queue {
|
||||
my @files = get_queue_from_dir();
|
||||
my $ua = LWP::UserAgent->new;
|
||||
|
||||
# It's not a big deal if we don't get the message through the first time.
|
||||
# It will get sent the next time cron fires.
|
||||
$ua->timeout(15);
|
||||
|
||||
foreach (@files) {
|
||||
my $filename = "$opt_queue_dir/$_";
|
||||
my $fd;
|
||||
my %event;
|
||||
|
||||
print STDERR "==== Now processing: $filename\n" if $opt_verbose;
|
||||
|
||||
unless (open($fd, "<", $filename)) {
|
||||
syslog(LOG_ERR, "open %s for read failed: %s", $filename, $!);
|
||||
die $!;
|
||||
}
|
||||
|
||||
while (<$fd>) {
|
||||
chomp;
|
||||
my @fields = split("=", $_, 2);
|
||||
$event{$fields[0]} = $fields[1];
|
||||
}
|
||||
|
||||
close($fd);
|
||||
|
||||
my $req = POST("$opt_api_base/create_event", \%event);
|
||||
|
||||
if ($opt_verbose) {
|
||||
my $s = $req->as_string;
|
||||
print STDERR "Request:\n$s\n";
|
||||
}
|
||||
|
||||
my $resp = $ua->request($req);
|
||||
|
||||
if ($opt_verbose) {
|
||||
my $s = $resp->as_string;
|
||||
print STDERR "Response:\n$s\n";
|
||||
}
|
||||
|
||||
if ($resp->is_success) {
|
||||
syslog(LOG_INFO, "Nagios event in file %s ACCEPTED by the PagerDuty server.", $filename);
|
||||
unlink($filename);
|
||||
}
|
||||
elsif (is_client_error($resp->code)) {
|
||||
syslog(LOG_WARNING, "Nagios event in file %s REJECTED by the PagerDuty server. Server says: %s", $filename, $resp->content);
|
||||
unlink($filename);
|
||||
}
|
||||
else {
|
||||
# Something else went wrong.
|
||||
syslog(LOG_WARNING, "Nagios event in file %s DEFERRED due to network/server problems.", $filename);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
# Everything that needed to be sent was sent.
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
sub lock_and_flush_queue {
|
||||
# Serialize access to the queue directory while we flush.
|
||||
# (We don't want more than one flush at once.)
|
||||
|
||||
my $lock_filename = "$opt_queue_dir/lockfile";
|
||||
my $lock_fd;
|
||||
|
||||
unless (open($lock_fd, ">", $lock_filename)) {
|
||||
syslog(LOG_ERR, "open %s for write failed: %s", $lock_filename, $!);
|
||||
die $!;
|
||||
}
|
||||
|
||||
unless (flock($lock_fd, LOCK_EX)) {
|
||||
syslog(LOG_ERR, "flock %s failed: %s", $lock_filename, $!);
|
||||
die $!;
|
||||
}
|
||||
|
||||
my $ret = flush_queue();
|
||||
|
||||
close($lock_fd);
|
||||
|
||||
return $ret;
|
||||
}
|
||||
|
||||
|
||||
sub enqueue_event {
|
||||
my %event;
|
||||
|
||||
# Scoop all the Nagios related stuff out of the environment.
|
||||
while ((my $k, my $v) = each %ENV) {
|
||||
next unless $k =~ /^NAGIOS_(.*)$/;
|
||||
$event{$1} = $v;
|
||||
}
|
||||
|
||||
# Apply any other variables that were passed in.
|
||||
%event = (%event, %opt_fields);
|
||||
|
||||
$event{"pd_version"} = "1.0";
|
||||
|
||||
# Right off the bat, enqueue the event. Nothing tiem consuming should come
|
||||
# before here (i.e. no locks or remote connections), because we want to
|
||||
# make sure we get the event written out within the Nagios notification
|
||||
# timeout. If we get killed off after that, it isn't a big deal.
|
||||
|
||||
my $filename = sprintf("$opt_queue_dir/pd_%u_%u.txt", time(), $$);
|
||||
my $fd;
|
||||
|
||||
unless (open($fd, ">", $filename)) {
|
||||
syslog(LOG_ERR, "open %s for write failed: %s", $filename, $!);
|
||||
die $!;
|
||||
}
|
||||
|
||||
while ((my $k, my $v) = each %event) {
|
||||
# "=" can't occur in the keyname, and "\n" can't occur anywhere.
|
||||
# (Nagios follows this already, so I think we're safe)
|
||||
print $fd "$k=$v\n";
|
||||
}
|
||||
|
||||
close($fd);
|
||||
}
|
||||
|
||||
###########
|
||||
|
||||
GetOptions("api-base=s" => \$opt_api_base,
|
||||
"field=s%" => \%opt_fields,
|
||||
"help" => \$opt_help,
|
||||
"queue-dir=s" => \$opt_queue_dir,
|
||||
"verbose" => \$opt_verbose
|
||||
) || pod2usage(2);
|
||||
|
||||
pod2usage(2) if @ARGV < 1 ||
|
||||
(($ARGV[0] ne "enqueue") && ($ARGV[0] ne "flush"));
|
||||
|
||||
pod2usage(-verbose => 3) if $opt_help;
|
||||
|
||||
my @log_mode = ("nofatal", "pid");
|
||||
push(@log_mode, "perror") if $opt_verbose;
|
||||
|
||||
openlog("pagerduty_nagios", join(",", @log_mode), LOG_LOCAL0);
|
||||
|
||||
# This function automatically terminates the program on things like permission
|
||||
# errors.
|
||||
mkpath($opt_queue_dir);
|
||||
|
||||
if ($ARGV[0] eq "enqueue") {
|
||||
enqueue_event();
|
||||
lock_and_flush_queue();
|
||||
}
|
||||
elsif ($ARGV[0] eq "flush") {
|
||||
lock_and_flush_queue();
|
||||
}
|
||||
Reference in New Issue
Block a user