From 801821cdad991a0c5a76f79bd204d921f3f56106 Mon Sep 17 00:00:00 2001 From: Ruslan Kabalin Date: Tue, 28 Jan 2014 16:03:28 +0000 Subject: [PATCH 001/258] Add pgbouncer_maxwait check Check how long the first (oldest) client in queue has been waiting. The suggested check is more comprehensive than pgb_pool_maxwait, it supports warning and critical time limits, exclude/include database options, output the details on affected clients in warning and critical states. --- check_postgres.pl | 132 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) diff --git a/check_postgres.pl b/check_postgres.pl index fae344f6..a6e492d1 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -198,6 +198,9 @@ package check_postgres; 'pgb-backends-msg' => q{$1 of $2 connections ($3%)}, 'pgb-backends-none' => q{No connections}, 'pgb-backends-users' => q{$1 for number of users must be a number or percentage}, + 'pgb-maxwait-msg' => q{longest wait: $1s}, + 'pgb-maxwait-nomatch'=> q{No matching rows were found}, + 'pgb-maxwait-skipped'=> q{No matching rows were found (skipped rows: $1)}, 'PID' => q{PID}, 'port' => q{port}, 'preptxn-none' => q{No prepared transactions found}, @@ -1167,6 +1170,7 @@ package check_postgres; pgb_pool_maxwait => [1, 'Check the current maximum wait time for client connections in pgbouncer pools.'], pgbouncer_backends => [0, 'Check how many clients are connected to pgbouncer compared to max_client_conn.'], pgbouncer_checksum => [0, 'Check that no pgbouncer settings have changed since the last check.'], + pgbouncer_maxwait => [0, 'Check how long the first (oldest) client in queue has been waiting.'], pgagent_jobs => [0, 'Check for no failed pgAgent jobs within a specified period of time.'], prepared_txns => [1, 'Checks number and age of prepared transactions.'], query_runtime => [0, 'Check how long a specific query takes to run.'], @@ -2012,6 +2016,9 @@ sub finishup { ## Check the current maximum wait time for client connections in 
pgbouncer pools check_pgb_pool('maxwait') if $action eq 'pgb_pool_maxwait'; +## Check how long the first (oldest) client in queue has been waiting. +check_pgbouncer_maxwait() if $action eq 'pgbouncer_maxwait'; + ## Check how many clients are connected to pgbouncer compared to max_client_conn. check_pgbouncer_backends() if $action eq 'pgbouncer_backends'; @@ -5630,6 +5637,107 @@ sub check_pgbouncer_checksum { } ## end of check_pgbouncer_checksum +sub check_pgbouncer_maxwait { + + ## Check how long the first (oldest) client in queue has waited, in + ## seconds. + ## Supports: Nagios, MRTG + ## Warning and critical are time limits - defaults to seconds + ## Valid units: s[econd], m[inute], h[our], d[ay] + ## All above may be written as plural as well (e.g. "2 hours") + ## Can also ignore databases with exclude and limit with include + + my $arg = shift || {}; + + my ($warning, $critical) = validate_range + ({ + type => 'time', + }); + + ## Ask pgbouncer for its per-pool statistics; the loop below reads the + ## database, user, cl_active, cl_waiting and maxwait columns of each row + $SQL = qq{SHOW POOLS}; + + my $info = run_command($SQL, { regex => qr{\d+}, emptyok => 1 } ); + + ## Default values for information gathered + my ($maxwait, $database, $user, $cl_active, $cl_waiting) = + (0,'?','?',0,0); + + for $db (@{$info->{db}}) { + + ## Walk the parsed rows and remember the pool with the largest maxwait; + ## rows filtered out by --include/--exclude are only counted + my $skipped = 0; + ROW: for my $r (@{$db->{slurp}}) { + + ## Apply --exclude and --include arguments to the database name + if (skip_item($r->{database})) { + $skipped++; + next ROW; + } + + ## Assign stats if we have a new winner + if ($r->{maxwait} > $maxwait) { + $database = $r->{database}; + $user = $r->{user}; + $cl_active = $r->{cl_active}; + $cl_waiting = $r->{cl_waiting}; + $maxwait = $r->{maxwait}; + } + } + + ## We don't really care why nothing matched as far as the final output, + ## but it's nice to report what we can (this branch is also taken when no row seen so far has maxwait above 0) + if ($database eq '?') { + $MRTG and 
do_mrtg({one => 0, msg => 'No rows'}); + $db->{perf} = "0;$warning;$critical"; + + if ($skipped) { + add_ok msg('pgb-maxwait-skipped', $skipped); + } + else { + add_ok msg('pgb-maxwait-nomatch', $maxwait); + } + return; + } + + ## Details on who the offender was + my $whodunit = sprintf q{%s:%s %s:%s cl_active:%s cl_waiting:%s}, + msg('database'), + $database, + msg('username'), + $user, + $cl_active, + $cl_waiting; + + $MRTG and do_mrtg({one => $maxwait, msg => "$whodunit"}); + + $db->{perf} .= sprintf q{'%s'=%s;%s;%s}, + $whodunit, + $maxwait, + $warning, + $critical; + + my $m = msg('pgb-maxwait-msg', $maxwait); + my $msg = sprintf '%s (%s)', $m, $whodunit; + + if (length $critical and $maxwait >= $critical) { + add_critical $msg; + } + elsif (length $warning and $maxwait >= $warning) { + add_warning $msg; + } + else { + add_ok $msg; + } + } + + return; + + +} ## end of check_pgbouncer_maxwait + sub check_pgbouncer_backends { ## Check the number of connections to pgbouncer compared to @@ -9126,6 +9234,30 @@ =head2 B checksum must be provided as the C<--mrtg> argument. The fourth line always gives the current checksum. +=head2 B<pgbouncer_maxwait> + +(C<symlink: check_postgres_pgbouncer_maxwait>) Checks how long the first +(oldest) client in the queue has been waiting, in seconds. If this starts +increasing, then the current pool of servers does not handle requests quickly +enough. The reason may be either an overloaded server or too small a +pool_size setting in the pgbouncer config file. Databases can be filtered by use +of the I<--include> and I<--exclude> options. See the L</"BASIC FILTERING"> +section for more details. The values of the I<--warning> and I<--critical> +options are units of time, and must be provided (no default). Valid units are +'seconds', 'minutes', 'hours', or 'days'. Each may be written singular or +abbreviated to just the first letter. If no units are given, the units are +assumed to be seconds. + +This action requires Postgres 8.3 or better. 
+ +Example 1: Give a critical if any client has been waiting in a pgbouncer queue for more than 10 +minutes: + + check_postgres_pgbouncer_maxwait -p 6432 -u pgbouncer --critical='10 minutes' + +For MRTG output, returns the longest time in seconds a client has been +waiting on the first line. The fourth line gives the database, user, and client counts of that pool. + =head2 B<pgagent_jobs> (C<symlink: check_postgres_pgagent_jobs>) Checks that all the pgAgent jobs From 0c19c9ca42277c8aae1883c54e2148c113cf0693 Mon Sep 17 00:00:00 2001 From: Christoph Moench-Tegeder Date: Wed, 7 Jan 2015 16:44:58 +0100 Subject: [PATCH 002/258] in check_bloat(), fix MINPAGES and MINIPAGES make sure the bloat check's minimum size requirements for tables and indexes match the documentation. --- check_postgres.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/check_postgres.pl b/check_postgres.pl index 5f78bbb8..cc9aaf91 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -3526,8 +3526,8 @@ sub check_bloat { ## Can also specify percentages ## Don't bother with tables or indexes unless they have at least this many bloated pages - my $MINPAGES = 0; - my $MINIPAGES = 10; + my $MINPAGES = 10; + my $MINIPAGES = 15; my $LIMIT = 10; if ($opt{perflimit}) { From 27ad631cfd13b270ba8ae61f8a8ebbd222c47790 Mon Sep 17 00:00:00 2001 From: Giles Westwood Date: Mon, 2 Mar 2015 16:21:59 +0000 Subject: [PATCH 003/258] adding an option to pre-populate the database list with all available databases, used for bloat check --- check_postgres.pl | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/check_postgres.pl b/check_postgres.pl index 5f78bbb8..3db33074 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -959,6 +959,9 @@ package check_postgres; 'critical=s', 'include=s@', 'exclude=s@', + 'alldb', + 'includedb=s@', + 'excludedb=s@', 'includeuser=s@', 'excludeuser=s@', @@ -1222,6 +1225,9 @@ package check_postgres; -c value, --critical=value the critical threshold, range depends on the action --include=name(s) items to 
specifically include (e.g. tables), depends on the action --exclude=name(s) items to specifically exclude (e.g. tables), depends on the action + --alldb list all postgres databases and run action over them + --excludedb=name regex filter for the alldb option to select only certain databases + --includedb=name regex filter for the alldb option to select only certain databases --includeuser=include objects owned by certain users --excludeuser=exclude objects owned by certain users @@ -1364,6 +1370,44 @@ sub msg_en { $opt{defaultdb} = $psql_version >= 8.0 ? 'postgres' : 'template1'; $opt{defaultdb} = 'pgbouncer' if $action =~ /^pgb/; +## If --alldb is set, run psql -l to list every database and store the comma-joined names in $opt{dbname}[0] +if (defined $opt{alldb}){ + + my $pg_port = $opt{defaultport}; + if ($opt{port}[0]){ + $pg_port = $opt{port}[0]; + } + my $psql_output = join(",", map /^([\w|-]+?)\|/, qx{$PSQL -A -l -t -p $pg_port }); + my $pg_db; + # optionally exclude or include each db (only the first --includedb/--excludedb regex is consulted) + my @psql_output_array = split(/,/, $psql_output); + for $pg_db (@psql_output_array) { + if (defined $opt{includedb}){ + if ($pg_db =~ /$opt{includedb}[0]/) { + # do nothing + } else { + # strip the database from the listing (needs a trailing comma, so the last entry is never removed) + $psql_output =~ s/($pg_db),//; + } + } + if (defined $opt{excludedb}){ + if ($pg_db =~ /$opt{excludedb}[0]/) { + # strip the database from the listing (needs a trailing comma, so the last entry is never removed) + $psql_output =~ s/($pg_db),//; + } else { + # do nothing + } + } + } + # strip out some dbs we're not interested in + $psql_output =~ s/(template0,)//; + $psql_output =~ s/(root,)//; + # pg8.4 -- NOTE(review): presumably removes stray ',:' fragments from that version's psql -l output; confirm + $psql_output =~ s/(,:)//g; + $opt{dbname}[0] = $psql_output; +} + + ## Check the current database mode our $STANDBY = 0; our $MASTER = 0; From 776408aab47f29e4f7f47bf4d515671d7736ab08 Mon Sep 17 00:00:00 2001 From: Greg Sabino Mullane Date: Tue, 23 Jun 2015 09:09:07 -0400 Subject: [PATCH 004/258] Version bump. 
--- check_postgres.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/check_postgres.pl b/check_postgres.pl index a7a2a767..df143576 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -32,7 +32,7 @@ package check_postgres; binmode STDOUT, ':encoding(UTF-8)'; -our $VERSION = '2.21.0'; +our $VERSION = '2.21.1'; use vars qw/ %opt $PGBINDIR $PSQL $res $COM $SQL $db /; From 51c6991862c9b24da0bfaa5917c1597db12ff495 Mon Sep 17 00:00:00 2001 From: Greg Sabino Mullane Date: Tue, 23 Jun 2015 09:16:26 -0400 Subject: [PATCH 005/258] Update release notes a bit --- check_postgres.pl | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/check_postgres.pl b/check_postgres.pl index df143576..42f1ab07 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -7974,7 +7974,7 @@ =head1 NAME B - a Postgres monitoring script for Nagios, MRTG, Cacti, and others -This documents describes check_postgres.pl version 2.21.0 +This documents describes check_postgres.pl version 2.21.1 =head1 SYNOPSIS @@ -9832,20 +9832,44 @@ =head1 HISTORY Add explicit ORDER BY to the slony_status check to get the most lagged server. (Jeff Frost) - Declare POD encoding to be utf8. (Christoph Berg) + Change the way tables are quoted in replicate_row. + (Glyn Astill) - Query all sequences per DB in parallel for action=sequence. (Christoph Berg) + Improved multi-slave support in replicate_row. + (Andrew Yochum) + + Add xact timestamp support to hot_standby_delay. + Allow the hot_standby_delay check to accept xlog byte position or + timestamp lag intervals as thresholds, or even both at the same time. + (Josh Williams) + + Fix and extend hot_standby_delay documentation + (Michael Renner) + + Don't swallow space before the -c flag when reporting errors + (Jeff Janes) + + Show actual long-running query in query_time output + (Peter Eisentraut) + + Declare POD encoding to be utf8. 
+ (Christoph Berg) + + Query all sequences per DB in parallel for action=sequence. + (Christoph Berg) =item B September 24, 2013 Fix issue with SQL steps in check_pgagent_jobs for sql steps which perform deletes (Rob Emery via github pull) - Install man page in section 1. (Peter Eisentraut, bug 53, github issue 26) + Install man page in section 1. + (Peter Eisentraut, bug 53, github issue 26) Order lock types in check_locks output to make the ordering predictable; setting SKIP_NETWORK_TESTS will skip the new_version tests; other minor test - suite fixes. (Christoph Berg) + suite fixes. + (Christoph Berg) Fix same_schema check on 9.3 by ignoring relminmxid differences in pg_class (Christoph Berg) From 791a17ff1b662b1b6d4d3e5261bfdc3aac313731 Mon Sep 17 00:00:00 2001 From: Christoph Berg Date: Tue, 23 Jun 2015 16:26:10 +0200 Subject: [PATCH 006/258] Fix t/02_sequence.t for PG 9.0/1 --- t/02_sequence.t | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t/02_sequence.t b/t/02_sequence.t index efce3a02..1111d895 100644 --- a/t/02_sequence.t +++ b/t/02_sequence.t @@ -118,7 +118,7 @@ if ($ver >= 90200) { like ($cp->run('--exclude=sequence_test_id_seq'), qr{WARNING:.+public.sequence_test_smallid_seq=92% \(calls left=2767\)}, $t); } else { SKIP: { - skip '"smallserial" needs PostgreSQL 9.2 or later', 2; + skip '"smallserial" needs PostgreSQL 9.2 or later', 1; } } From 60af9f27721d62219a5ef220feff02ae7e8b79b1 Mon Sep 17 00:00:00 2001 From: Christoph Berg Date: Tue, 23 Jun 2015 16:35:31 +0200 Subject: [PATCH 007/258] Fix txn_time regression test for 9.0/9.1 Newer versions will show the last or current query here, older versions will just show " in transaction" if there is currently no query running. 
--- t/02_txn_time.t | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/t/02_txn_time.t b/t/02_txn_time.t index 8be2fa18..d743fe1f 100644 --- a/t/02_txn_time.t +++ b/t/02_txn_time.t @@ -76,7 +76,8 @@ sleep(1); like ($cp->run(q{-w 0}), qr{longest txn: 1s}, $t); $t .= ' (MRTG)'; -like ($cp->run(q{--output=mrtg -w 0}), qr{\d+\n0\n\nPID:\d+ database:$dbname username:\w+ query:SELECT 1\n}, $t); +my $query_patten = ($ver >= 90200) ? "SELECT 1" : " in transaction"; +like ($cp->run(q{--output=mrtg -w 0}), qr{\d+\n0\n\nPID:\d+ database:$dbname username:\w+ query:$query_patten\n}, $t); $idle_dbh->commit; From efcff69ea7678ec264e814b3e515a09d1674db87 Mon Sep 17 00:00:00 2001 From: Greg Sabino Mullane Date: Fri, 26 Jun 2015 08:52:32 -0400 Subject: [PATCH 008/258] Rearrange recent changes in rough priority order --- check_postgres.pl | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/check_postgres.pl b/check_postgres.pl index 42f1ab07..6bb63d31 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -9826,38 +9826,38 @@ =head1 HISTORY =item B + Add xact timestamp support to hot_standby_delay. + Allow the hot_standby_delay check to accept xlog byte position or + timestamp lag intervals as thresholds, or even both at the same time. + (Josh Williams) + + Query all sequences per DB in parallel for action=sequence. + (Christoph Berg) + Fix bloat check to use correct SQL depending on the server version. (Adrian Vondendriesch) + Show actual long-running query in query_time output + (Peter Eisentraut) + Add explicit ORDER BY to the slony_status check to get the most lagged server. (Jeff Frost) - Change the way tables are quoted in replicate_row. - (Glyn Astill) - Improved multi-slave support in replicate_row. (Andrew Yochum) - Add xact timestamp support to hot_standby_delay. - Allow the hot_standby_delay check to accept xlog byte position or - timestamp lag intervals as thresholds, or even both at the same time. 
- (Josh Williams) - - Fix and extend hot_standby_delay documentation - (Michael Renner) + Change the way tables are quoted in replicate_row. + (Glyn Astill) Don't swallow space before the -c flag when reporting errors (Jeff Janes) - Show actual long-running query in query_time output - (Peter Eisentraut) + Fix and extend hot_standby_delay documentation + (Michael Renner) Declare POD encoding to be utf8. (Christoph Berg) - Query all sequences per DB in parallel for action=sequence. - (Christoph Berg) - =item B September 24, 2013 Fix issue with SQL steps in check_pgagent_jobs for sql steps which perform deletes From d2412d5125ac991252f09ddf0aac17674feec10d Mon Sep 17 00:00:00 2001 From: Greg Sabino Mullane Date: Fri, 26 Jun 2015 09:18:02 -0400 Subject: [PATCH 009/258] Spelling --- check_postgres.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/check_postgres.pl b/check_postgres.pl index 6bb63d31..2fd53ec4 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -8857,7 +8857,7 @@ =head2 B to it. The slave server must be in hot_standby (e.g. read only) mode, therefore the minimum version to use this action is Postgres 9.0. The I<--warning> and I<--critical> options are the delta between the xlog locations. Since these values are byte offsets in the WAL they should match the expected transaction volume -of your application to prevent false postives or negatives. +of your application to prevent false positives or negatives. The first "--dbname", "--host", and "--port", etc. options are considered the master; the second belongs to the slave. @@ -8869,7 +8869,7 @@ =head2 B form 'I and I