From 913fab7bf03746d7958b5c0630683c6a2965b5ea Mon Sep 17 00:00:00 2001 From: Michael Renner Date: Wed, 3 Apr 2013 12:40:39 +0200 Subject: [PATCH 001/284] bloat query: fix totalwastedbytes totalwastedbytes returned only the number of wasted pages - fix that by multiplying with the blocksize --- check_postgres.pl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/check_postgres.pl b/check_postgres.pl index a3078f40..246025e9 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -3531,9 +3531,9 @@ sub check_bloat { CASE WHEN ipages < iotta THEN 0 ELSE bs*(ipages-iotta) END AS wastedibytes, CASE WHEN ipages < iotta THEN '0 bytes' ELSE (bs*(ipages-iotta))::bigint || ' bytes' END AS wastedisize, CASE WHEN relpages < otta THEN - CASE WHEN ipages < iotta THEN 0 ELSE ipages-iotta::bigint END - ELSE CASE WHEN ipages < iotta THEN relpages-otta::bigint - ELSE relpages-otta::bigint + ipages-iotta::bigint END + CASE WHEN ipages < iotta THEN 0 ELSE bs*(ipages-iotta::bigint) END + ELSE CASE WHEN ipages < iotta THEN bs*(relpages-otta::bigint) + ELSE bs*(relpages-otta::bigint + ipages-iotta::bigint) END END AS totalwastedbytes FROM ( SELECT From 801821cdad991a0c5a76f79bd204d921f3f56106 Mon Sep 17 00:00:00 2001 From: Ruslan Kabalin Date: Tue, 28 Jan 2014 16:03:28 +0000 Subject: [PATCH 002/284] Add pgbouncer_maxwait check Check how long the first (oldest) client in queue has been waiting. The suggested check is more comprehensive than pgb_pool_maxwait, it supports warning and critical time limits, exclude/include database options, output the details on affected clients in warning and critical states. 
--- check_postgres.pl | 132 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) diff --git a/check_postgres.pl b/check_postgres.pl index fae344f6..a6e492d1 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -198,6 +198,9 @@ package check_postgres; 'pgb-backends-msg' => q{$1 of $2 connections ($3%)}, 'pgb-backends-none' => q{No connections}, 'pgb-backends-users' => q{$1 for number of users must be a number or percentage}, + 'pgb-maxwait-msg' => q{longest wait: $1s}, + 'pgb-maxwait-nomatch'=> q{No matching rows were found}, + 'pgb-maxwait-skipped'=> q{No matching rows were found (skipped rows: $1)}, 'PID' => q{PID}, 'port' => q{port}, 'preptxn-none' => q{No prepared transactions found}, @@ -1167,6 +1170,7 @@ package check_postgres; pgb_pool_maxwait => [1, 'Check the current maximum wait time for client connections in pgbouncer pools.'], pgbouncer_backends => [0, 'Check how many clients are connected to pgbouncer compared to max_client_conn.'], pgbouncer_checksum => [0, 'Check that no pgbouncer settings have changed since the last check.'], + pgbouncer_maxwait => [0, 'Check how long the first (oldest) client in queue has been waiting.'], pgagent_jobs => [0, 'Check for no failed pgAgent jobs within a specified period of time.'], prepared_txns => [1, 'Checks number and age of prepared transactions.'], query_runtime => [0, 'Check how long a specific query takes to run.'], @@ -2012,6 +2016,9 @@ sub finishup { ## Check the current maximum wait time for client connections in pgbouncer pools check_pgb_pool('maxwait') if $action eq 'pgb_pool_maxwait'; +## Check how long the first (oldest) client in queue has been waiting. +check_pgbouncer_maxwait() if $action eq 'pgbouncer_maxwait'; + ## Check how many clients are connected to pgbouncer compared to max_client_conn. 
check_pgbouncer_backends() if $action eq 'pgbouncer_backends'; @@ -5630,6 +5637,107 @@ sub check_pgbouncer_checksum { } ## end of check_pgbouncer_checksum +sub check_pgbouncer_maxwait { + + ## Check how long the first (oldest) client in queue has waited, in + ## seconds. + ## Supports: Nagios, MRTG + ## Warning and critical are time limits - defaults to seconds + ## Valid units: s[econd], m[inute], h[our], d[ay] + ## All above may be written as plural as well (e.g. "2 hours") + ## Can also ignore databases with exclude and limit with include + + my $arg = shift || {}; + + my ($warning, $critical) = validate_range + ({ + type => 'time', + }); + + ## Grab the per-pool statistics from pgbouncer with SHOW POOLS + ## There is no ORDER BY: on a maxwait "tie" the first row seen stays the winner + $SQL = qq{SHOW POOLS}; + + my $info = run_command($SQL, { regex => qr{\d+}, emptyok => 1 } ); + + ## Default values for information gathered (declared once, so shared by every pass of the loop below) + my ($maxwait, $database, $user, $cl_active, $cl_waiting) = + (0,'?','?',0,0); + + for $db (@{$info->{db}}) { + + ## Walk the parsed rows and gather stats from the winning row, + ## i.e. the non-skipped row with the largest maxwait seen so far + my $skipped = 0; + ROW: for my $r (@{$db->{slurp}}) { + + ## Apply --exclude and --include arguments to the database name + if (skip_item($r->{database})) { + $skipped++; + next ROW; + } + + ## Assign stats if we have a new winner (strictly greater maxwait only) + if ($r->{maxwait} > $maxwait) { + $database = $r->{database}; + $user = $r->{user}; + $cl_active = $r->{cl_active}; + $cl_waiting = $r->{cl_waiting}; + $maxwait = $r->{maxwait}; + } + } + + ## We don't really care why nothing matched as far as the final output + ## But it's nice to report what we can. NOTE(review): the return below leaves the sub from inside the per-database loop + if ($database eq '?') { + $MRTG and do_mrtg({one => 0, msg => 'No rows'}); + $db->{perf} = "0;$warning;$critical"; + + if ($skipped) { + add_ok msg('pgb-maxwait-skipped', $skipped); + } + else { + add_ok msg('pgb-maxwait-nomatch', $maxwait); + } + return; + } + + ## Details on who the offender was + my $whodunit = sprintf 
q{%s:%s %s:%s cl_active:%s cl_waiting:%s}, + msg('database'), + $database, + msg('username'), + $user, + $cl_active, + $cl_waiting; + + $MRTG and do_mrtg({one => $maxwait, msg => "$whodunit"}); + + $db->{perf} .= sprintf q{'%s'=%s;%s;%s}, + $whodunit, + $maxwait, + $warning, + $critical; + + my $m = msg('pgb-maxwait-msg', $maxwait); + my $msg = sprintf '%s (%s)', $m, $whodunit; + + if (length $critical and $maxwait >= $critical) { + add_critical $msg; + } + elsif (length $warning and $maxwait >= $warning) { + add_warning $msg; + } + else { + add_ok $msg; + } + } + + return; + + +} ## end of check_pgbouncer_maxwait + sub check_pgbouncer_backends { ## Check the number of connections to pgbouncer compared to @@ -9126,6 +9234,30 @@ =head2 B checksum must be provided as the C<--mrtg> argument. The fourth line always gives the current checksum. +=head2 B<pgbouncer_maxwait> + +(C<symlink: check_postgres_pgbouncer_maxwait>) Checks how long the first +(oldest) client in the queue has been waiting, in seconds. If this starts +increasing, then the current pool of servers does not handle requests quickly +enough. The reason may be either an overloaded server or just too small of a +pool_size setting in the pgbouncer config file. Databases can be filtered by use +of the I<--include> and I<--exclude> options. See the L</"BASIC FILTERING"> +section for more details. The values for the I<--warning> and I<--critical> +options are units of time, and must be provided (no default). Valid units are +'seconds', 'minutes', 'hours', or 'days'. Each may be written singular or +abbreviated to just the first letter. If no units are given, the units are +assumed to be seconds. + +This action requires Postgres 8.3 or better. + +Example 1: Give a critical if any client has been waiting for more than 10 +minutes: + + check_postgres_pgbouncer_maxwait -p 6432 -u pgbouncer --critical='10 minutes' + +For MRTG output, returns the longest client wait time in seconds +on the first line. The fourth line gives the database, user name, cl_active and cl_waiting of that pool. 
+ =head2 B (C) Checks that all the pgAgent jobs From ff6e828dc4346194c007a79fbee98b6c0f9268d1 Mon Sep 17 00:00:00 2001 From: Greg Sabino Mullane Date: Mon, 5 May 2014 11:50:39 -0400 Subject: [PATCH 003/284] Quick TODO item --- TODO | 2 ++ 1 file changed, 2 insertions(+) diff --git a/TODO b/TODO index bd0e61c0..57015505 100644 --- a/TODO +++ b/TODO @@ -2,6 +2,8 @@ Quick list of outstanding items / bugs / feature requests for CP: NOTE: All bugzilla items are now on github +* Fix up references to dbhost2 and the like, or make them work again for backwards compatibility. + * The same_schema action does not check indexes. See bugzilla #54 * Perform automatic creation of views and function to allow all actions to be run From 13597d9ebcec190d9cc97be0dcfee8cfa904ecd8 Mon Sep 17 00:00:00 2001 From: Jeff Janes Date: Wed, 21 May 2014 10:48:01 -0700 Subject: [PATCH 004/284] Don't swallow space before the -c flag when reporting errors --- check_postgres.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/check_postgres.pl b/check_postgres.pl index d3c55815..3e3b5258 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -2483,7 +2483,7 @@ sub run_command { warn "Action: $action\n"; warn "Calling line: $cline\n"; warn "Output: $line\n"; - $args =~ s/ -c (.+)/-c "$1"/s; + $args =~ s/ -c (.+)/ -c "$1"/s; warn "Command: $PSQL $args\n"; ## Last thing is to see if we can grab the PG version if (! $opt{stop_looping}) { From 2aa4a119cc52b33d2ae226640a617c4ddb582c9e Mon Sep 17 00:00:00 2001 From: Michael Renner Date: Wed, 11 Jun 2014 17:10:33 +0200 Subject: [PATCH 005/284] fix & extend hot_standby_delay documentation This should also account for all places where the refactoring of the numbered dbparameters (host2, port2, etc.) was referenced in the documentation. Drop the TODO entry. 
--- TODO | 2 -- check_postgres.pl | 17 ++++++++--------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/TODO b/TODO index 57015505..bd0e61c0 100644 --- a/TODO +++ b/TODO @@ -2,8 +2,6 @@ Quick list of outstanding items / bugs / feature requests for CP: NOTE: All bugzilla items are now on github -* Fix up references to dbhost2 and the like, or make them work again for backwards compatibility. - * The same_schema action does not check indexes. See bugzilla #54 * Perform automatic creation of views and function to allow all actions to be run diff --git a/check_postgres.pl b/check_postgres.pl index d3c55815..0314c559 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -8812,15 +8812,14 @@ =head2 B =head2 B (C) Checks the streaming replication lag by computing the delta -between the xlog position of a master server and the one of the slaves connected to it. The slave_ -server must be in hot_standby (e.g. read only) mode, therefore the minimum version to use this_ -action is Postgres 9.0. The I<--warning> and I<--critical> options are the delta between xlog -location. These values should match the volume of transactions needed to have the streaming -replication disconnect from the master because of too much lag. - -You must provide information on how to reach the second database by a connection -parameter ending in the number 2, such as "--dbport2=5543". If if it not given, -the action fails. +between the current xlog position of a master server and the replay location of a slave connected +to it. The slave server must be in hot_standby (e.g. read only) mode, therefore the minimum version to use +this action is Postgres 9.0. The I<--warning> and I<--critical> options are the delta between the xlog +locations. Since these values are byte offsets in the WAL they should match the expected transaction volume +of your application to prevent false positives or negatives. + +The first "--dbname", "--host", and "--port", etc. 
options are considered the +master; the second belongs to the slave. =head2 B From f958d61e84212de92079401fbc29a183850cce73 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Wed, 11 Jun 2014 16:04:38 -0400 Subject: [PATCH 006/284] Show actual long-running query in query_time output --- check_postgres.pl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/check_postgres.pl b/check_postgres.pl index d3c55815..32bf11c5 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -209,6 +209,7 @@ package check_postgres; 'qtime-for-msg' => q{$1 queries longer than $2s, longest: $3s$4 $5}, 'qtime-msg' => q{longest query: $1s$2 $3}, 'qtime-none' => q{no queries}, + 'query' => q{query}, 'queries' => q{queries}, 'query-time' => q{query_time}, 'range-badcs' => q{Invalid '$1' option: must be a checksum}, @@ -461,6 +462,7 @@ package check_postgres; 'qtime-for-msg' => q{$1 requêtes plus longues que $2s, requête la plus longue : $3s$4 $5}, 'qtime-msg' => q{requête la plus longue : $1s$2 $3}, 'qtime-none' => q{aucune requête}, + 'query' => q{requête}, 'queries' => q{requêtes}, 'query-time' => q{durée de la requête}, 'range-badcs' => q{Option « $1 » invalide : doit être une somme de contrôle}, @@ -7661,13 +7663,14 @@ sub check_txn_idle { ## Details on who the top offender was if ($max > 0) { - $whodunit = sprintf q{%s:%s %s:%s %s:%s%s%s}, + $whodunit = sprintf q{%s:%s %s:%s %s:%s%s%s %s:%s}, msg('PID'), $maxr->{pid}, msg('database'), $maxr->{datname}, msg('username'), $maxr->{usename}, $maxr->{client_addr} eq '' ? '' : (sprintf ' %s:%s', msg('address'), $maxr->{client_addr}), ($maxr->{client_port} eq '' or $maxr->{client_port} < 1) - ? '' : (sprintf ' %s:%s', msg('port'), $maxr->{client_port}); + ? 
'' : (sprintf ' %s:%s', msg('port'), $maxr->{client_port}), + msg('query'), $maxr->{query} || $maxr->{current_query}; } ## For MRTG, we can simply exit right now From 6b765c839eaf80499f68d412a897f61f11db9bfc Mon Sep 17 00:00:00 2001 From: Josh Williams Date: Thu, 12 Jun 2014 17:24:24 -0400 Subject: [PATCH 007/284] Add xact timestamp support to hot_standby_delay Allow the hot_standby_delay check to accept xlog byte position or timestamp lag intervals as thresholds, or even both at the same time. --- check_postgres.pl | 84 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 67 insertions(+), 17 deletions(-) diff --git a/check_postgres.pl b/check_postgres.pl index d3c55815..c5f7f73d 100755 --- a/check_postgres.pl +++ b/check_postgres.pl @@ -144,10 +144,13 @@ package check_postgres; 'fsm-page-msg' => q{fsm page slots used: $1 of $2 ($3%)}, 'fsm-rel-highver' => q{Cannot check fsm_relations on servers version 8.4 or greater}, 'fsm-rel-msg' => q{fsm relations used: $1 of $2 ($3%)}, + 'hs-future-replica' => q{Slave reporting master server clock is ahead, check time sync}, 'hs-no-role' => q{Not a master/slave couple}, 'hs-no-location' => q{Could not get current xlog location on $1}, 'hs-receive-delay' => q{receive-delay}, 'hs-replay-delay' => q{replay_delay}, + 'hs-time-delay' => q{time_delay}, + 'hs-time-version' => q{Database must be version 9.1 or higher to check slave lag by time}, 'index' => q{Index}, 'invalid-option' => q{Invalid option}, 'invalid-query' => q{Invalid query returned: $1}, @@ -3108,6 +3111,9 @@ sub validate_size_or_percent_with_oper { sub validate_integer_for_time { + # Used for txn_idle and hot_standby_delay + # txn_idle, et. 
al, use the form "$count for $interval" + # hot_standby_delay appears as "$bytes and $interval" my $arg = shift || {}; ndie qq{validate_integer_for_time must be called with a hashref\n} @@ -3123,7 +3129,7 @@ sub validate_integer_for_time { for my $spec ([ warning => $warning], [critical => $critical]) { my ($level, $val) = @{ $spec }; if (length $val) { - if ($val =~ /^(.+?)\sfor\s(.+)$/i) { + if ($val =~ /^(.+?)\s(?:for|and)\s(.+)$/i) { my ($int, $time) = ($1, $2); # Integer first, time second. @@ -3137,7 +3143,7 @@ sub validate_integer_for_time { } else { # Disambiguate int from time int by sign. - if ($val =~ /^[-+]\d+$/) { + if (($val =~ /^[-+]\d+$/) || ($val =~ /^\d+$/ && $arg->{default_to_int})) { ndie msg('range-int', $level) if $val !~ /^[-+]?\d+$/; push @ret, int $val, ''; } @@ -4741,9 +4747,17 @@ sub check_hot_standby_delay { ## Check on the delay in PITR replication between master and slave ## Supports: Nagios, MRTG ## Critical and warning are the delay between master and slave xlog locations - ## Example: --critical=1024 + ## and/or transaction timestamps. If both are specified, both are checked. 
+ ## Examples: + ## --critical=1024 + ## --warning=5min + ## --warning='1048576 and 2min' --critical='16777216 and 10min' - my ($warning, $critical) = validate_range({type => 'integer', leastone => 1}); + my ($warning, $wtime, $critical, $ctime) = validate_integer_for_time({default_to_int => 1}); + if ($psql_version < 9.1 and (length $wtime or length $ctime)) { + add_unknown msg('hs-time-version'); + return; + } # check if master and slave comply with the check using pg_is_in_recovery() my ($master, $slave); @@ -4776,15 +4790,19 @@ sub check_hot_standby_delay { } ## Get xlog positions - my ($moffset, $s_rec_offset, $s_rep_offset); + my ($moffset, $s_rec_offset, $s_rep_offset, $time_delta); ## On slave $SQL = q{SELECT pg_last_xlog_receive_location() AS receive, pg_last_xlog_replay_location() AS replay}; + if ($psql_version >= 9.1) { + $SQL .= q{, COALESCE(ROUND(EXTRACT(epoch FROM now() - pg_last_xact_replay_timestamp())),0) AS seconds}; + } my $info = run_command($SQL, { dbnumber => $slave, regex => qr/\// }); my $saved_db; for $db (@{$info->{db}}) { my $receive = $db->{slurp}[0]{receive}; my $replay = $db->{slurp}[0]{replay}; + $time_delta = $db->{slurp}[0]{seconds}; if (defined $receive) { my ($a, $b) = split(/\//, $receive); @@ -4829,20 +4847,33 @@ sub check_hot_standby_delay { # Make sure it's always positive or zero $rec_delta = 0 if $rec_delta < 0; $rep_delta = 0 if $rep_delta < 0; + if (defined $time_delta and $time_delta < 0) { + add_unknown msg('hs-future-replica'); + return; + } - $MRTG and do_mrtg({one => $rep_delta, two => $rec_delta}); + $MRTG and do_mrtg($psql_version >= 9.1 ? 
+ {one => $rep_delta, two => $rec_delta, three => $time_delta} : + {one => $rep_delta, two => $rec_delta}); $db->{perf} = sprintf ' %s=%s;%s;%s ', perfname(msg('hs-replay-delay')), $rep_delta, $warning, $critical; $db->{perf} .= sprintf ' %s=%s;%s;%s', perfname(msg('hs-receive-delay')), $rec_delta, $warning, $critical; + if ($psql_version >= 9.1) { + $db->{perf} .= sprintf ' %s=%s;%s;%s', + perfname(msg('hs-time-delay')), $time_delta, $wtime, $ctime; + } ## Do the check on replay delay in case SR has disconnected because it way too far behind my $msg = qq{$rep_delta}; - if (length $critical and $rep_delta > $critical) { + if ($psql_version >= 9.1) { + $msg .= qq{ and $time_delta seconds} + } + if ((length $critical or length $ctime) and (!length $critical or length $critical and $rep_delta > $critical) and (!length $ctime or length $ctime and $time_delta > $ctime)) { add_critical $msg; } - elsif (length $warning and $rep_delta > $warning) { + elsif ((length $warning or length $wtime) and (!length $warning or length $warning and $rep_delta > $warning) and (!length $wtime or length $wtime and $time_delta > $wtime)) { add_warning $msg; } else { @@ -8812,15 +8843,34 @@ =head2 B =head2 B (C) Checks the streaming replication lag by computing the delta -between the xlog position of a master server and the one of the slaves connected to it. The slave_ -server must be in hot_standby (e.g. read only) mode, therefore the minimum version to use this_ -action is Postgres 9.0. The I<--warning> and I<--critical> options are the delta between xlog -location. These values should match the volume of transactions needed to have the streaming -replication disconnect from the master because of too much lag. - -You must provide information on how to reach the second database by a connection -parameter ending in the number 2, such as "--dbport2=5543". If if it not given, -the action fails. 
+between the xlog position of a master server and the one of the slaves connected to it, and/or the +last transaction timestamp received by the slave. The slave server must be in hot_standby (e.g. read +only) mode, therefore the minimum version to use this action is Postgres 9.0. To support transaction +timestamps the minimum version is Postgres 9.1. + +The I<--warning> and I<--critical> options are either the delta between xlog positions in bytes, +units of time to compare timestamps, or both. + +Byte values should be based on the volume of transactions needed to have the streaming replication +disconnect from the master because of too much lag, determined by the Postgres configuration variable +B. For units of time, valid units are 'seconds', 'minutes', 'hours', or 'days'. +Each may be written singular or abbreviated to just the first letter. When specifying both, in the +form 'I and I