Skip to content

Commit 6b9e875

Browse files
committed
Track block level checksum failures in pg_stat_database
This adds a column that counts how many checksum failures have occurred on files belonging to a specific database. Both checksum failures during normal backend processing and those created when a base backup detects a checksum failure are counted. Author: Magnus Hagander Reviewed by: Julien Rouhaud
1 parent 3c59263 commit 6b9e875

File tree

10 files changed

+114
-7
lines changed

10 files changed

+114
-7
lines changed

doc/src/sgml/monitoring.sgml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2508,6 +2508,11 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
25082508
<entry><type>bigint</type></entry>
25092509
<entry>Number of deadlocks detected in this database</entry>
25102510
</row>
2511+
<row>
2512+
<entry><structfield>checksum_failures</structfield></entry>
2513+
<entry><type>bigint</type></entry>
2514+
<entry>Number of data page checksum failures detected in this database</entry>
2515+
</row>
25112516
<row>
25122517
<entry><structfield>blk_read_time</structfield></entry>
25132518
<entry><type>double precision</type></entry>

src/backend/catalog/system_views.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -823,6 +823,7 @@ CREATE VIEW pg_stat_database AS
823823
pg_stat_get_db_temp_files(D.oid) AS temp_files,
824824
pg_stat_get_db_temp_bytes(D.oid) AS temp_bytes,
825825
pg_stat_get_db_deadlocks(D.oid) AS deadlocks,
826+
pg_stat_get_db_checksum_failures(D.oid) AS checksum_failures,
826827
pg_stat_get_db_blk_read_time(D.oid) AS blk_read_time,
827828
pg_stat_get_db_blk_write_time(D.oid) AS blk_write_time,
828829
pg_stat_get_db_stat_reset_time(D.oid) AS stats_reset

src/backend/postmaster/pgstat.c

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,7 @@ static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
334334
static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
335335
static void pgstat_recv_recoveryconflict(PgStat_MsgRecoveryConflict *msg, int len);
336336
static void pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len);
337+
static void pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len);
337338
static void pgstat_recv_tempfile(PgStat_MsgTempFile *msg, int len);
338339

339340
/* ------------------------------------------------------------
@@ -1518,6 +1519,40 @@ pgstat_report_deadlock(void)
15181519
pgstat_send(&msg, sizeof(msg));
15191520
}
15201521

1522+
1523+
1524+
/* --------
1525+
* pgstat_report_checksum_failures_in_db(dboid, failurecount) -
1526+
*
1527+
* Tell the collector that failurecount checksum failures were detected in database dboid.
1528+
* --------
1529+
*/
1530+
void
1531+
pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
1532+
{
1533+
PgStat_MsgChecksumFailure msg;
1534+
1535+
if (pgStatSock == PGINVALID_SOCKET || !pgstat_track_counts) /* no valid stats socket, or pgstat_track_counts is off: send nothing */
1536+
return;
1537+
1538+
pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_CHECKSUMFAILURE);
1539+
msg.m_databaseid = dboid;
1540+
msg.m_failurecount = failurecount; /* pgstat_recv_checksum_failure() adds this to the database's n_checksum_failures */
1541+
pgstat_send(&msg, sizeof(msg));
1542+
}
1543+
1544+
/* --------
1545+
* pgstat_report_checksum_failure() -
1546+
*
1547+
* Tell the collector about a single checksum failure, charged to MyDatabaseId.
1548+
* --------
1549+
*/
1550+
void
1551+
pgstat_report_checksum_failure(void)
1552+
{
1553+
pgstat_report_checksum_failures_in_db(MyDatabaseId, 1); /* NOTE(review): always charged to MyDatabaseId -- confirm this is intended for pages of shared relations */
1554+
}
1555+
15211556
/* --------
15221557
* pgstat_report_tempfile() -
15231558
*
@@ -4455,6 +4490,10 @@ PgstatCollectorMain(int argc, char *argv[])
44554490
pgstat_recv_tempfile((PgStat_MsgTempFile *) &msg, len);
44564491
break;
44574492

4493+
case PGSTAT_MTYPE_CHECKSUMFAILURE:
4494+
pgstat_recv_checksum_failure((PgStat_MsgChecksumFailure *) &msg, len);
4495+
break;
4496+
44584497
default:
44594498
break;
44604499
}
@@ -4554,6 +4593,7 @@ reset_dbentry_counters(PgStat_StatDBEntry *dbentry)
45544593
dbentry->n_temp_files = 0;
45554594
dbentry->n_temp_bytes = 0;
45564595
dbentry->n_deadlocks = 0;
4596+
dbentry->n_checksum_failures = 0;
45574597
dbentry->n_block_read_time = 0;
45584598
dbentry->n_block_write_time = 0;
45594599

@@ -6196,6 +6236,22 @@ pgstat_recv_deadlock(PgStat_MsgDeadlock *msg, int len)
61966236
dbentry->n_deadlocks++;
61976237
}
61986238

6239+
/* ----------
6240+
* pgstat_recv_checksum_failure() -
6241+
*
6242+
* Process a CHECKSUMFAILURE message: add the reported count to the target database's n_checksum_failures.
6243+
* ----------
6244+
*/
6245+
static void
6246+
pgstat_recv_checksum_failure(PgStat_MsgChecksumFailure *msg, int len)
6247+
{
6248+
PgStat_StatDBEntry *dbentry;
6249+
6250+
dbentry = pgstat_get_db_entry(msg->m_databaseid, true); /* result is used unchecked; presumably "true" asks for a missing entry to be created -- confirm */
6251+
6252+
dbentry->n_checksum_failures += msg->m_failurecount; /* one message may carry several failures, so accumulate rather than increment */
6253+
}
6254+
61996255
/* ----------
62006256
* pgstat_recv_tempfile() -
62016257
*

src/backend/replication/basebackup.c

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ typedef struct
5858
static int64 sendDir(const char *path, int basepathlen, bool sizeonly,
5959
List *tablespaces, bool sendtblspclinks);
6060
static bool sendFile(const char *readfilename, const char *tarfilename,
61-
struct stat *statbuf, bool missing_ok);
61+
struct stat *statbuf, bool missing_ok, Oid dboid);
6262
static void sendFileWithContent(const char *filename, const char *content);
6363
static int64 _tarWriteHeader(const char *filename, const char *linktarget,
6464
struct stat *statbuf, bool sizeonly);
@@ -342,7 +342,7 @@ perform_base_backup(basebackup_options *opt)
342342
(errcode_for_file_access(),
343343
errmsg("could not stat file \"%s\": %m",
344344
XLOG_CONTROL_FILE)));
345-
sendFile(XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf, false);
345+
sendFile(XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf, false, InvalidOid);
346346
}
347347
else
348348
sendTablespace(ti->path, false);
@@ -592,7 +592,7 @@ perform_base_backup(basebackup_options *opt)
592592
(errcode_for_file_access(),
593593
errmsg("could not stat file \"%s\": %m", pathbuf)));
594594

595-
sendFile(pathbuf, pathbuf, &statbuf, false);
595+
sendFile(pathbuf, pathbuf, &statbuf, false, InvalidOid);
596596

597597
/* unconditionally mark file as archived */
598598
StatusFilePath(pathbuf, fname, ".done");
@@ -1302,7 +1302,7 @@ sendDir(const char *path, int basepathlen, bool sizeonly, List *tablespaces,
13021302

13031303
if (!sizeonly)
13041304
sent = sendFile(pathbuf, pathbuf + basepathlen + 1, &statbuf,
1305-
true);
1305+
true, isDbDir ? pg_atoi(lastDir + 1, sizeof(Oid), 0) : InvalidOid);
13061306

13071307
if (sent || sizeonly)
13081308
{
@@ -1358,12 +1358,15 @@ is_checksummed_file(const char *fullpath, const char *filename)
13581358
*
13591359
* If 'missing_ok' is true, will not throw an error if the file is not found.
13601360
*
1361+
* If dboid is anything other than InvalidOid then any checksum failures detected
1362+
* will get reported to the stats collector.
1363+
*
13611364
* Returns true if the file was successfully sent, false if 'missing_ok',
13621365
* and the file did not exist.
13631366
*/
13641367
static bool
13651368
sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf,
1366-
bool missing_ok)
1369+
bool missing_ok, Oid dboid)
13671370
{
13681371
FILE *fp;
13691372
BlockNumber blkno = 0;
@@ -1580,6 +1583,9 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf
15801583
ereport(WARNING,
15811584
(errmsg("file \"%s\" has a total of %d checksum verification "
15821585
"failures", readfilename, checksum_failures)));
1586+
1587+
if (dboid != InvalidOid)
1588+
pgstat_report_checksum_failures_in_db(dboid, checksum_failures);
15831589
}
15841590
total_checksum_failures += checksum_failures;
15851591

src/backend/storage/page/bufpage.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "access/htup_details.h"
1818
#include "access/itup.h"
1919
#include "access/xlog.h"
20+
#include "pgstat.h"
2021
#include "storage/checksum.h"
2122
#include "utils/memdebug.h"
2223
#include "utils/memutils.h"
@@ -151,6 +152,8 @@ PageIsVerified(Page page, BlockNumber blkno)
151152
errmsg("page verification failed, calculated checksum %u but expected %u",
152153
checksum, p->pd_checksum)));
153154

155+
pgstat_report_checksum_failure();
156+
154157
if (header_sane && ignore_checksum_failure)
155158
return true;
156159
}

src/backend/utils/adt/pgstatfuncs.c

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1497,6 +1497,21 @@ pg_stat_get_db_deadlocks(PG_FUNCTION_ARGS)
14971497
PG_RETURN_INT64(result);
14981498
}
14991499

1500+
Datum
1501+
pg_stat_get_db_checksum_failures(PG_FUNCTION_ARGS) /* returns n_checksum_failures for the database whose OID is argument 0 */
1502+
{
1503+
Oid dbid = PG_GETARG_OID(0);
1504+
int64 result;
1505+
PgStat_StatDBEntry *dbentry;
1506+
1507+
if ((dbentry = pgstat_fetch_stat_dbentry(dbid)) == NULL)
1508+
result = 0; /* no stats entry for this database: report zero failures */
1509+
else
1510+
result = (int64) (dbentry->n_checksum_failures);
1511+
1512+
PG_RETURN_INT64(result);
1513+
}
1514+
15001515
Datum
15011516
pg_stat_get_db_blk_read_time(PG_FUNCTION_ARGS)
15021517
{

src/include/catalog/catversion.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,6 @@
5353
*/
5454

5555
/* yyyymmddN */
56-
#define CATALOG_VERSION_NO 201903063
56+
#define CATALOG_VERSION_NO 201903091
5757

5858
#endif

src/include/catalog/pg_proc.dat

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5227,6 +5227,10 @@
52275227
proname => 'pg_stat_get_db_deadlocks', provolatile => 's', proparallel => 'r',
52285228
prorettype => 'int8', proargtypes => 'oid',
52295229
prosrc => 'pg_stat_get_db_deadlocks' },
5230+
{ oid => '3426', descr => 'statistics: checksum failures detected in database',
5231+
proname => 'pg_stat_get_db_checksum_failures', provolatile => 's', proparallel => 'r',
5232+
prorettype => 'int8', proargtypes => 'oid',
5233+
prosrc => 'pg_stat_get_db_checksum_failures' },
52305234
{ oid => '3074', descr => 'statistics: last reset for a database',
52315235
proname => 'pg_stat_get_db_stat_reset_time', provolatile => 's',
52325236
proparallel => 'r', prorettype => 'timestamptz', proargtypes => 'oid',

src/include/pgstat.h

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,8 @@ typedef enum StatMsgType
6464
PGSTAT_MTYPE_FUNCPURGE,
6565
PGSTAT_MTYPE_RECOVERYCONFLICT,
6666
PGSTAT_MTYPE_TEMPFILE,
67-
PGSTAT_MTYPE_DEADLOCK
67+
PGSTAT_MTYPE_DEADLOCK,
68+
PGSTAT_MTYPE_CHECKSUMFAILURE
6869
} StatMsgType;
6970

7071
/* ----------
@@ -530,6 +531,18 @@ typedef struct PgStat_MsgDeadlock
530531
Oid m_databaseid;
531532
} PgStat_MsgDeadlock;
532533

534+
/* ----------
535+
* PgStat_MsgChecksumFailure Sent by the backend to tell the collector
536+
* about checksum failures noticed in one database.
537+
* ----------
538+
*/
539+
typedef struct PgStat_MsgChecksumFailure
540+
{
541+
PgStat_MsgHdr m_hdr; /* filled in by pgstat_setheader() with PGSTAT_MTYPE_CHECKSUMFAILURE */
542+
Oid m_databaseid; /* database the failures are charged to */
543+
int m_failurecount; /* number of failures reported by this message */
544+
} PgStat_MsgChecksumFailure;
545+
533546

534547
/* ----------
535548
* PgStat_Msg Union over all possible messages.
@@ -593,6 +606,7 @@ typedef struct PgStat_StatDBEntry
593606
PgStat_Counter n_temp_files;
594607
PgStat_Counter n_temp_bytes;
595608
PgStat_Counter n_deadlocks;
609+
PgStat_Counter n_checksum_failures;
596610
PgStat_Counter n_block_read_time; /* times in microseconds */
597611
PgStat_Counter n_block_write_time;
598612

@@ -1200,6 +1214,8 @@ extern void pgstat_report_analyze(Relation rel,
12001214

12011215
extern void pgstat_report_recovery_conflict(int reason);
12021216
extern void pgstat_report_deadlock(void);
1217+
extern void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount);
1218+
extern void pgstat_report_checksum_failure(void);
12031219

12041220
extern void pgstat_initialize(void);
12051221
extern void pgstat_bestart(void);

src/test/regress/expected/rules.out

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1817,6 +1817,7 @@ pg_stat_database| SELECT d.oid AS datid,
18171817
pg_stat_get_db_temp_files(d.oid) AS temp_files,
18181818
pg_stat_get_db_temp_bytes(d.oid) AS temp_bytes,
18191819
pg_stat_get_db_deadlocks(d.oid) AS deadlocks,
1820+
pg_stat_get_db_checksum_failures(d.oid) AS checksum_failures,
18201821
pg_stat_get_db_blk_read_time(d.oid) AS blk_read_time,
18211822
pg_stat_get_db_blk_write_time(d.oid) AS blk_write_time,
18221823
pg_stat_get_db_stat_reset_time(d.oid) AS stats_reset

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy