Skip to content

Commit 6ec9e99

Browse files
committed
amcheck: Distinguish interrupted page deletion from corruption.
This prevents false-positive reports of "the first child of leftmost target page is not leftmost of its level", "block %u is not leftmost" and "left link/right link pair". These reports appeared when amcheck ran after a cluster exited recovery between the first-stage and second-stage WAL records of a page deletion, but before VACUUM had cleaned things up.

Back-patch to v11 (all supported versions).

Reviewed by Peter Geoghegan.

Discussion: https://postgr.es/m/20231005025232.c7.nmisch@google.com
1 parent 56b30e2 commit 6ec9e99

File tree

4 files changed: +163 -4 lines changed

contrib/amcheck/Makefile

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@ PGFILEDESC = "amcheck - function for verifying relation integrity"
1212

1313
REGRESS = check check_btree check_heap
1414

15+
EXTRA_INSTALL = contrib/pg_walinspect
1516
TAP_TESTS = 1
1617

1718
ifdef USE_PGXS

contrib/amcheck/meson.build

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,7 @@ tests += {
4444
't/002_cic.pl',
4545
't/003_cic_2pc.pl',
4646
't/004_verify_nbtree_unique.pl',
47+
't/005_pitr.pl',
4748
],
4849
},
4950
}

contrib/amcheck/t/005_pitr.pl

Lines changed: 82 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,82 @@
1+
# Copyright (c) 2021-2023, PostgreSQL Global Development Group
2+
3+
# Test integrity of intermediate states by PITR to those states
4+
use strict;
5+
use warnings;
6+
use PostgreSQL::Test::Cluster;
7+
use PostgreSQL::Test::Utils;
8+
use Test::More;
9+
10+
# origin node: generate WAL records of interest.
11+
my $origin = PostgreSQL::Test::Cluster->new('origin');
12+
$origin->init(has_archiving => 1, allows_streaming => 1);
13+
$origin->append_conf('postgresql.conf', 'autovacuum = off');
14+
$origin->start;
15+
$origin->backup('my_backup');
16+
# Create a table with each of 6 PK values spanning 1/4 of a block. Delete the
17+
# first four, so one index leaf is eligible for deletion. Make a replication
18+
# slot just so pg_walinspect will always have access to later WAL.
19+
my $setup = <<EOSQL;
20+
BEGIN;
21+
CREATE EXTENSION amcheck;
22+
CREATE EXTENSION pg_walinspect;
23+
CREATE TABLE not_leftmost (c text STORAGE PLAIN);
24+
INSERT INTO not_leftmost
25+
SELECT repeat(n::text, database_block_size / 4)
26+
FROM generate_series(1,6) t(n), pg_control_init();
27+
ALTER TABLE not_leftmost ADD CONSTRAINT not_leftmost_pk PRIMARY KEY (c);
28+
DELETE FROM not_leftmost WHERE c ~ '^[1-4]';
29+
SELECT pg_create_physical_replication_slot('for_walinspect', true, false);
30+
COMMIT;
31+
EOSQL
32+
$origin->safe_psql('postgres', $setup);
33+
my $before_vacuum_lsn =
34+
$origin->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
35+
# VACUUM to delete the aforementioned leaf page. Force an XLogFlush() by
36+
# dropping a permanent table. That way, the XLogReader infrastructure can
37+
# always see VACUUM's records, even under synchronous_commit=off. Finally,
38+
# find the LSN of that VACUUM's last UNLINK_PAGE record.
39+
my $vacuum = <<EOSQL;
40+
SET synchronous_commit = off;
41+
VACUUM (VERBOSE, INDEX_CLEANUP ON) not_leftmost;
42+
CREATE TABLE XLogFlush ();
43+
DROP TABLE XLogFlush;
44+
SELECT max(start_lsn)
45+
FROM pg_get_wal_records_info('$before_vacuum_lsn', 'FFFFFFFF/FFFFFFFF')
46+
WHERE resource_manager = 'Btree' AND record_type = 'UNLINK_PAGE';
47+
EOSQL
48+
my $unlink_lsn = $origin->safe_psql('postgres', $vacuum);
49+
$origin->stop;
50+
die "did not find UNLINK_PAGE record" unless $unlink_lsn;
51+
52+
# replica node: amcheck at notable points in the WAL stream
53+
my $replica = PostgreSQL::Test::Cluster->new('replica');
54+
$replica->init_from_backup($origin, 'my_backup', has_restoring => 1);
55+
$replica->append_conf('postgresql.conf',
56+
"recovery_target_lsn = '$unlink_lsn'");
57+
$replica->append_conf('postgresql.conf', 'recovery_target_inclusive = off');
58+
$replica->append_conf('postgresql.conf', 'recovery_target_action = promote');
59+
$replica->start;
60+
$replica->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';")
61+
or die "Timed out while waiting for PITR promotion";
62+
# recovery done; run amcheck
63+
my $debug = "SET client_min_messages = 'debug1'";
64+
my ($rc, $stderr);
65+
$rc = $replica->psql(
66+
'postgres',
67+
"$debug; SELECT bt_index_parent_check('not_leftmost_pk', true)",
68+
stderr => \$stderr);
69+
print STDERR $stderr, "\n";
70+
is($rc, 0, "bt_index_parent_check passes");
71+
like(
72+
$stderr,
73+
qr/interrupted page deletion detected/,
74+
"bt_index_parent_check: interrupted page deletion detected");
75+
$rc = $replica->psql(
76+
'postgres',
77+
"$debug; SELECT bt_index_check('not_leftmost_pk', true)",
78+
stderr => \$stderr);
79+
print STDERR $stderr, "\n";
80+
is($rc, 0, "bt_index_check passes");
81+
82+
done_testing();

contrib/amcheck/verify_nbtree.c

Lines changed: 79 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -157,6 +157,9 @@ static void bt_check_every_level(Relation rel, Relation heaprel,
157157
bool rootdescend, bool checkunique);
158158
static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,
159159
BtreeLevel level);
160+
static bool bt_leftmost_ignoring_half_dead(BtreeCheckState *state,
161+
BlockNumber start,
162+
BTPageOpaque start_opaque);
160163
static void bt_recheck_sibling_links(BtreeCheckState *state,
161164
BlockNumber btpo_prev_from_target,
162165
BlockNumber leftcurrent);
@@ -826,7 +829,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
826829
*/
827830
if (state->readonly)
828831
{
829-
if (!P_LEFTMOST(opaque))
832+
if (!bt_leftmost_ignoring_half_dead(state, current, opaque))
830833
ereport(ERROR,
831834
(errcode(ERRCODE_INDEX_CORRUPTED),
832835
errmsg("block %u is not leftmost in index \"%s\"",
@@ -880,8 +883,16 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
880883
*/
881884
}
882885

883-
/* Sibling links should be in mutual agreement */
884-
if (opaque->btpo_prev != leftcurrent)
886+
/*
887+
* Sibling links should be in mutual agreement. There arises
888+
* leftcurrent == P_NONE && btpo_prev != P_NONE when the left sibling
889+
* of the parent's low-key downlink is half-dead. (A half-dead page
890+
* has no downlink from its parent.) Under heavyweight locking, the
891+
* last bt_leftmost_ignoring_half_dead() validated this btpo_prev.
892+
* Without heavyweight locking, validation of the P_NONE case remains
893+
* unimplemented.
894+
*/
895+
if (opaque->btpo_prev != leftcurrent && leftcurrent != P_NONE)
885896
bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent);
886897

887898
/* Check level */
@@ -1117,6 +1128,66 @@ bt_entry_unique_check(BtreeCheckState *state, IndexTuple itup,
11171128
}
11181129
}
11191130

1131+
/*
1132+
* Like P_LEFTMOST(start_opaque), but accept an arbitrarily-long chain of
1133+
* half-dead, sibling-linked pages to the left. If a half-dead page appears
1134+
* under state->readonly, the database exited recovery between the first-stage
1135+
* and second-stage WAL records of a deletion.
1136+
*/
1137+
static bool
1138+
bt_leftmost_ignoring_half_dead(BtreeCheckState *state,
1139+
BlockNumber start,
1140+
BTPageOpaque start_opaque)
1141+
{
1142+
BlockNumber reached = start_opaque->btpo_prev,
1143+
reached_from = start;
1144+
bool all_half_dead = true;
1145+
1146+
/*
1147+
* To handle the !readonly case, we'd need to accept BTP_DELETED pages and
1148+
* potentially observe nbtree/README "Page deletion and backwards scans".
1149+
*/
1150+
Assert(state->readonly);
1151+
1152+
while (reached != P_NONE && all_half_dead)
1153+
{
1154+
Page page = palloc_btree_page(state, reached);
1155+
BTPageOpaque reached_opaque = BTPageGetOpaque(page);
1156+
1157+
CHECK_FOR_INTERRUPTS();
1158+
1159+
/*
1160+
* Try to detect btpo_prev circular links. _bt_unlink_halfdead_page()
1161+
* writes that side-links will continue to point to the siblings.
1162+
* Check btpo_next for that property.
1163+
*/
1164+
all_half_dead = P_ISHALFDEAD(reached_opaque) &&
1165+
reached != start &&
1166+
reached != reached_from &&
1167+
reached_opaque->btpo_next == reached_from;
1168+
if (all_half_dead)
1169+
{
1170+
XLogRecPtr pagelsn = PageGetLSN(page);
1171+
1172+
/* pagelsn should point to an XLOG_BTREE_MARK_PAGE_HALFDEAD */
1173+
ereport(DEBUG1,
1174+
(errcode(ERRCODE_NO_DATA),
1175+
errmsg_internal("harmless interrupted page deletion detected in index \"%s\"",
1176+
RelationGetRelationName(state->rel)),
1177+
errdetail_internal("Block=%u right block=%u page lsn=%X/%X.",
1178+
reached, reached_from,
1179+
LSN_FORMAT_ARGS(pagelsn))));
1180+
1181+
reached_from = reached;
1182+
reached = reached_opaque->btpo_prev;
1183+
}
1184+
1185+
pfree(page);
1186+
}
1187+
1188+
return all_half_dead;
1189+
}
1190+
11201191
/*
11211192
* Raise an error when target page's left link does not point back to the
11221193
* previous target page, called leftcurrent here. The leftcurrent page's
@@ -1157,6 +1228,9 @@ bt_recheck_sibling_links(BtreeCheckState *state,
11571228
BlockNumber btpo_prev_from_target,
11581229
BlockNumber leftcurrent)
11591230
{
1231+
/* passing metapage to BTPageGetOpaque() would give irrelevant findings */
1232+
Assert(leftcurrent != P_NONE);
1233+
11601234
if (!state->readonly)
11611235
{
11621236
Buffer lbuf;
@@ -2235,7 +2309,8 @@ bt_child_highkey_check(BtreeCheckState *state,
22352309
opaque = BTPageGetOpaque(page);
22362310

22372311
/* The first page we visit at the level should be leftmost */
2238-
if (first && !BlockNumberIsValid(state->prevrightlink) && !P_LEFTMOST(opaque))
2312+
if (first && !BlockNumberIsValid(state->prevrightlink) &&
2313+
!bt_leftmost_ignoring_half_dead(state, blkno, opaque))
22392314
ereport(ERROR,
22402315
(errcode(ERRCODE_INDEX_CORRUPTED),
22412316
errmsg("the first child of leftmost target page is not leftmost of its level in index \"%s\"",

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy