
Commit fe0972e

Add further debug info to help debug 019_replslot_limit.pl failures.
See also afdeff1. Failures after that commit provided a few more hints, but not yet enough to understand what's going on.

In 019_replslot_limit.pl shut down nodes with fast instead of immediate mode if we observe the failure mode. That should tell us whether the failures we're observing are just a timing issue under high load. PGCTLTIMEOUT should prevent buildfarm animals from hanging endlessly.

Also adds a bit more logging to replication slot drop and ShutdownPostgres().

Discussion: https://postgr.es/m/20220225192941.hqnvefgdzaro6gzg@alap3.anarazel.de
1 parent 638300f commit fe0972e

File tree

src/backend/replication/slot.c
src/backend/storage/lmgr/lwlock.c
src/backend/utils/init/postinit.c
src/include/storage/lwlock.h
src/test/recovery/t/019_replslot_limit.pl

5 files changed: +55 −1 lines changed

src/backend/replication/slot.c

Lines changed: 13 additions & 0 deletions
@@ -569,6 +569,10 @@ ReplicationSlotCleanup(void)
 		if (!s->in_use)
 			continue;
 
+		/* unlocked read of active_pid is ok for debugging purposes */
+		elog(DEBUG3, "temporary replication slot cleanup: %d in use, active_pid: %d",
+			 i, s->active_pid);
+
 		SpinLockAcquire(&s->mutex);
 		if (s->active_pid == MyProcPid)
 		{
@@ -629,6 +633,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
 	char		path[MAXPGPATH];
 	char		tmppath[MAXPGPATH];
 
+	/* temp debugging aid to analyze 019_replslot_limit failures */
+	elog(DEBUG3, "replication slot drop: %s: begin", NameStr(slot->data.name));
+
 	/*
 	 * If some other backend ran this code concurrently with us, we might try
 	 * to delete a slot with a certain name while someone else was trying to
@@ -679,6 +686,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
 						path, tmppath)));
 	}
 
+	elog(DEBUG3, "replication slot drop: %s: removed on-disk",
+		 NameStr(slot->data.name));
+
 	/*
 	 * The slot is definitely gone.  Lock out concurrent scans of the array
 	 * long enough to kill it.  It's OK to clear the active PID here without
@@ -734,6 +744,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
 	 * a slot while we're still cleaning up the detritus of the old one.
 	 */
 	LWLockRelease(ReplicationSlotAllocationLock);
+
+	elog(DEBUG3, "replication slot drop: %s: done",
+		 NameStr(slot->data.name));
 }
 
 /*
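The reasoning behind the first hunk is that a stale or racy value of active_pid is harmless when it is only interpolated into a DEBUG3 message; the authoritative check still happens under the spinlock. A condensed sketch of that pattern, loosely modeled on the ReplicationSlotCleanup() loop (the loop skeleton is simplified here for illustration and is not part of this commit):

	for (int i = 0; i < max_replication_slots; i++)
	{
		ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];

		if (!s->in_use)
			continue;

		/* unlocked read: value feeds only the trace message, never a decision */
		elog(DEBUG3, "temporary replication slot cleanup: %d in use, active_pid: %d",
			 i, s->active_pid);

		SpinLockAcquire(&s->mutex);
		if (s->active_pid == MyProcPid)
		{
			SpinLockRelease(&s->mutex);
			/* ... drop the temporary slot via ReplicationSlotDropPtr() ... */
		}
		else
			SpinLockRelease(&s->mutex);
	}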

src/backend/storage/lmgr/lwlock.c

Lines changed: 7 additions & 0 deletions
@@ -1945,3 +1945,10 @@ LWLockHeldByMeInMode(LWLock *l, LWLockMode mode)
 	}
 	return false;
 }
+
+/* temp debugging aid to analyze 019_replslot_limit failures */
+int
+LWLockHeldCount(void)
+{
+	return num_held_lwlocks;
+}

src/backend/utils/init/postinit.c

Lines changed: 17 additions & 0 deletions
@@ -1262,6 +1262,23 @@ ShutdownPostgres(int code, Datum arg)
 	 * them explicitly.
 	 */
 	LockReleaseAll(USER_LOCKMETHOD, true);
+
+	/*
+	 * temp debugging aid to analyze 019_replslot_limit failures
+	 *
+	 * If an error were thrown outside of a transaction nothing up to now
+	 * would have released lwlocks. We probably will add an
+	 * LWLockReleaseAll(). But for now make it easier to understand such cases
+	 * by warning if any lwlocks are held.
+	 */
+#ifdef USE_ASSERT_CHECKING
+	{
+		int			held_lwlocks = LWLockHeldCount();
+		if (held_lwlocks)
+			elog(WARNING, "holding %d lwlocks at the end of ShutdownPostgres()",
+				 held_lwlocks);
+	}
+#endif
 }
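The new comment anticipates a follow-up: actually releasing any lwlocks still held at this point rather than only warning about them. A minimal sketch of what that might look like, assuming the eventual fix really is the bare LWLockReleaseAll() call the comment alludes to (this is a hypothetical illustration, not part of this commit):

	static void
	ShutdownPostgres(int code, Datum arg)
	{
		/* ... existing cleanup: abort any open transaction ... */

		/*
		 * User locks are not released by transaction end, so be sure to release
		 * them explicitly.
		 */
		LockReleaseAll(USER_LOCKMETHOD, true);

		/*
		 * Hypothetical follow-up: rather than only warning, release any lwlocks
		 * that an error thrown outside a transaction left behind.
		 */
		LWLockReleaseAll();
	}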

src/include/storage/lwlock.h

Lines changed: 1 addition & 0 deletions
@@ -121,6 +121,7 @@ extern void LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val);
 extern void LWLockReleaseAll(void);
 extern bool LWLockHeldByMe(LWLock *lock);
 extern bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode);
+extern int	LWLockHeldCount(void);
 
 extern bool LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval);
 extern void LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 value);

src/test/recovery/t/019_replslot_limit.pl

Lines changed: 17 additions & 1 deletion
@@ -335,7 +335,23 @@
 $node_primary3->wait_for_catchup($node_standby3);
 my $senderpid = $node_primary3->safe_psql('postgres',
 	"SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'");
-like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid");
+
+# We've seen occasional cases where multiple walsender pids are active. An
+# immediate shutdown may hide evidence of a locking bug. So if multiple
+# walsenders are observed, shut down in fast mode, and collect some more
+# information.
+if (not like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid"))
+{
+	my ($stdout, $stderr);
+	$node_primary3->psql('postgres',
+		"\\a\\t\nSELECT * FROM pg_stat_activity",
+		stdout => \$stdout, stderr => \$stderr);
+	diag $stdout, $stderr;
+	$node_primary3->stop('fast');
+	$node_standby3->stop('fast');
+	die "could not determine walsender pid, can't continue";
+}
+
 my $receiverpid = $node_standby3->safe_psql('postgres',
 	"SELECT pid FROM pg_stat_activity WHERE backend_type = 'walreceiver'");
 like($receiverpid, qr/^[0-9]+$/, "have walreceiver pid $receiverpid");
