Skip to content

Commit 081e410

Browse files
committed
Ignore server-side delays when enforcing wal_sender_timeout.
Healthy clients of servers having poor I/O performance, such as buildfarm members hamster and tern, saw unexpected timeouts. That disagreed with documentation. This fix adds one gettimeofday() call whenever ProcessRepliesIfAny() finds no client reply messages. Back-patch to 9.4; the bug's symptom is rare and mild, and the code all moved between 9.3 and 9.4. Discussion: https://postgr.es/m/20180826034600.GA1105084@rfd.leadboat.com
1 parent 4a9a5bb commit 081e410

File tree

1 file changed

+36
-28
lines changed

1 file changed

+36
-28
lines changed

src/backend/replication/walsender.c

Lines changed: 36 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -154,9 +154,12 @@ static StringInfoData output_message;
154154
static StringInfoData reply_message;
155155
static StringInfoData tmpbuf;
156156

157+
/* Timestamp of last ProcessRepliesIfAny(). */
158+
static TimestampTz last_processing = 0;
159+
157160
/*
158-
* Timestamp of the last receipt of the reply from the standby. Set to 0 if
159-
* wal_sender_timeout doesn't need to be active.
161+
* Timestamp of last ProcessRepliesIfAny() that saw a reply from the
162+
* standby. Set to 0 if wal_sender_timeout doesn't need to be active.
160163
*/
161164
static TimestampTz last_reply_timestamp = 0;
162165

@@ -213,8 +216,8 @@ static void ProcessStandbyReplyMessage(void);
213216
static void ProcessStandbyHSFeedbackMessage(void);
214217
static void ProcessRepliesIfAny(void);
215218
static void WalSndKeepalive(bool requestReply);
216-
static void WalSndKeepaliveIfNecessary(TimestampTz now);
217-
static void WalSndCheckTimeOut(TimestampTz now);
219+
static void WalSndKeepaliveIfNecessary(void);
220+
static void WalSndCheckTimeOut(void);
218221
static long WalSndComputeSleeptime(TimestampTz now);
219222
static void WalSndPrepareWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, bool last_write);
220223
static void WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, bool last_write);
@@ -1117,18 +1120,16 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
11171120
/* Check for input from the client */
11181121
ProcessRepliesIfAny();
11191122

1120-
now = GetCurrentTimestamp();
1121-
11221123
/* die if timeout was reached */
1123-
WalSndCheckTimeOut(now);
1124+
WalSndCheckTimeOut();
11241125

11251126
/* Send keepalive if the time has come */
1126-
WalSndKeepaliveIfNecessary(now);
1127+
WalSndKeepaliveIfNecessary();
11271128

11281129
if (!pq_is_send_pending())
11291130
break;
11301131

1131-
sleeptime = WalSndComputeSleeptime(now);
1132+
sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp());
11321133

11331134
wakeEvents = WL_LATCH_SET | WL_POSTMASTER_DEATH |
11341135
WL_SOCKET_WRITEABLE | WL_SOCKET_READABLE | WL_TIMEOUT;
@@ -1198,7 +1199,6 @@ WalSndWaitForWal(XLogRecPtr loc)
11981199
for (;;)
11991200
{
12001201
long sleeptime;
1201-
TimestampTz now;
12021202

12031203
/*
12041204
* Emergency bailout if postmaster has died. This is to avoid the
@@ -1283,13 +1283,11 @@ WalSndWaitForWal(XLogRecPtr loc)
12831283
!pq_is_send_pending())
12841284
break;
12851285

1286-
now = GetCurrentTimestamp();
1287-
12881286
/* die if timeout was reached */
1289-
WalSndCheckTimeOut(now);
1287+
WalSndCheckTimeOut();
12901288

12911289
/* Send keepalive if the time has come */
1292-
WalSndKeepaliveIfNecessary(now);
1290+
WalSndKeepaliveIfNecessary();
12931291

12941292
/*
12951293
* Sleep until something happens or we time out. Also wait for the
@@ -1298,7 +1296,7 @@ WalSndWaitForWal(XLogRecPtr loc)
12981296
* new WAL to be generated. (But if we have nothing to send, we don't
12991297
* want to wake on socket-writable.)
13001298
*/
1301-
sleeptime = WalSndComputeSleeptime(now);
1299+
sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp());
13021300

13031301
wakeEvents = WL_LATCH_SET | WL_POSTMASTER_DEATH |
13041302
WL_SOCKET_READABLE | WL_TIMEOUT;
@@ -1438,6 +1436,8 @@ ProcessRepliesIfAny(void)
14381436
int r;
14391437
bool received = false;
14401438

1439+
last_processing = GetCurrentTimestamp();
1440+
14411441
for (;;)
14421442
{
14431443
pq_startmsgread();
@@ -1525,7 +1525,7 @@ ProcessRepliesIfAny(void)
15251525
*/
15261526
if (received)
15271527
{
1528-
last_reply_timestamp = GetCurrentTimestamp();
1528+
last_reply_timestamp = last_processing;
15291529
waiting_for_ping_response = false;
15301530
}
15311531
}
@@ -1818,10 +1818,18 @@ WalSndComputeSleeptime(TimestampTz now)
18181818

18191819
/*
18201820
* Check whether there have been responses by the client within
1821-
* wal_sender_timeout and shutdown if not.
1821+
* wal_sender_timeout and shutdown if not. Using last_processing as the
1822+
* reference point avoids counting server-side stalls against the client.
1823+
* However, a long server-side stall can make WalSndKeepaliveIfNecessary()
1824+
* postdate last_processing by more than wal_sender_timeout. If that happens,
1825+
* the client must reply almost immediately to avoid a timeout. This rarely
1826+
* affects the default configuration, under which clients spontaneously send a
1827+
* message every standby_message_timeout = wal_sender_timeout/6 = 10s. We
1828+
* could eliminate that problem by recognizing timeout expiration at
1829+
* wal_sender_timeout/2 after the keepalive.
18221830
*/
18231831
static void
1824-
WalSndCheckTimeOut(TimestampTz now)
1832+
WalSndCheckTimeOut(void)
18251833
{
18261834
TimestampTz timeout;
18271835

@@ -1832,7 +1840,7 @@ WalSndCheckTimeOut(TimestampTz now)
18321840
timeout = TimestampTzPlusMilliseconds(last_reply_timestamp,
18331841
wal_sender_timeout);
18341842

1835-
if (wal_sender_timeout > 0 && now >= timeout)
1843+
if (wal_sender_timeout > 0 && last_processing >= timeout)
18361844
{
18371845
/*
18381846
* Since typically expiration of replication timeout means
@@ -1863,8 +1871,6 @@ WalSndLoop(WalSndSendDataCallback send_data)
18631871
*/
18641872
for (;;)
18651873
{
1866-
TimestampTz now;
1867-
18681874
/*
18691875
* Emergency bailout if postmaster has died. This is to avoid the
18701876
* necessity for manual cleanup of all postmaster children.
@@ -1942,13 +1948,11 @@ WalSndLoop(WalSndSendDataCallback send_data)
19421948
WalSndDone(send_data);
19431949
}
19441950

1945-
now = GetCurrentTimestamp();
1946-
19471951
/* Check for replication timeout. */
1948-
WalSndCheckTimeOut(now);
1952+
WalSndCheckTimeOut();
19491953

19501954
/* Send keepalive if the time has come */
1951-
WalSndKeepaliveIfNecessary(now);
1955+
WalSndKeepaliveIfNecessary();
19521956

19531957
/*
19541958
* We don't block if not caught up, unless there is unsent data
@@ -1966,7 +1970,11 @@ WalSndLoop(WalSndSendDataCallback send_data)
19661970
wakeEvents = WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_TIMEOUT |
19671971
WL_SOCKET_READABLE;
19681972

1969-
sleeptime = WalSndComputeSleeptime(now);
1973+
/*
1974+
* Use fresh timestamp, not last_processed, to reduce the chance
1975+
* of reaching wal_sender_timeout before sending a keepalive.
1976+
*/
1977+
sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp());
19701978

19711979
if (pq_is_send_pending())
19721980
wakeEvents |= WL_SOCKET_WRITEABLE;
@@ -3037,7 +3045,7 @@ WalSndKeepalive(bool requestReply)
30373045
* Send keepalive message if too much time has elapsed.
30383046
*/
30393047
static void
3040-
WalSndKeepaliveIfNecessary(TimestampTz now)
3048+
WalSndKeepaliveIfNecessary(void)
30413049
{
30423050
TimestampTz ping_time;
30433051

@@ -3058,7 +3066,7 @@ WalSndKeepaliveIfNecessary(TimestampTz now)
30583066
*/
30593067
ping_time = TimestampTzPlusMilliseconds(last_reply_timestamp,
30603068
wal_sender_timeout / 2);
3061-
if (now >= ping_time)
3069+
if (last_processing >= ping_time)
30623070
{
30633071
WalSndKeepalive(true);
30643072
waiting_for_ping_response = true;

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy