@@ -303,6 +303,11 @@ static bool doPageWrites;
  * so it's a plain spinlock. The other locks are held longer (potentially
  * over I/O operations), so we use LWLocks for them. These locks are:
  *
+ * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
+ * It is only held while initializing and changing the mapping. If the
+ * contents of the buffer being replaced haven't been written yet, the mapping
+ * lock is released while the write is done, and reacquired afterwards.
+ *
  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
  * XLogFlush).
  *
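
The protocol this restored comment describes can be sketched as follows. This is a paraphrase of the buffer-replacement logic in AdvanceXLInsertBuffer further down in this diff, not code from the tree; the three helper functions are hypothetical stand-ins for the real logic.

    /* Minimal sketch of the WALBufMappingLock protocol (hypothetical helpers). */
    LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
    while (old_buffer_contents_not_yet_written())   /* hypothetical */
    {
        /* Release the mapping lock while the write is done... */
        LWLockRelease(WALBufMappingLock);
        write_out_old_buffer();                     /* hypothetical */
        /* ...and reacquire it afterwards, then re-check. */
        LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
    }
    replace_mapping_and_init_page();                /* hypothetical */
    LWLockRelease(WALBufMappingLock);
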
@@ -468,37 +473,21 @@ typedef struct XLogCtlData
 	pg_atomic_uint64 logFlushResult;	/* last byte + 1 flushed */
 
 	/*
-	 * First initialized page in the cache (first byte position).
-	 */
-	XLogRecPtr	InitializedFrom;
-
-	/*
-	 * Latest reserved for initialization page in the cache (last byte
-	 * position + 1).
+	 * Latest initialized page in the cache (last byte position + 1).
 	 *
-	 * To change the identity of a buffer, you need to advance
-	 * InitializeReserved first. To change the identity of a buffer that's
+	 * To change the identity of a buffer (and InitializedUpTo), you need to
+	 * hold WALBufMappingLock. To change the identity of a buffer that's
 	 * still dirty, the old page needs to be written out first, and for that
 	 * you need WALWriteLock, and you need to ensure that there are no
 	 * in-progress insertions to the page by calling
 	 * WaitXLogInsertionsToFinish().
 	 */
-	pg_atomic_uint64 InitializeReserved;
-
-	/*
-	 * Latest initialized page in the cache (last byte position + 1).
-	 *
-	 * InitializedUpTo is updated after the buffer initialization. After
-	 * update, waiters got notification using InitializedUpToCondVar.
-	 */
-	pg_atomic_uint64 InitializedUpTo;
-	ConditionVariable InitializedUpToCondVar;
+	XLogRecPtr	InitializedUpTo;
 
 	/*
 	 * These values do not change after startup, although the pointed-to pages
-	 * and xlblocks values certainly do. xlblocks values are changed
-	 * lock-free according to the check for the xlog write position and are
-	 * accompanied by changes of InitializeReserved and InitializedUpTo.
+	 * and xlblocks values certainly do. xlblocks values are protected by
+	 * WALBufMappingLock.
 	 */
 	char	   *pages;			/* buffers for unwritten XLOG pages */
 	pg_atomic_uint64 *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */
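
After this hunk, the XLogCtlData members touched by the revert reduce to roughly the following. This is a sketch assembled from the surviving lines above; the rest of the struct is elided.

    typedef struct XLogCtlData
    {
        /* ... many other members elided ... */

        /*
         * Latest initialized page in the cache (last byte position + 1);
         * changing it, or any buffer's identity, requires WALBufMappingLock.
         */
        XLogRecPtr  InitializedUpTo;

        char       *pages;          /* buffers for unwritten XLOG pages */
        pg_atomic_uint64 *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ; values
                                     * protected by WALBufMappingLock */
    } XLogCtlData;
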
@@ -821,9 +810,9 @@ XLogInsertRecord(XLogRecData *rdata,
  * fullPageWrites from changing until the insertion is finished.
  *
  * Step 2 can usually be done completely in parallel. If the required WAL
- * page is not initialized yet, you have to go through AdvanceXLInsertBuffer,
- * which will ensure it is initialized. But the WAL writer tries to do that
- * ahead of insertions to avoid that from happening in the critical path.
+ * page is not initialized yet, you have to grab WALBufMappingLock to
+ * initialize it, but the WAL writer tries to do that ahead of insertions
+ * to avoid that from happening in the critical path.
  *
  *----------
  */
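
The WAL writer's ahead-of-insertions initialization mentioned here is the 'opportunistic' mode of AdvanceXLInsertBuffer. From memory of the surrounding file (so treat the exact call site as an assumption), XLogBackgroundFlush invokes it roughly like this:

    /*
     * Pre-initialize as many WAL buffers as possible without doing any
     * I/O; 'upto' is ignored when 'opportunistic' is true.
     */
    AdvanceXLInsertBuffer(InvalidXLogRecPtr, insertTLI, true);
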
@@ -2005,79 +1994,32 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
 	XLogRecPtr	NewPageEndPtr = InvalidXLogRecPtr;
 	XLogRecPtr	NewPageBeginPtr;
 	XLogPageHeader NewPage;
-	XLogRecPtr	ReservedPtr;
 	int			npages pg_attribute_unused() = 0;
 
-	/*
-	 * We must run the loop below inside the critical section as we expect
-	 * XLogCtl->InitializedUpTo to eventually keep up. The most of callers
-	 * already run inside the critical section. Except for WAL writer, which
-	 * passed 'opportunistic == true', and therefore we don't perform
-	 * operations that could error out.
-	 *
-	 * Start an explicit critical section anyway though.
-	 */
-	Assert(CritSectionCount > 0 || opportunistic);
-	START_CRIT_SECTION();
+	LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
 
-	/*--
-	 * Loop till we get all the pages in WAL buffer before 'upto' reserved for
-	 * initialization. Multiple process can initialize different buffers with
-	 * this loop in parallel as following.
-	 *
-	 * 1. Reserve page for initialization using XLogCtl->InitializeReserved.
-	 * 2. Initialize the reserved page.
-	 * 3. Attempt to advance XLogCtl->InitializedUpTo,
+	/*
+	 * Now that we have the lock, check if someone initialized the page
+	 * already.
 	 */
-	ReservedPtr = pg_atomic_read_u64(&XLogCtl->InitializeReserved);
-	while (upto >= ReservedPtr || opportunistic)
+	while (upto >= XLogCtl->InitializedUpTo || opportunistic)
 	{
-		Assert(ReservedPtr % XLOG_BLCKSZ == 0);
+		nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
 
 		/*
-		 * Get ending-offset of the buffer page we need to replace.
-		 *
-		 * We don't lookup into xlblocks, but rather calculate position we
-		 * must wait to be written. If it was written, xlblocks will have this
-		 * position (or uninitialized)
+		 * Get ending-offset of the buffer page we need to replace (this may
+		 * be zero if the buffer hasn't been used yet). Fall through if it's
+		 * already written out.
 		 */
-		if (ReservedPtr + XLOG_BLCKSZ > XLogCtl->InitializedFrom + XLOG_BLCKSZ * XLOGbuffers)
-			OldPageRqstPtr = ReservedPtr + XLOG_BLCKSZ - (XLogRecPtr) XLOG_BLCKSZ * XLOGbuffers;
-		else
-			OldPageRqstPtr = InvalidXLogRecPtr;
-
-		if (LogwrtResult.Write < OldPageRqstPtr && opportunistic)
+		OldPageRqstPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]);
+		if (LogwrtResult.Write < OldPageRqstPtr)
 		{
 			/*
-			 * If we just want to pre-initialize as much as we can without
-			 * flushing, give up now.
+			 * Nope, got work to do. If we just want to pre-initialize as much
+			 * as we can without flushing, give up now.
 			 */
-			upto = ReservedPtr - 1;
-			break;
-		}
-
-		/*
-		 * Attempt to reserve the page for initialization. Failure means that
-		 * this page got reserved by another process.
-		 */
-		if (!pg_atomic_compare_exchange_u64(&XLogCtl->InitializeReserved,
-											&ReservedPtr,
-											ReservedPtr + XLOG_BLCKSZ))
-			continue;
-
-		/*
-		 * Wait till page gets correctly initialized up to OldPageRqstPtr.
-		 */
-		nextidx = XLogRecPtrToBufIdx(ReservedPtr);
-		while (pg_atomic_read_u64(&XLogCtl->InitializedUpTo) < OldPageRqstPtr)
-			ConditionVariableSleep(&XLogCtl->InitializedUpToCondVar, WAIT_EVENT_WAL_BUFFER_INIT);
-		ConditionVariableCancelSleep();
-		Assert(pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]) == OldPageRqstPtr);
-
-		/* Fall through if it's already written out. */
-		if (LogwrtResult.Write < OldPageRqstPtr)
-		{
-			/* Nope, got work to do. */
+			if (opportunistic)
+				break;
 
 			/* Advance shared memory write request position */
 			SpinLockAcquire(&XLogCtl->info_lck);
@@ -2092,6 +2034,14 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
 			RefreshXLogWriteResult(LogwrtResult);
 			if (LogwrtResult.Write < OldPageRqstPtr)
 			{
+				/*
+				 * Must acquire write lock. Release WALBufMappingLock first,
+				 * to make sure that all insertions that we need to wait for
+				 * can finish (up to this same position). Otherwise we risk
+				 * deadlock.
+				 */
+				LWLockRelease(WALBufMappingLock);
+
 				WaitXLogInsertionsToFinish(OldPageRqstPtr);
 
 				LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
@@ -2119,16 +2069,21 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
 				 */
 				pgstat_report_fixed = true;
 			}
+			/* Re-acquire WALBufMappingLock and retry */
+			LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
+			continue;
 		}
 	}
 
 		/*
 		 * Now the next buffer slot is free and we can set it up to be the
 		 * next output page.
 		 */
-		NewPageBeginPtr = ReservedPtr;
+		NewPageBeginPtr = XLogCtl->InitializedUpTo;
 		NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
 
+		Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
+
 		NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
 
 		/*
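
Putting the hunks above together, the post-revert initialization loop has the following overall shape. This is a condensed sketch, with declarations, page-header setup, statistics, and assertions elided; see the full hunks for the real code.

    LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
    while (upto >= XLogCtl->InitializedUpTo || opportunistic)
    {
        nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
        OldPageRqstPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]);
        if (LogwrtResult.Write < OldPageRqstPtr)
        {
            /* The old page is still dirty. */
            if (opportunistic)
                break;          /* never flush in opportunistic mode */
            /* Drop the mapping lock, write the old page out, retry. */
            LWLockRelease(WALBufMappingLock);
            /* ... WaitXLogInsertionsToFinish() and XLogWrite() ... */
            LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
            continue;
        }
        /* ... initialize the page header at nextidx ... */
        pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], NewPageEndPtr);
        XLogCtl->InitializedUpTo = NewPageEndPtr;
    }
    LWLockRelease(WALBufMappingLock);
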
@@ -2192,100 +2147,12 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
 		 */
 		pg_write_barrier();
 
-		/*-----
-		 * Update the value of XLogCtl->xlblocks[nextidx] and try to advance
-		 * XLogCtl->InitializedUpTo in a lock-less manner.
-		 *
-		 * First, let's provide a formal proof of the algorithm. Let it be 'n'
-		 * process with the following variables in shared memory:
-		 * f - an array of 'n' boolean flags,
-		 * v - atomic integer variable.
-		 *
-		 * Also, let
-		 * i - a number of a process,
-		 * j - local integer variable,
-		 * CAS(var, oldval, newval) - compare-and-swap atomic operation
-		 *                            returning true on success,
-		 * write_barrier()/read_barrier() - memory barriers.
-		 *
-		 * The pseudocode for each process is the following.
-		 *
-		 *    j := i
-		 *    f[i] := true
-		 *    write_barrier()
-		 *    while CAS(v, j, j + 1):
-		 *        j := j + 1
-		 *        read_barrier()
-		 *        if not f[j]:
-		 *            break
-		 *
-		 * Let's prove that v eventually reaches the value of n.
-		 * 1. Prove by contradiction. Assume v doesn't reach n and stucks
-		 *    on k, where k < n.
-		 * 2. Process k attempts CAS(v, k, k + 1). 1). If, as we assumed, v
-		 *    gets stuck at k, then this CAS operation must fail. Therefore,
-		 *    v < k when process k attempts CAS(v, k, k + 1).
-		 * 3. If, as we assumed, v gets stuck at k, then the value k of v
-		 *    must be achieved by some process m, where m < k. The process
-		 *    m must observe f[k] == false. Otherwise, it will later attempt
-		 *    CAS(v, k, k + 1) with success.
-		 * 4. Therefore, corresponding read_barrier() (while j == k) on
-		 *    process m reached before write_barrier() of process k. But then
-		 *    process k attempts CAS(v, k, k + 1) after process m successfully
-		 *    incremented v to k, and that CAS operation must succeed.
-		 *    That leads to a contradiction. So, there is no such k (k < n)
-		 *    where v gets stuck. Q.E.D.
-		 *
-		 * To apply this proof to the code below, we assume
-		 * XLogCtl->InitializedUpTo will play the role of v with XLOG_BLCKSZ
-		 * granularity. We also assume setting XLogCtl->xlblocks[nextidx] to
-		 * NewPageEndPtr to play the role of setting f[i] to true. Also, note
-		 * that processes can't concurrently map different xlog locations to
-		 * the same nextidx because we previously requested that
-		 * XLogCtl->InitializedUpTo >= OldPageRqstPtr. So, a xlog buffer can
-		 * be taken for initialization only once the previous initialization
-		 * takes effect on XLogCtl->InitializedUpTo.
-		 */
-
 		pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], NewPageEndPtr);
-
-		pg_write_barrier();
-
-		while (pg_atomic_compare_exchange_u64(&XLogCtl->InitializedUpTo, &NewPageBeginPtr, NewPageEndPtr))
-		{
-			NewPageBeginPtr = NewPageEndPtr;
-			NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
-			nextidx = XLogRecPtrToBufIdx(NewPageBeginPtr);
-
-			pg_read_barrier();
-
-			if (pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]) != NewPageEndPtr)
-			{
-				/*
-				 * Page at nextidx wasn't initialized yet, so we can't move
-				 * InitializedUpto further. It will be moved by backend which
-				 * will initialize nextidx.
-				 */
-				ConditionVariableBroadcast(&XLogCtl->InitializedUpToCondVar);
-				break;
-			}
-		}
+		XLogCtl->InitializedUpTo = NewPageEndPtr;
 
 		npages++;
 	}
-
-	END_CRIT_SECTION();
-
-	/*
-	 * All the pages in WAL buffer before 'upto' were reserved for
-	 * initialization. However, some pages might be reserved by concurrent
-	 * processes. Wait till they finish initialization.
-	 */
-	while (upto >= pg_atomic_read_u64(&XLogCtl->InitializedUpTo))
-		ConditionVariableSleep(&XLogCtl->InitializedUpToCondVar, WAIT_EVENT_WAL_BUFFER_INIT);
-	ConditionVariableCancelSleep();
-
-	pg_read_barrier();
+	LWLockRelease(WALBufMappingLock);
 
 #ifdef WAL_DEBUG
 	if (XLOG_DEBUG && npages > 0)
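
For reference, the f/v algorithm from the proof comment deleted above can be written as standalone C11, with InitializedUpTo playing the role of v and the per-buffer xlblocks updates playing the role of the flags f. This is an illustrative model of the removed lock-free scheme under those assumptions, not PostgreSQL code; N and the function name are invented.

    #include <stdatomic.h>
    #include <stdbool.h>

    #define N 64                     /* number of participating processes */

    static atomic_bool f[N];         /* f[i]: slot i fully initialized */
    static atomic_uint v;            /* count of consecutively finished slots */

    /* Process i calls this once it has initialized its own slot. */
    void
    advance(unsigned i)
    {
        unsigned    j = i;

        /* Publish our slot; release ordering plays the write_barrier() role. */
        atomic_store_explicit(&f[i], true, memory_order_release);

        /* Try to advance v across every consecutive finished slot. */
        while (atomic_compare_exchange_strong(&v, &j, j + 1))
        {
            j = j + 1;
            /* Acquire ordering plays the read_barrier() role. */
            if (j >= N || !atomic_load_explicit(&f[j], memory_order_acquire))
                break;          /* slot j not ready; its owner will advance v */
        }
    }

Per the deleted proof, v cannot get stuck below the number of finished slots: whichever process observes the next flag already set carries v forward, which is why the removed code needed no lock here, only the condition variable to wake waiters.
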
@@ -5178,10 +5045,6 @@ XLOGShmemInit(void)
 	pg_atomic_init_u64(&XLogCtl->logWriteResult, InvalidXLogRecPtr);
 	pg_atomic_init_u64(&XLogCtl->logFlushResult, InvalidXLogRecPtr);
 	pg_atomic_init_u64(&XLogCtl->unloggedLSN, InvalidXLogRecPtr);
-
-	pg_atomic_init_u64(&XLogCtl->InitializeReserved, InvalidXLogRecPtr);
-	pg_atomic_init_u64(&XLogCtl->InitializedUpTo, InvalidXLogRecPtr);
-	ConditionVariableInit(&XLogCtl->InitializedUpToCondVar);
 }
 
 /*
@@ -6205,8 +6068,7 @@ StartupXLOG(void)
 		memset(page + len, 0, XLOG_BLCKSZ - len);
 
 		pg_atomic_write_u64(&XLogCtl->xlblocks[firstIdx], endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
-		pg_atomic_write_u64(&XLogCtl->InitializedUpTo, endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
-		XLogCtl->InitializedFrom = endOfRecoveryInfo->lastPageBeginPtr;
+		XLogCtl->InitializedUpTo = endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ;
 	}
 	else
 	{
@@ -6215,10 +6077,8 @@ StartupXLOG(void)
 		 * let the first attempt to insert a log record to initialize the next
 		 * buffer.
 		 */
-		pg_atomic_write_u64(&XLogCtl->InitializedUpTo, EndOfLog);
-		XLogCtl->InitializedFrom = EndOfLog;
+		XLogCtl->InitializedUpTo = EndOfLog;
 	}
-	pg_atomic_write_u64(&XLogCtl->InitializeReserved, pg_atomic_read_u64(&XLogCtl->InitializedUpTo));
 
 	/*
 	 * Update local and shared status. This is OK to do without any locks