Skip to content

Commit 7bea093

Browse files
committed
Add switch from recovery to normal mode
1 parent 0542358 commit 7bea093

File tree

5 files changed

+110
-54
lines changed

5 files changed

+110
-54
lines changed

contrib/mmts/arbiter.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ typedef struct
8383
TransactionId dxid; /* Transaction ID at destination node */
8484
TransactionId sxid; /* Transaction IO at sender node */
8585
csn_t csn; /* local CSN in case of sending data from replica to master, global CSN master->replica */
86-
int64 disabledNodeMask; /* bitmask of disabled nodes at the sender of message */
86+
nodemask_t disabledNodeMask; /* bitmask of disabled nodes at the sender of message */
8787
} MtmArbiterMessage;
8888

8989
typedef struct

contrib/mmts/multimaster.c

Lines changed: 98 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
#include "storage/proc.h"
4646
#include "utils/syscache.h"
4747
#include "replication/walsender.h"
48+
#include "replication/walsender_private.h"
4849
#include "replication/slot.h"
4950
#include "port/atomics.h"
5051
#include "tcop/utility.h"
@@ -60,16 +61,14 @@ typedef struct {
6061
bool isReplicated; /* transaction on replica */
6162
bool isDistributed; /* transaction performed INSERT/UPDATE/DELETE and has to be replicated to other nodes */
6263
bool containsDML; /* transaction contains DML statements */
63-
bool isPrepared; /* transaction was prepared for commit */
6464
csn_t snapshot; /* transaction snaphsot */
6565
} MtmCurrentTrans;
6666

6767
/* #define USE_SPINLOCK 1 */
6868

6969
typedef enum
7070
{
71-
HASH_LOCK_ID,
72-
COMMIT_LOCK_ID,
71+
MTM_STATE_LOCK_ID,
7372
N_LOCKS
7473
} MtmLockIds;
7574

@@ -142,6 +141,7 @@ int MtmReconnectAttempts;
142141
static int MtmQueueSize;
143142
static int MtmWorkers;
144143
static int MtmVacuumDelay;
144+
static int MtmMinRecoveryLag;
145145

146146
static ExecutorFinish_hook_type PreviousExecutorFinishHook;
147147
static ProcessUtility_hook_type PreviousProcessUtilityHook;
@@ -158,7 +158,7 @@ void MtmLock(LWLockMode mode)
158158
#ifdef USE_SPINLOCK
159159
SpinLockAcquire(&dtm->spinlock);
160160
#else
161-
LWLockAcquire(dtm->locks[HASH_LOCK_ID], mode);
161+
LWLockAcquire((LWLockId)&dtm->locks[MTM_STATE_LOCK_ID], mode);
162162
#endif
163163
}
164164

@@ -167,7 +167,7 @@ void MtmUnlock(void)
167167
#ifdef USE_SPINLOCK
168168
SpinLockRelease(&dtm->spinlock);
169169
#else
170-
LWLockRelease(dtm->locks[HASH_LOCK_ID]);
170+
LWLockRelease((LWLockId)&dtm->locks[MTM_STATE_LOCK_ID]);
171171
#endif
172172
}
173173

@@ -426,12 +426,14 @@ static void MtmInitialize()
426426
dtm->nNodes = MtmNodes;
427427
dtm->disabledNodeMask = 0;
428428
dtm->pglogicalNodeMask = 0;
429+
dtm->walSenderLockerMask = 0;
430+
dtm->nodeLockerMask = 0;
431+
dtm->nLockers = 0;
429432
dtm->votingTransactions = NULL;
430433
dtm->transListHead = NULL;
431434
dtm->transListTail = &dtm->transListHead;
432435
dtm->nReceivers = 0;
433436
dtm->timeShift = 0;
434-
pg_atomic_write_u32(&dtm->nCommittingTrans, 0);
435437
PGSemaphoreCreate(&dtm->votingSemaphore);
436438
PGSemaphoreReset(&dtm->votingSemaphore);
437439
SpinLockInit(&dtm->spinlock);
@@ -476,7 +478,6 @@ MtmBeginTransaction(MtmCurrentTrans* x)
476478
x->xid = GetCurrentTransactionIdIfAny();
477479
x->isReplicated = false;
478480
x->isDistributed = IsNormalProcessingMode() && dtm->status == MTM_ONLINE && MtmDoReplication && !am_walsender && !IsBackgroundWorker && !IsAutoVacuumWorkerProcess();
479-
x->isPrepared = false;
480481
x->containsDML = false;
481482
x->snapshot = MtmAssignCSN();
482483
x->gtid.xid = InvalidTransactionId;
@@ -487,6 +488,53 @@ MtmBeginTransaction(MtmCurrentTrans* x)
487488
}
488489

489490

491+
/* This function is called at transaction start with multimaster ock set */
492+
static void
493+
MtmCheckClusterLock()
494+
{
495+
while (true)
496+
{
497+
nodemask_t mask = dtm->walSenderLockerMask;
498+
if (mask != 0) {
499+
XLogRecPtr currLogPos = GetXLogInsertRecPtr();
500+
int i;
501+
timestamp_t delay = MIN_WAIT_TIMEOUT;
502+
for (i = 0; mask != 0; i++, mask >>= 1) {
503+
if (mask & 1) {
504+
if (WalSndCtl->walsnds[i].sentPtr != currLogPos) {
505+
/* recovery is in progress */
506+
break;
507+
} else {
508+
/* recovered replica catched up with master */
509+
dtm->walSenderLockerMask &= ~((nodemask_t)1 << i);
510+
}
511+
}
512+
}
513+
if (mask != 0) {
514+
/* some "almost catch-up" wal-senders are still working */
515+
/* Do not start new transactions until them complete */
516+
MtmUnlock();
517+
MtmSleep(delay);
518+
if (delay*2 <= MAX_WAIT_TIMEOUT) {
519+
delay *= 2;
520+
}
521+
MtmLock(LW_EXCLUSIVE);
522+
continue;
523+
} else {
524+
/* All lockers are synchronized their logs */
525+
/* Remove lock and mark them as receovered */
526+
Assert(dtm->walSenderLockerMask == 0);
527+
Assert((dtm->nodeLockerMask & dtm->disabledNodeMask) == dtm->nodeLockerMask);
528+
dtm->disabledNodeMask &= ~dtm->nodeLockerMask;
529+
dtm->nNodes += dtm->nLockers;
530+
dtm->nLockers = 0;
531+
dtm->nodeLockerMask = 0;
532+
}
533+
}
534+
break;
535+
}
536+
}
537+
490538
/*
491539
* We need to pass snapshot to WAL-sender, so create record in transaction status hash table
492540
* before commit
@@ -499,15 +547,12 @@ static void MtmPrepareTransaction(MtmCurrentTrans* x)
499547
if (!x->isDistributed) {
500548
return;
501549
}
502-
/* Check that commits are not disabled */
503-
LWLockAcquire(dtm->locks[COMMIT_LOCK_ID], LW_SHARED);
504-
LWLockRelease(dtm->locks[COMMIT_LOCK_ID]);
505550

506-
pg_atomic_fetch_add_u32(dtm->nCommittingTransactions, 1);
507-
x->isPrepared = true;
508551
x->xid = GetCurrentTransactionId();
509552

510553
MtmLock(LW_EXCLUSIVE);
554+
MtmCheckClusterLock();
555+
511556
ts = hash_search(xid2state, &x->xid, HASH_ENTER, NULL);
512557
ts->status = TRANSACTION_STATUS_IN_PROGRESS;
513558
ts->snapshot = x->isReplicated || !x->containsDML ? INVALID_CSN : x->snapshot;
@@ -546,9 +591,6 @@ MtmEndTransaction(MtmCurrentTrans* x, bool commit)
546591
MtmAdjustSubtransactions(ts);
547592
MtmUnlock();
548593
}
549-
if (x->isPrepared) {
550-
pg_atomic_fetch_add_u32(dtm->nCommittingTransactions, -1);
551-
}
552594
x->snapshot = INVALID_CSN;
553595
x->xid = InvalidTransactionId;
554596
x->gtid.xid = InvalidTransactionId;
@@ -568,39 +610,29 @@ void MtmSendNotificationMessage(MtmTransState* ts)
568610
}
569611
}
570612

571-
void MtmUpdateStatus(bool recovered)
572-
{
573-
if (dtm->status == MTM_RECOVERY) {
574-
MtmLock(LW_EXCLUSIVE);
575-
dtm->status = MTM_ONLINE; /* Is it all we shoudl do t switch to nortmal state */
576-
MtmUnlock();
577-
}
578-
}
579-
580-
void MtmRecoveryCompleted(int nodeId)
613+
/*
614+
* This function is called by WAL sender when start sending new transaction
615+
*/
616+
bool MtmIsRecoveredNode(int nodeId)
581617
{
582-
if (BIT_CHECK(dtm->pglogicalNodeMask, nodeId-1)) {
583-
if (MyWalSnd->sentPtr == GetXLogInsertRecPtr()) {
584-
/* Ok, now we done with recovery of node */
618+
if (BIT_CHECK(dtm->disabledNodeMask, nodeId-1)) {
619+
Assert(MyWalSnd != NULL);
620+
if (!BIT_CHECK(dtm->nodeLockerMask, nodeId-1)
621+
&& MyWalSnd->sentPtr + MtmMinRecoveryLag > GetXLogInsertRecPtr())
622+
{
623+
/* Wal sender almost catched up */
624+
/* Lock cluster preventing new transaction to start until wal is completely replayed */
585625
MtmLock(LW_EXCLUSIVE);
586-
dtm->pglogicalNodeMask &= (int64)1 << (nodeId-1); /* now node is assumed as recovered */
587-
dtm->nNodes += 1;
626+
dtm->nodeLockerMask |= (nodemask_t)1 << (nodeId-1);
627+
dtm->walSenderLockerMask |= (nodemask_t)1 << (MyWalSnd - WalSndCtl->walsnds);
628+
dtm->nLockers += 1;
588629
MtmUnlock();
589-
590-
LWLockRelease(dtm->locks[COMMIT_LOCK_ID]); /* enable commits */
591-
592-
return true;
593-
} else if (MyWalSnd->sentPtr + MtmSlotDelayThreashold > GetXLogInsertRecPtr()) {
594-
/* we almost done with recovery of node.. */
595-
LWLockAcquire(dtm->locks[COMMIT_LOCK_ID], LW_EXCLUSIVE); /* disable new commits */
596630
}
597-
return false;
598-
} else {
599631
return true;
600632
}
633+
return false;
601634
}
602635

603-
604636
static bool
605637
MtmCommitTransaction(TransactionId xid, int nsubxids, TransactionId *subxids)
606638
{
@@ -717,6 +749,21 @@ _PG_init(void)
717749
if (!process_shared_preload_libraries_in_progress)
718750
return;
719751

752+
DefineCustomIntVariable(
753+
"multimaster.min_recovery_lag",
754+
"Minamal lag of WAL-sender performing recovery after which cluster is locked until recovery is completed",
755+
NULL,
756+
&MtmMinRecoveryLag,
757+
100000,
758+
1,
759+
INT_MAX,
760+
PGC_BACKEND,
761+
0,
762+
NULL,
763+
NULL,
764+
NULL
765+
);
766+
720767
DefineCustomIntVariable(
721768
"multimaster.vacuum_delay",
722769
"Minimal age of records which can be vacuumed (seconds)",
@@ -897,6 +944,12 @@ _PG_fini(void)
897944
* ***************************************************************************
898945
*/
899946

947+
static void MtmSwitchFromRecoveryToNormalMode()
948+
{
949+
dtm->status = MTM_ONLINE;
950+
/* ??? Something else to do here? */
951+
}
952+
900953
void MtmJoinTransaction(GlobalTransactionId* gtid, csn_t globalSnapshot)
901954
{
902955
csn_t localSnapshot;
@@ -910,6 +963,11 @@ void MtmJoinTransaction(GlobalTransactionId* gtid, csn_t globalSnapshot)
910963
elog(ERROR, "Too old snapshot: requested %ld, current %ld", globalSnapshot, localSnapshot);
911964
}
912965

966+
if (!TransactionIdIsValid(gtid->xid)) {
967+
Assert(dtm->status == MTM_RECOVERY);
968+
} else if (dtm->status == MTM_RECOVERY) {
969+
MtmSwitchFromRecoveryToNormalMode();
970+
}
913971
dtmTx.gtid = *gtid;
914972
dtmTx.xid = GetCurrentTransactionId();
915973
dtmTx.snapshot = globalSnapshot;

contrib/mmts/multimaster.h

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
#define MTM_TUPLE_TRACE(fmt, ...) fprintf(stderr, fmt, ## __VA_ARGS__)
1313
*/
1414

15-
#define BIT_CHECK(mask, bit) ((mask) & ((int64)1 << (bit)))
15+
#define BIT_CHECK(mask, bit) (((mask) & ((nodemask_t)1 << (bit))) != 0)
1616

1717
#define MULTIMASTER_NAME "mtm"
1818
#define MULTIMASTER_SCHEMA_NAME "mtm"
@@ -30,6 +30,7 @@ typedef uint64 csn_t; /* commit serial number */
3030
#define INVALID_CSN ((csn_t)-1)
3131

3232
typedef uint64 timestamp_t;
33+
typedef uint64_t nodemask_t;
3334

3435
/* Identifier of global transaction */
3536
typedef struct
@@ -95,13 +96,15 @@ typedef struct
9596
PGSemaphoreData votingSemaphore; /* semaphore used to notify mtm-sender about new responses to coordinator */
9697
LWLockPadded *locks; /* multimaster lock tranche */
9798
TransactionId oldestXid; /* XID of oldest transaction visible by any active transaction (local or global) */
98-
int64 disabledNodeMask; /* bitmask of disabled nodes (so no more than 64 nodes in multimaster:) */
99-
int64 pglogicalNodeMask; /* bitmask of started pglogic receviers */
99+
nodemask_t disabledNodeMask; /* bitmask of disabled nodes (so no more than 64 nodes in multimaster:) */
100+
nodemask_t pglogicalNodeMask; /* bitmask of started pglogic receivers */
101+
nodemask_t walSenderLockerMask; /* Mask of WAL-senders IDs locking the cluster */
102+
nodemask_t nodeLockerMask; /* Mask of node IDs which WAL-senders are locking the cluster */
100103
int nNodes; /* number of active nodes */
101104
int nReceivers; /* number of initialized logical receivers (used to determine moment when Mtm intialization is completed */
105+
int nLockers; /* number of lockers */
102106
long timeShift; /* local time correction */
103107
csn_t csn; /* last obtained CSN: used to provide unique acending CSNs based on system time */
104-
pg_atomic_uint32 nCommittingTrans; /* nubmer of transactions i process of commit */
105108
MtmTransState* votingTransactions; /* L1-list of replicated transactions sendings notifications to coordinator.
106109
This list is used to pass information to mtm-sender BGW */
107110
MtmTransState* transListHead; /* L1 list of all finished transactions present in xid2state hash.
@@ -141,6 +144,5 @@ extern void MtmDropNode(int nodeId, bool dropSlot);
141144
extern MtmState* MtmGetState(void);
142145
extern timestamp_t MtmGetCurrentTime(void);
143146
extern void MtmSleep(timestamp_t interval);
144-
extern void MtmRecoveryCompleted(int nodeId);
145-
extern void MtmUpdateStatus(bool recovered);
147+
extern bool MtmIsRecoveredNode(int nodeId);
146148
#endif

contrib/mmts/pglogical_apply.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -466,9 +466,7 @@ read_rel(StringInfo s, LOCKMODE mode)
466466
static void
467467
process_remote_commit(StringInfo s)
468468
{
469-
bool recovered = pq_getmsgbyte(s);
470469
CommitTransactionCommand();
471-
MtmUpdateStatus(recovered);
472470
}
473471

474472
static void

contrib/mmts/pglogical_proto.c

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@
3939
typedef struct PGLogicalProtoMM
4040
{
4141
PGLogicalProtoAPI api;
42-
MtmState* state;
4342
int nodeId;
4443
bool isLocal;
4544
} PGLogicalProtoMM;
@@ -107,14 +106,15 @@ pglogical_write_begin(StringInfo out, PGLogicalOutputData *data,
107106
{
108107
PGLogicalProtoMM* mm = (PGLogicalProtoMM*)data->api;
109108
csn_t csn = MtmTransactionSnapshot(txn->xid);
109+
bool isRecovery = MtmIsRecoveredNode(mm->nodeId);
110110
MTM_TRACE("pglogical_write_begin %d CSN=%ld\n", txn->xid, csn);
111-
if (csn == INVALID_CSN || BIT_CHECK(mm->state->disabledNodeMask, mm->nodeId-1)) {
111+
if (csn == INVALID_CSN && !isRecovery) {
112112
mm->isLocal = true;
113113
} else {
114114
mm->isLocal = false;
115115
pq_sendbyte(out, 'B'); /* BEGIN */
116116
pq_sendint(out, MtmNodeId, 4);
117-
pq_sendint(out, txn->xid, 4);
117+
pq_sendint(out, isRecovery ? InvalidTransactionId : txn->xid, 4);
118118
pq_sendint64(out, csn);
119119
}
120120
}
@@ -129,7 +129,6 @@ pglogical_write_commit(StringInfo out, PGLogicalOutputData *data,
129129
PGLogicalProtoMM* mm = (PGLogicalProtoMM*)data->api;
130130
if (!mm->isLocal) {
131131
pq_sendbyte(out, 'C'); /* sending COMMIT */
132-
pq_sendbyte(out, MtmRecoveryCompleted(mm->nodeId));
133132
}
134133
}
135134

@@ -380,7 +379,6 @@ pglogical_init_api(PGLogicalProtoType typ)
380379
PGLogicalProtoMM* pmm = palloc0(sizeof(PGLogicalProtoMM));
381380
PGLogicalProtoAPI* res = &pmm->api;
382381
pmm->isLocal = false;
383-
pmm->state = MtmGetState();
384382
sscanf(MyReplicationSlot->data.name.data, MULTIMASTER_SLOT_PATTERN, &pmm->nodeId);
385383
res->write_rel = pglogical_write_rel;
386384
res->write_begin = pglogical_write_begin;

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy