Skip to content

Commit 6e5d06a

Browse files
committed
Update XTM documentation
1 parent ab36908 commit 6e5d06a

File tree

7 files changed

+357
-54
lines changed

7 files changed

+357
-54
lines changed

contrib/mmts/arbiter.c

Lines changed: 46 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ typedef struct
8383
TransactionId dxid; /* Transaction ID at destination node */
8484
TransactionId sxid; /* Transaction IO at sender node */
8585
csn_t csn; /* local CSN in case of sending data from replica to master, global CSN master->replica */
86+
int64 disabledNodeMask; /* bitmask of disabled nodes at the sender of message */
8687
} MtmArbiterMessage;
8788

8889
typedef struct
@@ -109,7 +110,8 @@ static char const* const messageText[] =
109110
"ABORT",
110111
"PREPARED",
111112
"COMMITTED",
112-
"ABORTED"
113+
"ABORTED",
114+
"STATUS"
113115
};
114116

115117

@@ -276,11 +278,28 @@ static int MtmConnectSocket(char const* host, int port, int max_attempts)
276278
msg.dxid = HANDSHAKE_MAGIC;
277279
msg.sxid = ShmemVariableCache->nextXid;
278280
msg.csn = MtmGetCurrentTime();
281+
msg.disabledNodeMask = ds->disabledNodeMask;
279282
if (!MtmWriteSocket(sd, &msg, sizeof msg)) {
280283
elog(WARNING, "Arbiter failed to send handshake message to %s:%d: %d", host, port, errno);
281284
close(sd);
282285
goto Retry;
283286
}
287+
if (MtmReadSocket(sd, &msg, sizeof msg) != sizeof(msg)) {
288+
elog(WARNING, "Arbiter failed to receive response for handshake message from %s:%d: %d", host, port, errno);
289+
close(sd);
290+
goto Retry;
291+
}
292+
if (msg.code != MSG_STATUS || msg.dxid != HANDSHAKE_MAGIC) {
293+
elog(WARNING, "Arbiter get unexpected response %d for handshake message from %s:%d: %d", msg.code, host, port, errno);
294+
close(sd);
295+
goto Retry;
296+
}
297+
298+
if (BIT_CHECK(msg.disabledNodeMask, MtmNodeId-1)) {
299+
elog(WARNING, "Node is switched to recovery mode");
300+
ds->status = MTM_RECOVERY;
301+
}
302+
ds->disabledNodeMask = msg.disabledNodeMask;
284303
return sd;
285304
}
286305
}
@@ -315,11 +334,16 @@ static void MtmOpenConnections()
315334
sockets[i] = MtmConnectSocket(host, MtmArbiterPort + i + 1, MtmConnectAttempts);
316335
if (sockets[i] < 0) {
317336
MtmDropNode(i+1, false);
318-
}
337+
}
319338
} else {
320339
sockets[i] = -1;
321340
}
322341
}
342+
if (ds->nNodes < MtmNodes/2+1) { /* no quorum */
343+
ds->status = MTM_OFFLINE;
344+
} else if (ds->status == MTM_INITIALIZATION) {
345+
ds->status = MTM_CONNECTED;
346+
}
323347
}
324348

325349

@@ -362,8 +386,13 @@ static void MtmAcceptOneConnection()
362386
close(fd);
363387
} else{
364388
Assert(msg.node > 0 && msg.node <= MtmNodes && msg.node != MtmNodeId);
365-
if (BIT_SET(ds->disabledNodeMask, msg.node-1)) {
366-
elog(WARNING, "Reject attempt to reconnect from disabled node %d", msg.node);
389+
msg.code = MSG_STATUS;
390+
msg.disabledNodeMask = ds->disabledNodeMask;
391+
msg.dxid = HANDSHAKE_MAGIC;
392+
msg.sxid = ShmemVariableCache->nextXid;
393+
msg.csn = MtmGetCurrentTime();
394+
if (!MtmWriteSocket(fd, &msg, sizeof msg)) {
395+
elog(WARNING, "Arbiter failed to write response for handshake message from node %d", msg.node);
367396
close(fd);
368397
} else {
369398
elog(NOTICE, "Arbiter established connection with node %d", msg.node);
@@ -427,6 +456,7 @@ static void MtmAppendBuffer(MtmBuffer* txBuffer, TransactionId xid, int node, Mt
427456
buf->data[buf->used].sxid = ts->xid;
428457
buf->data[buf->used].csn = ts->csn;
429458
buf->data[buf->used].node = MtmNodeId;
459+
buf->data[buf->used].disabledNodeMask = ds->disabledNodeMask;
430460
buf->used += 1;
431461
}
432462

@@ -659,12 +689,20 @@ static void MtmTransReceiver(Datum arg)
659689
switch (msg->code) {
660690
case MSG_PREPARE:
661691
Assert(ts->status == TRANSACTION_STATUS_IN_PROGRESS);
662-
ts->status = TRANSACTION_STATUS_UNKNOWN;
663-
ts->csn = MtmAssignCSN();
664-
ts->cmd = MSG_PREPARED;
692+
if ((msg->disabledNodeMask & ~ds->disabledNodeMask) != 0) {
693+
/* Coordinator's disabled mask is wider than my:so reject such transaction to avoid
694+
commit on smaller subset of nodes */
695+
ts->status = TRANSACTION_STATUS_ABORTED;
696+
ts->cmd = MSG_ABORT;
697+
MtmAdjustSubtransactions(ts);
698+
MtmWakeUpBackend(ts);
699+
} else {
700+
ts->status = TRANSACTION_STATUS_UNKNOWN;
701+
ts->csn = MtmAssignCSN();
702+
ts->cmd = MSG_PREPARED;
703+
}
665704
MtmSendNotificationMessage(ts);
666705
break;
667-
break;
668706
case MSG_COMMIT:
669707
Assert(ts->status == TRANSACTION_STATUS_UNKNOWN);
670708
Assert(ts->csn < msg->csn);

contrib/mmts/multimaster.c

Lines changed: 37 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ typedef struct {
7070
#define USEC 1000000
7171
#define MIN_WAIT_TIMEOUT 1000
7272
#define MAX_WAIT_TIMEOUT 100000
73+
#define STATUS_POLL_DELAY USEC
7374

7475
void _PG_init(void);
7576
void _PG_fini(void);
@@ -147,7 +148,7 @@ static void MtmProcessUtility(Node *parsetree, const char *queryString,
147148
void MtmLock(LWLockMode mode)
148149
{
149150
#ifdef USE_SPINLOCK
150-
SpinLockAcquire(&dtm->hashSpinlock);
151+
SpinLockAcquire(&dtm->spinlock);
151152
#else
152153
LWLockAcquire(dtm->hashLock, mode);
153154
#endif
@@ -156,7 +157,7 @@ void MtmLock(LWLockMode mode)
156157
void MtmUnlock(void)
157158
{
158159
#ifdef USE_SPINLOCK
159-
SpinLockRelease(&dtm->hashSpinlock);
160+
SpinLockRelease(&dtm->spinlock);
160161
#else
161162
LWLockRelease(dtm->hashLock);
162163
#endif
@@ -409,20 +410,22 @@ static void MtmInitialize()
409410
dtm = (MtmState*)ShmemInitStruct(MULTIMASTER_NAME, sizeof(MtmState), &found);
410411
if (!found)
411412
{
413+
dtm->status = MTM_INITIALIZATION;
414+
dtm->recoverySlot = 0;
412415
dtm->hashLock = (LWLock*)GetNamedLWLockTranche(MULTIMASTER_NAME);
413416
dtm->csn = MtmGetCurrentTime();
414417
dtm->oldestXid = FirstNormalTransactionId;
415418
dtm->nNodes = MtmNodes;
416419
dtm->disabledNodeMask = 0;
420+
dtm->pglogicalNodeMask = 0;
417421
dtm->votingTransactions = NULL;
418422
dtm->transListHead = NULL;
419-
dtm->transListTail = &dtm->transListHead;
420-
pg_atomic_write_u32(&dtm->nReceivers, 0);
423+
dtm->transListTail = &dtm->transListHead;
424+
dtm->nReceivers = 0;
421425
dtm->timeShift = 0;
422-
dtm->initialized = false;
423426
PGSemaphoreCreate(&dtm->votingSemaphore);
424427
PGSemaphoreReset(&dtm->votingSemaphore);
425-
SpinLockInit(&dtm->hashSpinlock);
428+
SpinLockInit(&dtm->spinlock);
426429
BgwPoolInit(&dtm->pool, MtmExecutor, MtmDatabaseName, MtmQueueSize);
427430
RegisterXactCallback(MtmXactCallback, NULL);
428431
dtmTx.snapshot = INVALID_CSN;
@@ -463,7 +466,7 @@ MtmBeginTransaction(MtmCurrentTrans* x)
463466
MtmLock(LW_EXCLUSIVE);
464467
x->xid = GetCurrentTransactionIdIfAny();
465468
x->isReplicated = false;
466-
x->isDistributed = IsNormalProcessingMode() && dtm->initialized && MtmDoReplication && !am_walsender && !IsBackgroundWorker && !IsAutoVacuumWorkerProcess();
469+
x->isDistributed = IsNormalProcessingMode() && dtm->status == MTM_ONLINE && MtmDoReplication && !am_walsender && !IsBackgroundWorker && !IsAutoVacuumWorkerProcess();
467470
x->containsDML = false;
468471
x->snapshot = MtmAssignCSN();
469472
x->gtid.xid = InvalidTransactionId;
@@ -575,8 +578,6 @@ MtmFinishTransaction(TransactionId xid, int nsubxids, TransactionId *subxids, Xi
575578
XidStatus prevStatus = TRANSACTION_STATUS_UNKNOWN;
576579
bool found;
577580

578-
Assert(status == TRANSACTION_STATUS_ABORTED);
579-
580581
MtmLock(LW_EXCLUSIVE);
581582
ts = hash_search(xid2state, &xid, HASH_ENTER, &found);
582583
if (!found) {
@@ -590,7 +591,7 @@ MtmFinishTransaction(TransactionId xid, int nsubxids, TransactionId *subxids, Xi
590591
ts->status = status;
591592
MtmAdjustSubtransactions(ts);
592593

593-
if (prevStatus != TRANSACTION_STATUS_ABORTED) {
594+
if (dtm->status != MTM_RECOVERY && prevStatus != TRANSACTION_STATUS_ABORTED) {
594595
ts->cmd = MSG_ABORTED;
595596
MtmSendNotificationMessage(ts);
596597
}
@@ -607,7 +608,7 @@ MtmSetTransactionStatus(TransactionId xid, int nsubxids, TransactionId *subxids,
607608
MTM_TRACE("%d: MtmSetTransactionStatus %u(%u) = %u, isDistributed=%d\n", getpid(), xid, dtmTx.xid, status, dtmTx.isDistributed);
608609
if (xid == dtmTx.xid && dtmTx.isDistributed)
609610
{
610-
if (status == TRANSACTION_STATUS_ABORTED || !dtmTx.containsDML)
611+
if (status == TRANSACTION_STATUS_ABORTED || !dtmTx.containsDML || dtm->status == MTM_RECOVERY)
611612
{
612613
MtmFinishTransaction(xid, nsubxids, subxids, status);
613614
MTM_TRACE("Finish transaction %d, status=%d, DML=%d\n", xid, status, dtmTx.containsDML);
@@ -863,11 +864,17 @@ void MtmJoinTransaction(GlobalTransactionId* gtid, csn_t globalSnapshot)
863864
dtmTx.containsDML = true;
864865
}
865866

866-
void MtmReceiverStarted()
867+
void MtmReceiverStarted(int nodeId)
867868
{
868-
if (pg_atomic_fetch_add_u32(&dtm->nReceivers, 1) == dtm->nNodes-2) {
869-
dtm->initialized = true;
869+
SpinLockAcquire(&dtm->spinlock);
870+
if (!BIT_CHECK(dtm->pglogicalNodeMask, nodeId-1)) {
871+
dtm->pglogicalNodeMask |= (int64)1 << (nodeId-1);
872+
if (++dtm->nReceivers == dtm->nNodes-1) {
873+
Assert(dtm->status == MTM_CONNECTED);
874+
dtm->status = MTM_ONLINE;
875+
}
870876
}
877+
SpinLockRelease(&dtm->spinlock);
871878
}
872879

873880
csn_t MtmTransactionSnapshot(TransactionId xid)
@@ -885,10 +892,23 @@ csn_t MtmTransactionSnapshot(TransactionId xid)
885892
return snapshot;
886893
}
887894

888-
895+
MtmSlotMode MtmReceiverSlotMode(int nodeId)
896+
{
897+
while (dtm->status != MTM_CONNECTED && dtm->status != MTM_ONLINE) {
898+
if (dtm->status == MTM_RECOVERY) {
899+
if (dtm->recoverySlot == 0 || dtm->recoverySlot == nodeId) {
900+
dtm->recoverySlot = nodeId;
901+
return SLOT_OPEN_EXISTED;
902+
}
903+
}
904+
MtmSleep(STATUS_POLL_DELAY);
905+
}
906+
return dtm->recoverySlot ? SLOT_CREATE_NEW : SLOT_OPEN_ALWAYS;
907+
}
908+
889909
void MtmDropNode(int nodeId, bool dropSlot)
890910
{
891-
if (!BIT_SET(dtm->disabledNodeMask, nodeId-1))
911+
if (!BIT_CHECK(dtm->disabledNodeMask, nodeId-1))
892912
{
893913
if (nodeId <= 0 || nodeId > dtm->nNodes)
894914
{
@@ -969,7 +989,7 @@ static void MtmBroadcastUtilityStmt(char const* sql, bool ignoreError)
969989
p = conn_str_end;
970990
}
971991
*p = '\0';
972-
if (!BIT_SET(disabledNodeMask, i))
992+
if (!BIT_CHECK(disabledNodeMask, i))
973993
{
974994
conns[i] = PQconnectdb(conn_str);
975995
if (PQstatus(conns[i]) != CONNECTION_OK)

contrib/mmts/multimaster.h

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
#define MTM_TUPLE_TRACE(fmt, ...) fprintf(stderr, fmt, ## __VA_ARGS__)
1313
*/
1414

15-
#define BIT_SET(mask, bit) ((mask) & ((int64)1 << (bit)))
15+
#define BIT_CHECK(mask, bit) ((mask) & ((int64)1 << (bit)))
1616

1717
#define MULTIMASTER_NAME "mtm"
1818
#define MULTIMASTER_SCHEMA_NAME "mtm"
@@ -46,9 +46,25 @@ typedef enum
4646
MSG_ABORT,
4747
MSG_PREPARED,
4848
MSG_COMMITTED,
49-
MSG_ABORTED
49+
MSG_ABORTED,
50+
MSG_STATUS
5051
} MtmMessageCode;
5152

53+
typedef enum
54+
{
55+
MTM_INITIALIZATION, /* Initial status */
56+
MTM_OFFLINE, /* Node is out of quorum */
57+
MTM_CONNECTED, /* Arbiter is established connections with other nodes */
58+
MTM_ONLINE, /* Ready to receive client's queries */
59+
MTM_RECOVERY /* Node is in recovery process */
60+
} MtmNodeStatus;
61+
62+
typedef enum
63+
{
64+
SLOT_CREATE_NEW, /* create new slot (drop existed) */
65+
SLOT_OPEN_EXISTED, /* open existed slot */
66+
SLOT_OPEN_ALWAYS, /* open existed slot or create new if noty exists */
67+
} MtmSlotMode;
5268

5369
typedef struct MtmTransState
5470
{
@@ -71,16 +87,18 @@ typedef struct MtmTransState
7187

7288
typedef struct
7389
{
74-
volatile slock_t hashSpinlock; /* spinlock used to protect access to hash table */
90+
MtmNodeStatus status; /* Status of this node */
91+
int recoverySlot; /* NodeId of recovery slot or 0 if none */
92+
volatile slock_t spinlock; /* spinlock used to protect access to hash table */
7593
PGSemaphoreData votingSemaphore; /* semaphore used to notify mtm-sender about new responses to coordinator */
7694
LWLockId hashLock; /* lock to synchronize access to hash table */
7795
TransactionId oldestXid; /* XID of oldest transaction visible by any active transaction (local or global) */
78-
int64 disabledNodeMask; /* bitmask of disable nodes (so no more than 64 nodes in multimaster:) */
96+
int64 disabledNodeMask; /* bitmask of disabled nodes (so no more than 64 nodes in multimaster:) */
97+
int64 pglogicalNodeMask; /* bitmask of started pglogic receviers */
7998
int nNodes; /* number of active nodes */
80-
pg_atomic_uint32 nReceivers; /* number of initialized logical receivers (used to determine moment when Mtm intialization is completed */
81-
long timeShift; /* local time correction */
82-
bool initialized;
83-
csn_t csn; /* last obtained CSN: used to provide unique acending CSNs based on system time */
99+
int nReceivers; /* number of initialized logical receivers (used to determine moment when Mtm intialization is completed */
100+
long timeShift; /* local time correction */
101+
csn_t csn; /* last obtained CSN: used to provide unique acending CSNs based on system time */
84102
MtmTransState* votingTransactions; /* L1-list of replicated transactions sendings notifications to coordinator.
85103
This list is used to pass information to mtm-sender BGW */
86104
MtmTransState* transListHead; /* L1 list of all finished transactions present in xid2state hash.
@@ -107,7 +125,8 @@ extern csn_t MtmTransactionSnapshot(TransactionId xid);
107125
extern csn_t MtmAssignCSN(void);
108126
extern csn_t MtmSyncClock(csn_t csn);
109127
extern void MtmJoinTransaction(GlobalTransactionId* gtid, csn_t snapshot);
110-
extern void MtmReceiverStarted(void);
128+
extern void MtmReceiverStarted(int nodeId);
129+
extern MtmSlotMode MtmReceiverSlotMode(int nodeId);
111130
extern void MtmExecute(void* work, int size);
112131
extern void MtmExecutor(int id, void* work, size_t size);
113132
extern HTAB* MtmCreateHash(void);

contrib/mmts/pglogical_proto.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ pglogical_write_begin(StringInfo out, PGLogicalOutputData *data,
108108
PGLogicalProtoMM* mm = (PGLogicalProtoMM*)data->api;
109109
csn_t csn = MtmTransactionSnapshot(txn->xid);
110110
MTM_TRACE("pglogical_write_begin %d CSN=%ld\n", txn->xid, csn);
111-
if (csn == INVALID_CSN || BIT_SET(mm->state->disabledNodeMask, mm->nodeId-1)) {
111+
if (csn == INVALID_CSN || BIT_CHECK(mm->state->disabledNodeMask, mm->nodeId-1)) {
112112
mm->isLocal = true;
113113
} else {
114114
mm->isLocal = false;

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy