Skip to content

Commit f36de77

Browse files
committed
Change transaction status polling
1 parent 127dc15 commit f36de77

File tree

3 files changed

+224
-17
lines changed

3 files changed

+224
-17
lines changed

contrib/mmts/arbiter.c

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ static bool send_heartbeat;
8484
static timestamp_t last_sent_heartbeat;
8585
static TimeoutId heartbeat_timer;
8686
static nodemask_t busy_mask;
87+
static timestamp_t last_heartbeat_to_node[MAX_NODES];
8788

8889
static void MtmSender(Datum arg);
8990
static void MtmReceiver(Datum arg);
@@ -369,11 +370,15 @@ static void MtmSendHeartbeat()
369370
if (!MtmSendToNode(i, &msg, sizeof(msg))) {
370371
elog(LOG, "Arbiter failed to send heartbeat to node %d", i+1);
371372
} else {
373+
if (last_heartbeat_to_node[i] + MSEC_TO_USEC(MtmHeartbeatSendTimeout)*2 < now) {
374+
MTM_LOG1("Last hearbeat to node %d was sent %ld microseconds ago", i+1, now - last_heartbeat_to_node[i]);
375+
}
376+
last_heartbeat_to_node[i] = now;
372377
/* Connectivity mask can be cleared by MtmWatchdog: in this case sockets[i] >= 0 */
373378
if (BIT_CHECK(Mtm->connectivityMask, i)) {
374379
close(sockets[i]);
375380
sockets[i] = -1;
376-
MtmReconnectNode(i+1);
381+
MtmReconnectNode(i+1); /* set reconnect mask to force node reconnent */
377382
//MtmOnNodeConnect(i+1);
378383
}
379384
MTM_LOG4("Send heartbeat to node %d with timestamp %ld", i+1, now);
@@ -386,6 +391,9 @@ static void MtmSendHeartbeat()
386391

387392
}
388393

394+
/* This function shoudl be called from all places where sender can be blocked.
395+
* It checks send_heartbeat flag set by timer and if it is set hthen sends heartbeats to all alive nodes
396+
*/
389397
void MtmCheckHeartbeat()
390398
{
391399
if (send_heartbeat && !stop) {
@@ -544,6 +552,9 @@ static bool MtmSendToNode(int node, void const* buf, int size)
544552
BIT_SET(busy_mask, node);
545553
while (true) {
546554
#if 0
555+
/* Original intention was to reestablish connectect when reconnet mask is set to avoid hanged-up connection.
556+
* But reconnectMask is set not only when connection is broken, so breaking connection in all this cases cause avalunch of connection failures.
557+
*/
547558
if (sockets[node] >= 0 && BIT_CHECK(Mtm->reconnectMask, node)) {
548559
elog(WARNING, "Arbiter is forced to reconnect to node %d", node+1);
549560
close(sockets[node]);
@@ -687,6 +698,7 @@ static void MtmAppendBuffer(MtmBuffer* txBuffer, MtmArbiterMessage* msg)
687698
buf->data[buf->used++] = *msg;
688699
}
689700

701+
690702
static void MtmSender(Datum arg)
691703
{
692704
sigset_t sset;
@@ -709,7 +721,7 @@ static void MtmSender(Datum arg)
709721
/* Connect to a database */
710722
BackgroundWorkerInitializeConnection(MtmDatabaseName, NULL);
711723

712-
724+
/* Start heartbeat times */
713725
heartbeat_timer = RegisterTimeout(USER_TIMEOUT, MtmScheduleHeartbeat);
714726
enable_timeout_after(heartbeat_timer, MtmHeartbeatSendTimeout);
715727

@@ -942,11 +954,14 @@ static void MtmReceiver(Datum arg)
942954
} else {
943955
ts = tm->state;
944956
BIT_SET(ts->votedMask, node-1);
945-
if (ts->status == TRANSACTION_STATUS_UNKNOWN) {
957+
if (ts->status == TRANSACTION_STATUS_UNKNOWN || ts->status == TRANSACTION_STATUS_IN_PROGRESS) {
946958
if (msg->status == TRANSACTION_STATUS_IN_PROGRESS || msg->status == TRANSACTION_STATUS_ABORTED) {
947959
elog(LOG, "Abort prepared transaction %s because it is in state %s at node %d",
948960
msg->gid, MtmTxnStatusMnem[msg->status], node);
961+
962+
replorigin_session_origin = DoNotReplicateId;
949963
MtmFinishPreparedTransaction(ts, false);
964+
replorigin_session_origin = InvalidRepOriginId;
950965
}
951966
else if (msg->status == TRANSACTION_STATUS_COMMITTED || msg->status == TRANSACTION_STATUS_UNKNOWN)
952967
{
@@ -956,7 +971,10 @@ static void MtmReceiver(Datum arg)
956971
}
957972
if ((ts->participantsMask & ~Mtm->disabledNodeMask & ~ts->votedMask) == 0) {
958973
elog(LOG, "Commit transaction %s because it is prepared at all live nodes", msg->gid);
974+
975+
replorigin_session_origin = DoNotReplicateId;
959976
MtmFinishPreparedTransaction(ts, true);
977+
replorigin_session_origin = InvalidRepOriginId;
960978
} else {
961979
MTM_LOG1("Receive response for transaction %s -> %s, participants=%llx, voted=%llx",
962980
msg->gid, MtmTxnStatusMnem[msg->status], (long long)ts->participantsMask, (long long)ts->votedMask);
@@ -1082,7 +1100,7 @@ static void MtmReceiver(Datum arg)
10821100
} else {
10831101
switch (msg->code) {
10841102
case MSG_PRECOMMIT:
1085-
Assert(false); // Now send through pglogical
1103+
Assert(false); // Now sent through pglogical
10861104
if (ts->status == TRANSACTION_STATUS_IN_PROGRESS) {
10871105
ts->status = TRANSACTION_STATUS_UNKNOWN;
10881106
ts->csn = MtmAssignCSN();

contrib/mmts/multimaster.c

Lines changed: 60 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1601,11 +1601,7 @@ static void MtmPollStatusOfPreparedTransactions(int disabledNodeId)
16011601
Assert(ts->gid[0]);
16021602
if (ts->status == TRANSACTION_STATUS_IN_PROGRESS) {
16031603
elog(LOG, "Abort transaction %s because its coordinator is disabled and it is not prepared at node %d", ts->gid, MtmNodeId);
1604-
ts->isPinned = true;
1605-
MtmUnlock();
16061604
MtmFinishPreparedTransaction(ts, false);
1607-
MtmLock(LW_EXCLUSIVE);
1608-
ts->isPinned = false;
16091605
} else {
16101606
MTM_LOG1("Poll state of transaction %d (%s)", ts->xid, ts->gid);
16111607
MtmBroadcastPollMessage(ts);
@@ -2017,7 +2013,7 @@ void MtmOnNodeConnect(int nodeId)
20172013
MtmLock(LW_EXCLUSIVE);
20182014
elog(LOG, "Connect node %d connectivity mask %llx", nodeId, (long long) Mtm->connectivityMask);
20192015
BIT_CLEAR(Mtm->connectivityMask, nodeId-1);
2020-
BIT_CLEAR(Mtm->reconnectMask, nodeId-1);
2016+
BIT_SET(Mtm->reconnectMask, nodeId-1); /* force sender to reestablish connection and send heartbeat */
20212017
MtmUnlock();
20222018
}
20232019

@@ -2945,26 +2941,36 @@ void MtmRollbackPreparedTransaction(int nodeId, char const* gid)
29452941
}
29462942
}
29472943

2948-
2944+
/*
2945+
* This function is call with MTM mutex locked.
2946+
* It shoudl unlock mutex before calling FinishPreparedTransaction to avoid deadlocks.
2947+
* ts object is pinned to prevent deallocation while lock is released.
2948+
*/
29492949
void MtmFinishPreparedTransaction(MtmTransState* ts, bool commit)
29502950
{
29512951
bool insideTransaction = IsTransactionState();
2952+
29522953
Assert(ts->votingCompleted);
2954+
2955+
ts->isPinned = true;
2956+
MtmUnlock();
2957+
29532958
MtmResetTransaction();
29542959

29552960
if (!insideTransaction) {
29562961
StartTransactionCommand();
29572962
}
2958-
//MtmBeginSession(MtmNodeId);
29592963
MtmSetCurrentTransactionCSN(ts->csn);
29602964
MtmSetCurrentTransactionGID(ts->gid);
29612965
FinishPreparedTransaction(ts->gid, commit);
29622966

29632967
if (!insideTransaction) {
29642968
CommitTransactionCommand();
2965-
//MtmEndSession(MtmNodeId, true);
29662969
Assert(ts->status == commit ? TRANSACTION_STATUS_COMMITTED : TRANSACTION_STATUS_ABORTED);
29672970
}
2971+
2972+
MtmLock(LW_EXCLUSIVE);
2973+
ts->isPinned = false;
29682974
}
29692975

29702976
/*
@@ -2980,6 +2986,7 @@ MtmReplicationMode MtmGetReplicationMode(int nodeId, sig_atomic_t volatile* shut
29802986

29812987
if (!Mtm->preparedTransactionsLoaded)
29822988
{
2989+
/* We must restore state of prepared (but no committed or aborted) transaction before start of recovery. */
29832990
MtmLoadPreparedTransactions();
29842991
Mtm->preparedTransactionsLoaded = true;
29852992
}
@@ -2991,6 +2998,7 @@ MtmReplicationMode MtmGetReplicationMode(int nodeId, sig_atomic_t volatile* shut
29912998
MtmUnlock();
29922999
return REPLMODE_EXIT;
29933000
}
3001+
/* We are not interested in receiving any deteriorated logical messages from recovered node, do recreate slot */
29943002
if (BIT_CHECK(Mtm->disabledNodeMask, nodeId-1)) {
29953003
mode = REPLMODE_CREATE_NEW;
29963004
}
@@ -3147,6 +3155,8 @@ MtmReplicationStartupHook(struct PGLogicalStartupHookArgs* args)
31473155
MtmEnableNode(MtmReplicationNodeId);
31483156
MtmCheckQuorum();
31493157
} else {
3158+
/* Force arbiter to reestablish connection with this nodem send heartbeat to inform this node that it was disabled and should perform recovery */
3159+
BIT_SET(Mtm->reconnectMask, MtmReplicationNodeId-1);
31503160
MtmUnlock();
31513161
elog(ERROR, "Disabled node %d tries to reconnect without recovery", MtmReplicationNodeId);
31523162
}
@@ -3156,9 +3166,10 @@ MtmReplicationStartupHook(struct PGLogicalStartupHookArgs* args)
31563166
elog(LOG, "Start %d senders and %d receivers from %d cluster status %s", Mtm->nSenders+1, Mtm->nReceivers, Mtm->nLiveNodes-1, MtmNodeStatusMnem[Mtm->status]);
31573167
MtmSenderStarted = 1;
31583168
if (++Mtm->nSenders == Mtm->nLiveNodes-1 && Mtm->nReceivers == Mtm->nLiveNodes-1 && Mtm->status == MTM_CONNECTED) {
3169+
/* All logical replication connections from and to this node are established, so we can switch cluster to online mode */
31593170
MtmSwitchClusterMode(MTM_ONLINE);
31603171
}
3161-
BIT_SET(Mtm->reconnectMask, MtmReplicationNodeId-1); /* arbiter should try to reestblish connection with this node */
3172+
BIT_SET(Mtm->reconnectMask, MtmReplicationNodeId-1); /* arbiter should try to reestablish connection with this node */
31623173
MtmUnlock();
31633174
on_shmem_exit(MtmOnProcExit, 0);
31643175
}
@@ -3168,6 +3179,16 @@ XLogRecPtr MtmGetFlushPosition(int nodeId)
31683179
return Mtm->nodes[nodeId-1].flushPos;
31693180
}
31703181

3182+
/**
3183+
* Keep track of progress of WAL writer.
3184+
* We need to notify WAL senders at other nodes which logical records
3185+
* are flushed to the disk and so can survive failure. In asynchronous commit mode
3186+
* WAL is flushed by WAL writer. Current flish position can be obtained by GetFlushRecPtr().
3187+
* So on applying new logical record we insert it in the MtmLsnMapping and compare
3188+
* their poistions in local WAL log with current flush position.
3189+
* The records which are flushed to the disk by WAL writer are removed from the list
3190+
* and mapping ing mtm->nodes[].flushPos is updated for this node.
3191+
*/
31713192
void MtmUpdateLsnMapping(int node_id, XLogRecPtr end_lsn)
31723193
{
31733194
dlist_mutable_iter iter;
@@ -3216,9 +3237,21 @@ MtmReplicationShutdownHook(struct PGLogicalShutdownHookArgs* args)
32163237
}
32173238
}
32183239

3240+
/*
3241+
* Filter transactions which should be replicated to other nodes.
3242+
* This filter is applied at sender side (WAL sender).
3243+
* Final filtering is also done at destination side by MtmFilterTransaction function.
3244+
*/
32193245
static bool
32203246
MtmReplicationTxnFilterHook(struct PGLogicalTxnFilterArgs* args)
32213247
{
3248+
/* Do not replicate any transactions in recovery mode (because we should apply
3249+
* changes sent to us rather than send our own pending changes)
3250+
* and transactions received from other nodes
3251+
* (originId should be non-zero in this case)
3252+
* unless we are performing recovery of disabled node
3253+
* (in this case all transactions should be sent)
3254+
*/
32223255
bool res = Mtm->status != MTM_RECOVERY
32233256
&& (args->origin_id == InvalidRepOriginId
32243257
|| MtmIsRecoveredNode(MtmReplicationNodeId));
@@ -3228,6 +3261,9 @@ MtmReplicationTxnFilterHook(struct PGLogicalTxnFilterArgs* args)
32283261
return res;
32293262
}
32303263

3264+
/**
3265+
* Filter record corresponding to local (non-distributed) tables
3266+
*/
32313267
static bool
32323268
MtmReplicationRowFilterHook(struct PGLogicalRowFilterArgs* args)
32333269
{
@@ -3248,7 +3284,11 @@ MtmReplicationRowFilterHook(struct PGLogicalRowFilterArgs* args)
32483284

32493285
/*
32503286
* Filter received transactions at destination side.
3251-
* This function is executed by receiver, so there are no race conditions and it is possible to update nodes[i].restartLSN without lock
3287+
* This function is executed by receiver,
3288+
* so there are no race conditions and it is possible to update nodes[i].restartLSN without lock.
3289+
* It is more efficient to filter records at senders size (done by MtmReplicationTxnFilterHook) to avoid sending useless data through network. But asynchronous nature of
3290+
* logical replications makes it not possible to guarantee (at least I failed to do it)
3291+
* that replica do not receive deteriorated data.
32523292
*/
32533293
bool MtmFilterTransaction(char* record, int size)
32543294
{
@@ -3308,8 +3348,8 @@ bool MtmFilterTransaction(char* record, int size)
33083348
}
33093349

33103350
if (duplicate) {
3311-
MTM_LOG1("Ignore transaction %s from node %d flags=%x, our restartLSN for node: %lx,restart_lsn = (origin node %d == MtmReplicationNodeId %d) ? end_lsn=%lx, origin_lsn=%lx",
3312-
gid, replication_node, flags, Mtm->nodes[origin_node-1].restartLSN, origin_node, MtmReplicationNodeId, end_lsn, origin_lsn);
3351+
MTM_LOG1("Ignore transaction %s from node %d flags=%x because our LSN position %lx for origin node %d is greater or equal than LSN %lx of this transaction (end_lsn=%lx, origin_lsn=%lx)",
3352+
gid, replication_node, flags, Mtm->nodes[origin_node-1].restartLSN, origin_node, restart_lsn, end_lsn, origin_lsn);
33133353
} else {
33143354
MTM_LOG2("Apply transaction %s from node %d lsn %lx, flags=%x, origin node %d, original lsn=%lx, current lsn=%lx",
33153355
gid, replication_node, end_lsn, flags, origin_node, origin_lsn, restart_lsn);
@@ -3326,7 +3366,11 @@ void MtmSetupReplicationHooks(struct PGLogicalHooks* hooks)
33263366
hooks->row_filter_hook = MtmReplicationRowFilterHook;
33273367
}
33283368

3329-
3369+
/*
3370+
* Setup replication session origin to include origin location in WAL and
3371+
* update slot position.
3372+
* Sessions are not reetrant so we have to use exclusive lock here.
3373+
*/
33303374
void MtmBeginSession(int nodeId)
33313375
{
33323376
MtmLockNode(nodeId, LW_EXCLUSIVE);
@@ -3338,6 +3382,9 @@ void MtmBeginSession(int nodeId)
33383382
MTM_LOG3("%d: End setup replorigin session: %d", MyProcPid, replorigin_session_origin);
33393383
}
33403384

3385+
/*
3386+
* Release replication session
3387+
*/
33413388
void MtmEndSession(int nodeId, bool unlock)
33423389
{
33433390
if (replorigin_session_origin != InvalidRepOriginId) {

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy