Skip to content

Commit 737e9a1

Browse files
committed
Correctly handle connect timeouts
1 parent 2135458 commit 737e9a1

File tree

3 files changed

+29
-38
lines changed

3 files changed

+29
-38
lines changed

contrib/mmts/arbiter.c

Lines changed: 15 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -366,14 +366,16 @@ static void MtmCheckHeartbeat()
366366
}
367367

368368

369-
static int MtmConnectSocket(char const* host, int port, int max_attempts)
369+
static int MtmConnectSocket(char const* host, int port, int timeout)
370370
{
371371
struct sockaddr_in sock_inet;
372372
unsigned addrs[MAX_ROUTES];
373373
unsigned i, n_addrs = sizeof(addrs) / sizeof(addrs[0]);
374374
MtmHandshakeMessage req;
375375
MtmArbiterMessage resp;
376376
int sd;
377+
timestamp_t start = MtmGetSystemTime();
378+
377379

378380
sock_inet.sin_family = AF_INET;
379381
sock_inet.sin_port = htons(port);
@@ -390,7 +392,10 @@ static int MtmConnectSocket(char const* host, int port, int max_attempts)
390392
if (sd < 0) {
391393
elog(ERROR, "Arbiter failed to create socket: %d", errno);
392394
}
393-
fcntl(sd, F_SETFL, O_NONBLOCK);
395+
rc = fcntl(sd, F_SETFL, O_NONBLOCK);
396+
if (rc < 0) {
397+
elog(ERROR, "Arbiter failed to switch socket to non-blocking mode: %d", errno);
398+
}
394399
busy_socket = sd;
395400
for (i = 0; i < n_addrs; ++i) {
396401
memcpy(&sock_inet.sin_addr, &addrs[i], sizeof sock_inet.sin_addr);
@@ -405,17 +410,19 @@ static int MtmConnectSocket(char const* host, int port, int max_attempts)
405410
if (rc == 0) {
406411
break;
407412
}
408-
if (errno != EINPROGRESS || max_attempts == 0) {
413+
if (errno != EINPROGRESS || start + MSEC_TO_USEC(timeout) < MtmGetSystemTime()) {
409414
elog(WARNING, "Arbiter failed to connect to %s:%d: error=%d", host, port, errno);
410415
busy_socket = -1;
416+
close(sd);
411417
return -1;
412418
} else {
413-
rc = MtmWaitSocket(sd, true, MtmConnectTimeout);
419+
rc = MtmWaitSocket(sd, true, MtmHeartbeatSendTimeout);
414420
if (rc == 1) {
415421
socklen_t optlen = sizeof(int);
416422
if (getsockopt(sd, SOL_SOCKET, SO_ERROR, (void*)&rc, &optlen) < 0) {
417423
elog(WARNING, "Arbiter failed to getsockopt for %s:%d: error=%d", host, port, errno);
418424
busy_socket = -1;
425+
close(sd);
419426
return -1;
420427
}
421428
if (rc == 0) {
@@ -426,8 +433,8 @@ static int MtmConnectSocket(char const* host, int port, int max_attempts)
426433
} else {
427434
elog(WARNING, "Arbiter waiting socket to %s:%d: rc=%d, error=%d", host, port, rc, errno);
428435
}
429-
max_attempts -= 1;
430-
MtmSleep(MSEC_TO_USEC(MtmConnectTimeout));
436+
close(sd);
437+
MtmSleep(MSEC_TO_USEC(MtmHeartbeatSendTimeout));
431438
}
432439
}
433440
MtmSetSocketOptions(sd);
@@ -479,7 +486,7 @@ static void MtmOpenConnections()
479486
}
480487
for (i = 0; i < nNodes; i++) {
481488
if (i+1 != MtmNodeId && i < Mtm->nAllNodes) {
482-
sockets[i] = MtmConnectSocket(Mtm->nodes[i].con.hostName, MtmArbiterPort + i + 1, MtmConnectAttempts);
489+
sockets[i] = MtmConnectSocket(Mtm->nodes[i].con.hostName, MtmArbiterPort + i + 1, MtmConnectTimeout);
483490
if (sockets[i] < 0) {
484491
MtmOnNodeDisconnect(i+1);
485492
}
@@ -511,7 +518,7 @@ static bool MtmSendToNode(int node, void const* buf, int size)
511518
close(sockets[node]);
512519
sockets[node] = -1;
513520
}
514-
sockets[node] = MtmConnectSocket(Mtm->nodes[node].con.hostName, MtmArbiterPort + node + 1, MtmReconnectAttempts);
521+
sockets[node] = MtmConnectSocket(Mtm->nodes[node].con.hostName, MtmArbiterPort + node + 1, MtmReconnectTimeout);
515522
if (sockets[node] < 0) {
516523
MtmOnNodeDisconnect(node+1);
517524
return false;

contrib/mmts/multimaster.c

Lines changed: 13 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -194,10 +194,9 @@ int MtmNodes;
194194
int MtmNodeId;
195195
int MtmReplicationNodeId;
196196
int MtmArbiterPort;
197-
int MtmConnectAttempts;
198197
int MtmConnectTimeout;
198+
int MtmReconnectTimeout;
199199
int MtmRaftPollDelay;
200-
int MtmReconnectAttempts;
201200
int MtmNodeDisableDelay;
202201
int MtmTransSpillThreshold;
203202
int MtmMaxNodes;
@@ -2031,9 +2030,9 @@ _PG_init(void)
20312030
DefineCustomIntVariable(
20322031
"multimaster.connect_timeout",
20332032
"Multimaster nodes connect timeout",
2034-
"Interval in milliseconds between connection attempts",
2033+
"Interval in milliseconds for establishing connection with cluster node",
20352034
&MtmConnectTimeout,
2036-
1000,
2035+
10000, /* 10 seconds */
20372036
1,
20382037
INT_MAX,
20392038
PGC_BACKEND,
@@ -2044,11 +2043,11 @@ _PG_init(void)
20442043
);
20452044

20462045
DefineCustomIntVariable(
2047-
"multimaster.raft_poll_delay",
2048-
"Multimaster delay of polling cluster state from Raftable after updating local node status",
2049-
"Timeout in milliseconds before polling state of nodes",
2050-
&MtmRaftPollDelay,
2051-
1000,
2046+
"multimaster.reconnect_timeout",
2047+
"Multimaster nodes reconnect timeout",
2048+
"Interval in milliseconds for establishing connection with cluster node",
2049+
&MtmReconnectTimeout,
2050+
5000, /* 5 seconds */
20522051
1,
20532052
INT_MAX,
20542053
PGC_BACKEND,
@@ -2059,11 +2058,11 @@ _PG_init(void)
20592058
);
20602059

20612060
DefineCustomIntVariable(
2062-
"multimaster.connect_attempts",
2063-
"Multimaster number of connect attemts",
2064-
"Maximal number of attempt to establish connection with other node after which multimaster is give up",
2065-
&MtmConnectAttempts,
2066-
10,
2061+
"multimaster.raft_poll_delay",
2062+
"Multimaster delay of polling cluster state from Raftable after updating local node status",
2063+
"Timeout in milliseconds before polling state of nodes",
2064+
&MtmRaftPollDelay,
2065+
1000,
20672066
1,
20682067
INT_MAX,
20692068
PGC_BACKEND,
@@ -2073,20 +2072,6 @@ _PG_init(void)
20732072
NULL
20742073
);
20752074

2076-
DefineCustomIntVariable(
2077-
"multimaster.reconnect_attempts",
2078-
"Multimaster number of reconnect attemts",
2079-
"Maximal number of attempt to reestablish connection with other node after which node is considered to be offline",
2080-
&MtmReconnectAttempts,
2081-
10,
2082-
1,
2083-
INT_MAX,
2084-
PGC_BACKEND,
2085-
0,
2086-
NULL,
2087-
NULL,
2088-
NULL
2089-
);
20902075

20912076
MtmSplitConnStrs();
20922077
MtmStartReceivers();

contrib/mmts/multimaster.h

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -215,9 +215,8 @@ extern int MtmReplicationNodeId;
215215
extern int MtmNodes;
216216
extern int MtmArbiterPort;
217217
extern char* MtmDatabaseName;
218-
extern int MtmConnectAttempts;
219218
extern int MtmConnectTimeout;
220-
extern int MtmReconnectAttempts;
219+
extern int MtmReconnectTimeout;
221220
extern int MtmRaftPollDelay;
222221
extern int MtmNodeDisableDelay;
223222
extern int MtmTransSpillThreshold;

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy