Skip to content

Commit 2332ab5

Browse files
knizhnik authored and kelvich committed
Correctly handle connect timeouts
1 parent 4ac3eec commit 2332ab5

File tree

3 files changed

+29
-38
lines changed

3 files changed

+29
-38
lines changed

arbiter.c

Lines changed: 15 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -366,14 +366,16 @@ static void MtmCheckHeartbeat()
366366
}
367367

368368

369-
static int MtmConnectSocket(char const* host, int port, int max_attempts)
369+
static int MtmConnectSocket(char const* host, int port, int timeout)
370370
{
371371
struct sockaddr_in sock_inet;
372372
unsigned addrs[MAX_ROUTES];
373373
unsigned i, n_addrs = sizeof(addrs) / sizeof(addrs[0]);
374374
MtmHandshakeMessage req;
375375
MtmArbiterMessage resp;
376376
int sd;
377+
timestamp_t start = MtmGetSystemTime();
378+
377379

378380
sock_inet.sin_family = AF_INET;
379381
sock_inet.sin_port = htons(port);
@@ -390,7 +392,10 @@ static int MtmConnectSocket(char const* host, int port, int max_attempts)
390392
if (sd < 0) {
391393
elog(ERROR, "Arbiter failed to create socket: %d", errno);
392394
}
393-
fcntl(sd, F_SETFL, O_NONBLOCK);
395+
rc = fcntl(sd, F_SETFL, O_NONBLOCK);
396+
if (rc < 0) {
397+
elog(ERROR, "Arbiter failed to switch socket to non-blocking mode: %d", errno);
398+
}
394399
busy_socket = sd;
395400
for (i = 0; i < n_addrs; ++i) {
396401
memcpy(&sock_inet.sin_addr, &addrs[i], sizeof sock_inet.sin_addr);
@@ -405,17 +410,19 @@ static int MtmConnectSocket(char const* host, int port, int max_attempts)
405410
if (rc == 0) {
406411
break;
407412
}
408-
if (errno != EINPROGRESS || max_attempts == 0) {
413+
if (errno != EINPROGRESS || start + MSEC_TO_USEC(timeout) < MtmGetSystemTime()) {
409414
elog(WARNING, "Arbiter failed to connect to %s:%d: error=%d", host, port, errno);
410415
busy_socket = -1;
416+
close(sd);
411417
return -1;
412418
} else {
413-
rc = MtmWaitSocket(sd, true, MtmConnectTimeout);
419+
rc = MtmWaitSocket(sd, true, MtmHeartbeatSendTimeout);
414420
if (rc == 1) {
415421
socklen_t optlen = sizeof(int);
416422
if (getsockopt(sd, SOL_SOCKET, SO_ERROR, (void*)&rc, &optlen) < 0) {
417423
elog(WARNING, "Arbiter failed to getsockopt for %s:%d: error=%d", host, port, errno);
418424
busy_socket = -1;
425+
close(sd);
419426
return -1;
420427
}
421428
if (rc == 0) {
@@ -426,8 +433,8 @@ static int MtmConnectSocket(char const* host, int port, int max_attempts)
426433
} else {
427434
elog(WARNING, "Arbiter waiting socket to %s:%d: rc=%d, error=%d", host, port, rc, errno);
428435
}
429-
max_attempts -= 1;
430-
MtmSleep(MSEC_TO_USEC(MtmConnectTimeout));
436+
close(sd);
437+
MtmSleep(MSEC_TO_USEC(MtmHeartbeatSendTimeout));
431438
}
432439
}
433440
MtmSetSocketOptions(sd);
@@ -479,7 +486,7 @@ static void MtmOpenConnections()
479486
}
480487
for (i = 0; i < nNodes; i++) {
481488
if (i+1 != MtmNodeId && i < Mtm->nAllNodes) {
482-
sockets[i] = MtmConnectSocket(Mtm->nodes[i].con.hostName, MtmArbiterPort + i + 1, MtmConnectAttempts);
489+
sockets[i] = MtmConnectSocket(Mtm->nodes[i].con.hostName, MtmArbiterPort + i + 1, MtmConnectTimeout);
483490
if (sockets[i] < 0) {
484491
MtmOnNodeDisconnect(i+1);
485492
}
@@ -511,7 +518,7 @@ static bool MtmSendToNode(int node, void const* buf, int size)
511518
close(sockets[node]);
512519
sockets[node] = -1;
513520
}
514-
sockets[node] = MtmConnectSocket(Mtm->nodes[node].con.hostName, MtmArbiterPort + node + 1, MtmReconnectAttempts);
521+
sockets[node] = MtmConnectSocket(Mtm->nodes[node].con.hostName, MtmArbiterPort + node + 1, MtmReconnectTimeout);
515522
if (sockets[node] < 0) {
516523
MtmOnNodeDisconnect(node+1);
517524
return false;

multimaster.c

Lines changed: 13 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -193,10 +193,9 @@ int MtmNodes;
193193
int MtmNodeId;
194194
int MtmReplicationNodeId;
195195
int MtmArbiterPort;
196-
int MtmConnectAttempts;
197196
int MtmConnectTimeout;
197+
int MtmReconnectTimeout;
198198
int MtmRaftPollDelay;
199-
int MtmReconnectAttempts;
200199
int MtmNodeDisableDelay;
201200
int MtmTransSpillThreshold;
202201
int MtmMaxNodes;
@@ -2030,9 +2029,9 @@ _PG_init(void)
20302029
DefineCustomIntVariable(
20312030
"multimaster.connect_timeout",
20322031
"Multimaster nodes connect timeout",
2033-
"Interval in milliseconds between connection attempts",
2032+
"Interval in milliseconds for establishing connection with cluster node",
20342033
&MtmConnectTimeout,
2035-
1000,
2034+
10000, /* 10 seconds */
20362035
1,
20372036
INT_MAX,
20382037
PGC_BACKEND,
@@ -2043,11 +2042,11 @@ _PG_init(void)
20432042
);
20442043

20452044
DefineCustomIntVariable(
2046-
"multimaster.raft_poll_delay",
2047-
"Multimaster delay of polling cluster state from Raftable after updating local node status",
2048-
"Timeout in milliseconds before polling state of nodes",
2049-
&MtmRaftPollDelay,
2050-
1000,
2045+
"multimaster.reconnect_timeout",
2046+
"Multimaster nodes reconnect timeout",
2047+
"Interval in milliseconds for establishing connection with cluster node",
2048+
&MtmReconnectTimeout,
2049+
5000, /* 5 seconds */
20512050
1,
20522051
INT_MAX,
20532052
PGC_BACKEND,
@@ -2058,11 +2057,11 @@ _PG_init(void)
20582057
);
20592058

20602059
DefineCustomIntVariable(
2061-
"multimaster.connect_attempts",
2062-
"Multimaster number of connect attemts",
2063-
"Maximal number of attempt to establish connection with other node after which multimaster is give up",
2064-
&MtmConnectAttempts,
2065-
10,
2060+
"multimaster.raft_poll_delay",
2061+
"Multimaster delay of polling cluster state from Raftable after updating local node status",
2062+
"Timeout in milliseconds before polling state of nodes",
2063+
&MtmRaftPollDelay,
2064+
1000,
20662065
1,
20672066
INT_MAX,
20682067
PGC_BACKEND,
@@ -2072,20 +2071,6 @@ _PG_init(void)
20722071
NULL
20732072
);
20742073

2075-
DefineCustomIntVariable(
2076-
"multimaster.reconnect_attempts",
2077-
"Multimaster number of reconnect attemts",
2078-
"Maximal number of attempt to reestablish connection with other node after which node is considered to be offline",
2079-
&MtmReconnectAttempts,
2080-
10,
2081-
1,
2082-
INT_MAX,
2083-
PGC_BACKEND,
2084-
0,
2085-
NULL,
2086-
NULL,
2087-
NULL
2088-
);
20892074

20902075
MtmSplitConnStrs();
20912076
MtmStartReceivers();

multimaster.h

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -215,9 +215,8 @@ extern int MtmReplicationNodeId;
215215
extern int MtmNodes;
216216
extern int MtmArbiterPort;
217217
extern char* MtmDatabaseName;
218-
extern int MtmConnectAttempts;
219218
extern int MtmConnectTimeout;
220-
extern int MtmReconnectAttempts;
219+
extern int MtmReconnectTimeout;
221220
extern int MtmRaftPollDelay;
222221
extern int MtmNodeDisableDelay;
223222
extern int MtmTransSpillThreshold;

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy