Skip to content

Commit f639b4c

Browse files
committed
referee version 2
1 parent 426b00d commit f639b4c

File tree

8 files changed

+218
-27
lines changed

8 files changed

+218
-27
lines changed

multimaster--1.0.sql

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,3 +151,27 @@ $$
151151
LANGUAGE plpgsql;
152152

153153
-- select mtm.alter_sequences();
154+
155+
-- referee stuff
156+
CREATE TABLE IF NOT EXISTS mtm.referee_decision(key text primary key not null, node_id int);
157+
158+
CREATE OR REPLACE FUNCTION mtm.referee_get_winner(applicant_id int) RETURNS int AS
159+
$$
160+
DECLARE
161+
winner_id int;
162+
BEGIN
163+
insert into mtm.referee_decision values ('winner', applicant_id);
164+
select node_id into winner_id from mtm.referee_decision where key = 'winner';
165+
return winner_id;
166+
EXCEPTION WHEN others THEN
167+
select node_id into winner_id from mtm.referee_decision where key = 'winner';
168+
return winner_id;
169+
END
170+
$$
171+
LANGUAGE plpgsql;
172+
173+
CREATE OR REPLACE FUNCTION mtm.referee_clean() RETURNS void AS
174+
$$
175+
delete from mtm.referee_decision where key = 'winner';
176+
$$
177+
LANGUAGE sql;

multimaster.c

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,7 @@ bool MtmUseRDMA;
254254
bool MtmPreserveCommitOrder;
255255
bool MtmVolksWagenMode; /* Pretend to be normal postgres. This means skip some NOTICE's and use local sequences */
256256
bool MtmMajorNode;
257+
char* MtmRefereeConnStr;
257258

258259
static char* MtmConnStrs;
259260
static char* MtmRemoteFunctionsList;
@@ -2372,6 +2373,7 @@ static void MtmInitialize()
23722373
Mtm->nAllNodes = MtmNodes;
23732374
Mtm->disabledNodeMask = (((nodemask_t)1 << MtmNodes) - 1);
23742375
Mtm->clique = 0;
2376+
Mtm->refereeGrant = false;
23752377
Mtm->stalledNodeMask = 0;
23762378
Mtm->stoppedNodeMask = 0;
23772379
Mtm->deadNodeMask = 0;
@@ -2618,8 +2620,7 @@ static void MtmSplitConnStrs(void)
26182620
}
26192621
pfree(copy);
26202622
}
2621-
if (!MtmReferee)
2622-
{
2623+
26232624
if (MtmNodeId == INT_MAX) {
26242625
if (gethostname(buf, sizeof buf) != 0) {
26252626
MTM_ELOG(ERROR, "Failed to get host name: %m");
@@ -2679,7 +2680,6 @@ static void MtmSplitConnStrs(void)
26792680
len = end - dbName;
26802681
MtmDatabaseName = pnstrdup(dbName, len);
26812682
}
2682-
}
26832683
MemoryContextSwitchTo(old_context);
26842684
}
26852685

@@ -2976,6 +2976,19 @@ _PG_init(void)
29762976
NULL
29772977
);
29782978

2979+
DefineCustomStringVariable(
2980+
"multimaster.referee_connstring",
2981+
"Referee connection string",
2982+
NULL,
2983+
&MtmRefereeConnStr,
2984+
"",
2985+
PGC_POSTMASTER,
2986+
0,
2987+
NULL,
2988+
NULL,
2989+
NULL
2990+
);
2991+
29792992
DefineCustomBoolVariable(
29802993
"multimaster.use_rdma",
29812994
"Use RDMA sockets",
@@ -3177,8 +3190,6 @@ _PG_init(void)
31773190

31783191
if (MtmReferee)
31793192
{
3180-
MtmSplitConnStrs();
3181-
MtmRefereeInitialize();
31823193
return;
31833194
}
31843195

multimaster.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,7 @@ typedef struct
287287
TransactionId oldestXid; /* XID of oldest transaction visible by any active transaction (local or global) */
288288
nodemask_t disabledNodeMask; /* Bitmask of disabled nodes */
289289
nodemask_t clique; /* Bitmask of nodes that are connected and we allowed to connect/send wal/receive wal with them */
290+
bool refereeGrant; /* Referee allowed us to work with half of the nodes */
290291
nodemask_t deadNodeMask; /* Bitmask of nodes considered as dead by referee */
291292
nodemask_t recoveredNodeMask; /* Bitmask of nodes recoverd after been reported as dead by referee */
292293
nodemask_t stalledNodeMask; /* Bitmask of stalled nodes (node with dropped replication slot which makes it not possible automatic recovery of such node) */
@@ -379,6 +380,7 @@ extern timestamp_t MtmRefreshClusterStatusSchedule;
379380
extern MtmConnectionInfo* MtmConnections;
380381
extern bool MtmMajorNode;
381382
extern bool MtmBackgroundWorker;
383+
extern char* MtmRefereeConnStr;
382384

383385

384386
extern void MtmArbiterInitialize(void);

state.c

Lines changed: 108 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ char const* const MtmEventMnem[] =
2525
"MTM_NONRECOVERABLE_ERROR"
2626
};
2727

28+
static int MtmGetRefereeWinner(void);
2829

2930
// XXXX: allocate in context and clean it
3031
static char *
@@ -91,12 +92,14 @@ MtmCheckState(void)
9192
maskToString(Mtm->pglogicalReceiverMask, Mtm->nAllNodes),
9293
maskToString(Mtm->pglogicalSenderMask, Mtm->nAllNodes),
9394
Mtm->nAllNodes,
94-
MtmMajorNode);
95+
(MtmMajorNode || Mtm->refereeGrant) );
9596

9697
isEnabledState =
97-
( (nConnected >= Mtm->nAllNodes/2+1) /* majority */
98-
|| (nConnected == Mtm->nAllNodes/2 && MtmMajorNode) ) /* or half + major node */
99-
&& BIT_CHECK(Mtm->clique, MtmNodeId-1); /* in clique */
98+
( (nConnected >= Mtm->nAllNodes/2+1) /* majority */
99+
// XXXX: should we restrict major with two nodes setup?
100+
|| (nConnected == Mtm->nAllNodes/2 && MtmMajorNode) /* or half + major node */
101+
|| (nConnected == Mtm->nAllNodes/2 && Mtm->refereeGrant) ) /* or half + referee */
102+
&& BIT_CHECK(Mtm->clique, MtmNodeId-1); /* in clique */
100103

101104
/* ANY -> MTM_DISABLED */
102105
if (!isEnabledState)
@@ -135,6 +138,12 @@ MtmCheckState(void)
135138
case MTM_RECOVERED:
136139
if (nReceivers == nEnabled-1 && nSenders == nEnabled-1 && nEnabled == nConnected)
137140
{
141+
/*
142+
* It should be already cleaned by RECOVERY_CAUGHTUP, but
143+
* in major mode or with referee we can be working alone
144+
* so nobody will clean it.
145+
*/
146+
BIT_CLEAR(Mtm->originLockNodeMask, MtmNodeId-1);
138147
MtmSetClusterStatus(MTM_ONLINE);
139148
return;
140149
}
@@ -376,9 +385,35 @@ MtmRefreshClusterStatus()
376385
* Periodical check that we are still in RECOVERED state.
377386
* See comment to MTM_RECOVERED -> MTM_ONLINE transition in MtmCheckState()
378387
*/
379-
if (Mtm->status == MTM_RECOVERED)
380-
MtmCheckState();
388+
MtmCheckState();
389+
390+
/*
391+
* Check for referee decidion when pnly half of nodes are visible.
392+
*/
393+
if (MtmRefereeConnStr && *MtmRefereeConnStr && !Mtm->refereeGrant &&
394+
// XXXX visibility & ~clique?
395+
countZeroBits(SELF_CONNECTIVITY_MASK, Mtm->nAllNodes) == Mtm->nAllNodes/2)
396+
{
397+
int winner_node_id = MtmGetRefereeWinner();
398+
if (winner_node_id != -1 &&
399+
// XXXX visibility & ~clique?
400+
!BIT_CHECK(SELF_CONNECTIVITY_MASK, winner_node_id - 1))
401+
{
402+
MTM_LOG1("[STATE] Referee allowed to proceed with half of the nodes (winner_id = %d)",
403+
winner_node_id);
404+
Mtm->refereeGrant = true;
405+
406+
MtmLock(LW_EXCLUSIVE);
407+
MtmEnableNode(MtmNodeId);
408+
MtmCheckState();
409+
MtmUnlock();
410+
}
411+
}
381412

413+
414+
/*
415+
* Check for clique.
416+
*/
382417
MtmBuildConnectivityMatrix(matrix);
383418
newClique = MtmFindMaxClique(matrix, Mtm->nAllNodes, &cliqueSize);
384419

@@ -436,3 +471,70 @@ MtmRefreshClusterStatus()
436471
MtmCheckState();
437472
MtmUnlock();
438473
}
474+
475+
static int
476+
MtmGetRefereeWinner(void)
477+
{
478+
int socket_fd;
479+
PGconn* conn;
480+
PGresult *res;
481+
struct timeval timeout = { 5, 0 };
482+
char sql[128];
483+
int winner_node_id;
484+
485+
conn = PQconnectdb_safe(MtmRefereeConnStr);
486+
if (PQstatus(conn) != CONNECTION_OK)
487+
{
488+
MTM_ELOG(WARNING, "Could not connect to referee (%s): %s",
489+
MtmRefereeConnStr, PQerrorMessage(conn));
490+
PQfinish(conn);
491+
return -1;
492+
}
493+
494+
socket_fd = PQsocket(conn);
495+
if (socket_fd < 0)
496+
{
497+
MTM_ELOG(WARNING, "Referee socket is invalid");
498+
PQfinish(conn);
499+
return -1;
500+
}
501+
502+
if (setsockopt(socket_fd, SOL_SOCKET, SO_RCVTIMEO,
503+
(char *)&timeout, sizeof(timeout)) < 0)
504+
{
505+
MTM_ELOG(WARNING, "Could not set referee socket timeout: %s",
506+
strerror(errno));
507+
PQfinish(conn);
508+
return -1;
509+
}
510+
511+
sprintf(sql, "select mtm.referee_get_winner(%d)", MtmNodeId);
512+
res = PQexec(conn, sql);
513+
if (PQresultStatus(res) != PGRES_TUPLES_OK ||
514+
PQntuples(res) != 1 ||
515+
PQnfields(res) != 1)
516+
{
517+
MTM_ELOG(WARNING, "Refusing unexpected result (r=%d, n=%d, w=%d, k=%s) from referee.",
518+
PQresultStatus(res), PQntuples(res), PQnfields(res), PQgetvalue(res, 0, 0));
519+
PQclear(res);
520+
PQfinish(conn);
521+
return -1;
522+
}
523+
524+
winner_node_id = atoi(PQgetvalue(res, 0, 0));
525+
526+
if (winner_node_id < 1 || winner_node_id > Mtm->nAllNodes)
527+
{
528+
MTM_ELOG(WARNING,
529+
"Referee responded with node_id=%d, it's out of our node range",
530+
winner_node_id);
531+
PQclear(res);
532+
PQfinish(conn);
533+
return -1;
534+
}
535+
536+
MTM_LOG1("Got referee response, winner node_id=%d.", winner_node_id);
537+
/* Ok, we finally got it! */
538+
return winner_node_id;
539+
}
540+

tests2/docker-entrypoint.sh

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,21 +58,39 @@ if [ "$1" = 'postgres' ]; then
5858
max_wal_senders = 10
5959
shared_preload_libraries = 'multimaster'
6060
default_transaction_isolation = 'repeatable read'
61-
log_line_prefix = '%m: '
61+
log_line_prefix = '%m: '
62+
# log_statement = all
6263
6364
multimaster.workers = 4
6465
multimaster.max_workers = 16
6566
multimaster.max_nodes = 3
6667
multimaster.volkswagen_mode = 1
6768
multimaster.queue_size=52857600
6869
multimaster.ignore_tables_without_pk = 1
69-
multimaster.node_id = $NODE_ID
70-
multimaster.conn_strings = '$CONNSTRS'
71-
multimaster.major_node = $MAJOR
7270
multimaster.heartbeat_recv_timeout = 1100
7371
multimaster.heartbeat_send_timeout = 250
7472
EOF
7573

74+
if [ -n "$NODE_ID" ]; then
75+
echo "multimaster.node_id = $NODE_ID" >> $PGDATA/postgresql.conf
76+
fi
77+
78+
if [ -n "$CONNSTRS" ]; then
79+
echo "multimaster.conn_strings = '$CONNSTRS'" >> $PGDATA/postgresql.conf
80+
fi
81+
82+
if [ -n "$MAJOR" ]; then
83+
echo 'multimaster.major_node = on' >> $PGDATA/postgresql.conf
84+
fi
85+
86+
if [ -n "$REFEREE" ]; then
87+
echo 'multimaster.referee = on' >> $PGDATA/postgresql.conf
88+
fi
89+
90+
if [ -n "$REFEREE_CONNSTR" ]; then
91+
echo "multimaster.referee_connstring = '$REFEREE_CONNSTR'" >> $PGDATA/postgresql.conf
92+
fi
93+
7694
cat $PGDATA/postgresql.conf
7795

7896
pg_ctl -D "$PGDATA" -m fast -w stop

tests2/lib/test_helper.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import unittest
22
import time
33
import datetime
4+
import psycopg2
45

56
TEST_WARMING_TIME = 5
67
TEST_DURATION = 10
@@ -50,6 +51,7 @@ def performFailure(self, failure):
5051
aggs_failure = self.client.get_aggregates()
5152

5253

54+
# time.sleep(10000)
5355
failure.stop()
5456

5557
print('Eliminate failure at ',datetime.datetime.utcnow())
@@ -59,3 +61,12 @@ def performFailure(self, failure):
5961
aggs = self.client.get_aggregates()
6062

6163
return (aggs_failure, aggs)
64+
65+
def nodeExecute(dsn, statements):
66+
con = psycopg2.connect(dsn)
67+
con.autocommit = True
68+
cur = con.cursor()
69+
for statement in statements:
70+
cur.execute(statement)
71+
cur.close()
72+
con.close()

tests2/support/two_nodes.yml

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,10 @@ services:
1212
POSTGRES_USER: 'pg'
1313
POSTGRES_DB: 'regression'
1414
NODE_ID: 1
15-
MAJOR: 'off'
1615
CONNSTRS: >-
1716
dbname=regression user=pg host=node1,
1817
dbname=regression user=pg host=node2
18+
REFEREE_CONNSTR: 'dbname=regression user=pg host=referee'
1919
ports:
2020
- "15432:5432"
2121

@@ -29,9 +29,23 @@ services:
2929
POSTGRES_USER: 'pg'
3030
POSTGRES_DB: 'regression'
3131
NODE_ID: 2
32-
MAJOR: 'off'
3332
CONNSTRS: >-
3433
dbname=regression user=pg host=node1,
3534
dbname=regression user=pg host=node2
35+
REFEREE_CONNSTR: 'dbname=regression user=pg host=referee'
3636
ports:
3737
- "15433:5432"
38+
39+
referee:
40+
container_name: referee
41+
build: ../..
42+
privileged: true
43+
ulimits:
44+
core: 14294967296
45+
environment:
46+
POSTGRES_USER: 'pg'
47+
POSTGRES_DB: 'regression'
48+
NODE_ID: 1
49+
REFEREE: 'on'
50+
ports:
51+
- "15435:5432"

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy