Skip to content

Commit a72d35d

Browse files
committed
Reimplement CFS GC lock
1 parent b88747c commit a72d35d

File tree

2 files changed

+102
-112
lines changed

2 files changed

+102
-112
lines changed

src/backend/storage/file/cfs.c

Lines changed: 101 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -540,86 +540,72 @@ static bool cfs_write_file(int fd, void const* data, uint32 size)
540540
void cfs_lock_file(FileMap* map, char const* file_path)
541541
{
542542
long delay = CFS_LOCK_MIN_TIMEOUT;
543-
int n_attempts = 0;
544543

545544
while (true)
546545
{
547-
uint64 count = pg_atomic_fetch_add_u32(&map->lock, 1);
548-
bool revokeLock = false;
546+
uint32 count = pg_atomic_fetch_add_u32(&map->lock, 1);
549547

550548
if (count < CFS_GC_LOCK)
551-
break;
552-
553-
if (InRecovery)
554-
{
555-
revokeLock = true;
556-
}
557-
else
558549
{
559-
if (pg_atomic_unlocked_test_flag(&cfs_state->gc_started))
560-
{
561-
if (++n_attempts > MAX_LOCK_ATTEMPTS)
562-
{
563-
/* So there is GC lock, but no active GC process during MAX_LOCK_ATTEMPTS.
564-
* Most likely it means that GC is crashed (may be together with other postgres processes or even OS)
565-
* without releasing lock. And for some reasons recovery was not performed and this page left locked.
566-
* We should revoke the the lock to allow access to this segment.
567-
*/
568-
revokeLock = true;
569-
elog(WARNING, "CFS revokes lock on file %s\n", file_path);
570-
}
571-
}
572-
else
573-
{
574-
n_attempts = 0; /* Reset counter of attempts because GC is in progress */
575-
}
550+
/* No GC is active for this segment */
551+
break;
576552
}
577-
if (revokeLock
578-
/* use gc_started flag to prevent race condition with other backends and GC */
579-
&& pg_atomic_test_set_flag(&cfs_state->gc_started))
580-
{
581-
/* Ugggh... looks like last GC was interrupted.
582-
* Try to recover the file.
583-
*/
584-
char* map_bck_path = psprintf("%s.cfm.bck", file_path);
585-
char* file_bck_path = psprintf("%s.bck", file_path);
586553

587-
elog(WARNING, "CFS indicates that GC of %s was interrupted: try to perform recovery", file_path);
554+
if (pg_atomic_read_u32(&cfs_state->n_active_gc) == 0)
555+
{
556+
/* There is no active GC, so lock is set by crashed GC */
588557

589-
if (access(file_bck_path, R_OK) != 0)
590-
{
591-
/* There is no backup file: new map should be constructed */
592-
int md2 = open(map_bck_path, O_RDWR|PG_BINARY, 0);
593-
if (md2 >= 0)
594-
{
595-
/* Recover map. */
596-
if (!cfs_read_file(md2, map, sizeof(FileMap)))
597-
elog(WARNING, "CFS failed to read file %s: %m", map_bck_path);
558+
LWLockAcquire(CfsGcLock, LW_EXCLUSIVE); /* Prevent race condition with GC */
598559

599-
close(md2);
600-
}
601-
}
602-
else
560+
/* Recheck under CfsGcLock that map->lock was not released */
561+
if (pg_atomic_read_u32(&map->lock) >= CFS_GC_LOCK)
603562
{
604-
/* Presence of backup file means that we still have
605-
* unchanged data and map files. Just remove backup files and
606-
* revoke GC lock.
563+
/* Uhhh... looks like last GC was interrupted.
564+
* Try to recover the file.
607565
*/
608-
unlink(file_bck_path);
609-
unlink(map_bck_path);
566+
char* map_bck_path = psprintf("%s.cfm.bck", file_path);
567+
char* file_bck_path = psprintf("%s.bck", file_path);
568+
569+
elog(WARNING, "CFS indicates that GC of %s was interrupted: try to perform recovery", file_path);
570+
571+
if (access(file_bck_path, R_OK) != 0)
572+
{
573+
/* There is no backup file: new map should be constructed */
574+
int md2 = open(map_bck_path, O_RDWR|PG_BINARY, 0);
575+
if (md2 >= 0)
576+
{
577+
/* Recover map. */
578+
if (!cfs_read_file(md2, map, sizeof(FileMap)))
579+
elog(WARNING, "CFS failed to read file %s: %m", map_bck_path);
580+
581+
close(md2);
582+
}
583+
}
584+
else
585+
{
586+
/* Presence of backup file means that we still have
587+
* unchanged data and map files. Just remove backup files and
588+
* revoke GC lock.
589+
*/
590+
unlink(file_bck_path);
591+
unlink(map_bck_path);
592+
}
593+
594+
count = pg_atomic_fetch_sub_u32(&map->lock, CFS_GC_LOCK); /* revoke GC lock */
595+
Assert((int)count > 0);
596+
pfree(file_bck_path);
597+
pfree(map_bck_path);
610598
}
611-
612-
pg_atomic_clear_flag(&cfs_state->gc_started);
613-
count = pg_atomic_fetch_sub_u32(&map->lock, CFS_GC_LOCK); /* revoke GC lock */
614-
Assert((int)count > 0);
615-
pfree(file_bck_path);
616-
pfree(map_bck_path);
599+
LWLockRelease(CfsGcLock);
617600
break;
618-
}
601+
}
602+
/* Wait until GC of segment is completed */
619603
pg_atomic_fetch_sub_u32(&map->lock, 1);
620604
pg_usleep(delay);
621605
if (delay < CFS_LOCK_MAX_TIMEOUT)
606+
{
622607
delay *= 2;
608+
}
623609
}
624610

625611
if (IsUnderPostmaster && cfs_gc_workers != 0
@@ -649,11 +635,11 @@ static int cfs_cmp_page_offs(void const* p1, void const* p2)
649635
/*
650636
* Perform garbage collection (if required) on the file
651637
* @param map_path - path to the map file (*.cfm).
652-
* @param noerror - surpress error message (when this function is called by cfs_gc_relation until there are available segments)
638+
* @param bacground - GC is performed in background by BGW: surpress error message and set CfsGcLock
653639
*/
654-
static bool cfs_gc_file(char* map_path, bool noerror)
640+
static bool cfs_gc_file(char* map_path, bool background)
655641
{
656-
int md = open(map_path, O_RDWR|PG_BINARY, 0);
642+
int md;
657643
FileMap* map;
658644
uint32 physSize;
659645
uint32 usedSize;
@@ -663,29 +649,33 @@ static bool cfs_gc_file(char* map_path, bool noerror)
663649
int fd2 = -1;
664650
int md2 = -1;
665651
bool succeed = false;
652+
int rc;
653+
666654

667655
pg_atomic_fetch_add_u32(&cfs_state->n_active_gc, 1);
668656

669-
while (!cfs_state->gc_enabled)
657+
if (background)
670658
{
671-
int rc;
672-
673-
pg_atomic_fetch_sub_u32(&cfs_state->n_active_gc, 1);
674-
675-
rc = WaitLatch(MyLatch,
676-
WL_TIMEOUT | WL_POSTMASTER_DEATH,
677-
CFS_DISABLE_TIMEOUT /* ms */);
678-
if (cfs_gc_stop || (rc & WL_POSTMASTER_DEATH))
679-
exit(1);
659+
while (!cfs_state->gc_enabled)
660+
{
661+
pg_atomic_fetch_sub_u32(&cfs_state->n_active_gc, 1);
662+
663+
rc = WaitLatch(MyLatch,
664+
WL_TIMEOUT | WL_POSTMASTER_DEATH,
665+
CFS_DISABLE_TIMEOUT /* ms */);
666+
if (cfs_gc_stop || (rc & WL_POSTMASTER_DEATH))
667+
exit(1);
668+
669+
pg_atomic_fetch_add_u32(&cfs_state->n_active_gc, 1);
670+
}
680671

681-
pg_atomic_fetch_add_u32(&cfs_state->n_active_gc, 1);
672+
LWLockAcquire(CfsGcLock, LW_SHARED); /* avoid race condition with cfs_file_lock */
682673
}
683674

675+
md = open(map_path, O_RDWR|PG_BINARY, 0);
684676
if (md < 0)
685677
{
686-
if (!noerror) {
687-
elog(WARNING, "CFS failed to open map file %s: %m", map_path);
688-
}
678+
elog(DEBUG1, "CFS failed to open map file %s: %m", map_path);
689679
goto FinishGC;
690680
}
691681

@@ -1032,7 +1022,12 @@ static bool cfs_gc_file(char* map_path, bool noerror)
10321022
}
10331023

10341024
FinishGC:
1025+
if (background)
1026+
{
1027+
LWLockRelease(CfsGcLock);
1028+
}
10351029
pg_atomic_fetch_sub_u32(&cfs_state->n_active_gc, 1);
1030+
10361031
return succeed;
10371032
}
10381033

@@ -1066,7 +1061,7 @@ static bool cfs_gc_directory(int worker_id, char const* path)
10661061
strcmp(file_path + len - 4, ".cfm") == 0)
10671062
{
10681063
if (entry->d_ino % cfs_state->n_workers == worker_id
1069-
&& !cfs_gc_file(file_path, false))
1064+
&& !cfs_gc_file(file_path, true))
10701065
{
10711066
success = false;
10721067
break;
@@ -1395,31 +1390,35 @@ Datum cfs_fragmentation(PG_FUNCTION_ARGS)
13951390

13961391
Datum cfs_gc_relation(PG_FUNCTION_ARGS)
13971392
{
1398-
cfs_gc_processed_segments = 0;
1399-
1400-
if (cfs_gc_workers == 0 && pg_atomic_test_set_flag(&cfs_state->gc_started))
1393+
Oid oid = PG_GETARG_OID(0);
1394+
Relation rel = try_relation_open(oid, AccessShareLock);
1395+
int processed_segments = 0;
1396+
1397+
if (rel != NULL)
14011398
{
1402-
Oid oid = PG_GETARG_OID(0);
1403-
Relation rel = try_relation_open(oid, AccessShareLock);
1404-
1405-
if (rel != NULL)
1406-
{
1407-
char* path = relpathbackend(rel->rd_node, rel->rd_backend, MAIN_FORKNUM);
1408-
char* map_path = (char*)palloc(strlen(path) + 16);
1409-
int i = 0;
1410-
sprintf(map_path, "%s.cfm", path);
1399+
char* path;
1400+
char* map_path;
1401+
int i = 0;
1402+
1403+
LWLockAcquire(CfsGcLock, LW_EXCLUSIVE); /* Prevent interaction with background GC */
1404+
1405+
processed_segments = cfs_gc_processed_segments;
1406+
1407+
path = relpathbackend(rel->rd_node, rel->rd_backend, MAIN_FORKNUM);
1408+
map_path = (char*)palloc(strlen(path) + 16);
1409+
sprintf(map_path, "%s.cfm", path);
14111410

1412-
while (true)
1413-
{
1414-
if (!cfs_gc_file(map_path, true))
1415-
break;
1416-
sprintf(map_path, "%s.%u.cfm", path, ++i);
1417-
}
1418-
pfree(path);
1419-
pfree(map_path);
1420-
relation_close(rel, AccessShareLock);
1411+
while (cfs_gc_file(map_path, false))
1412+
{
1413+
sprintf(map_path, "%s.%u.cfm", path, ++i);
14211414
}
1422-
pg_atomic_clear_flag(&cfs_state->gc_started);
1415+
pfree(path);
1416+
pfree(map_path);
1417+
relation_close(rel, AccessShareLock);
1418+
1419+
processed_segments -= cfs_gc_processed_segments;
1420+
1421+
LWLockRelease(CfsGcLock);
14231422
}
14241423
PG_RETURN_INT32(cfs_gc_processed_segments);
14251424
}

src/include/storage/cfs.h

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
#include "port/atomics.h"
77
#include "storage/rijndael.h"
88

9-
#define CFS_VERSION "0.24"
9+
#define CFS_VERSION "0.25"
1010

1111
#define CFS_GC_LOCK 0x10000000
1212

@@ -16,15 +16,6 @@
1616
#define CFS_RETRY_TIMEOUT 10000 /* milliseconds: delay between GC iterations in case of error */
1717
#define CFS_ESTIMATE_PROBES 10
1818

19-
/*
20-
* Maximal number of attempts backend should perform to lock file segment.
21-
* If during all this attempts gc_started flag is not set (it means that no GC is active at this moment),
22-
* then it means that for some reasons GC process was crashed (may be together with other postgres processes or even OS)
23-
* without releasing lock. And for some reasons recovery was not performed and this page left locked. Цe should revoke this lock
24-
* to allow access to this database segment.
25-
*/
26-
#define MAX_LOCK_ATTEMPTS 100
27-
2819
/* Maximal size of buffer for compressing (size) bytes where (size) is equal to PostgreSQL page size.
2920
* Some compression algorithms requires to provide buffer large enough for worst case and immediately returns error is buffer is not enough.
3021
* Accurate calculation of required buffer size is not needed here and doubling buffer size works for all used compression algorithms. */

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy