Skip to content

Commit 30ba0d0

Browse files
committed
Recursively fsync() the data directory after a crash.
Otherwise, if there's another crash, some writes from after the first crash might make it to disk while writes from before the crash fail to make it to disk. This could lead to data corruption. Back-patch to all supported versions. Abhijit Menon-Sen, reviewed by Andres Freund and slightly revised by me.
1 parent f1d7516 commit 30ba0d0

File tree

5 files changed

+160
-1
lines changed

5 files changed

+160
-1
lines changed

src/backend/access/transam/xlog.c

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -628,6 +628,7 @@ static bool read_backup_label(XLogRecPtr *checkPointLoc);
628628
static void rm_redo_error_callback(void *arg);
629629
static int get_sync_bit(int method);
630630

631+
static void fsync_pgdata(char *datadir);
631632

632633
/*
633634
* Insert an XLOG record having the specified RMID and info bytes,
@@ -5924,6 +5925,18 @@ StartupXLOG(void)
59245925
(errmsg("database system was interrupted; last known up at %s",
59255926
str_time(ControlFile->time))));
59265927

5928+
/*
5929+
* If we previously crashed, there might be data which we had written,
5930+
* intending to fsync it, but which we had not actually fsync'd yet.
5931+
* Therefore, a power failure in the near future might cause earlier
5932+
* unflushed writes to be lost, even though more recent data written to
5933+
* disk from here on would be persisted. To avoid that, fsync the entire
5934+
* data directory.
5935+
*/
5936+
if (ControlFile->state != DB_SHUTDOWNED &&
5937+
ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
5938+
fsync_pgdata(data_directory);
5939+
59275940
/* This is just to allow attaching to startup process with a debugger */
59285941
#ifdef XLOG_REPLAY_DELAY
59295942
if (ControlFile->state != DB_SHUTDOWNED)
@@ -9967,3 +9980,31 @@ CheckForStandbyTrigger(void)
99679980
}
99689981
return false;
99699982
}
9983+
9984+
/*
9985+
* Issue fsync recursively on PGDATA and all its contents.
9986+
*/
9987+
static void
9988+
fsync_pgdata(char *datadir)
9989+
{
9990+
if (!enableFsync)
9991+
return;
9992+
9993+
/*
9994+
* If possible, hint to the kernel that we're soon going to fsync
9995+
* the data directory and its contents.
9996+
*/
9997+
#if defined(HAVE_SYNC_FILE_RANGE) || \
9998+
(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
9999+
walkdir(datadir, pre_sync_fname);
10000+
#endif
10001+
10002+
/*
10003+
* Now we do the fsync()s in the same order.
10004+
*
10005+
* It's important to fsync the destination directory itself as individual
10006+
* file fsyncs don't guarantee that the directory entry for the file is
10007+
* synced.
10008+
*/
10009+
walkdir(datadir, fsync_fname);
10010+
}

src/backend/storage/file/fd.c

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2004,3 +2004,118 @@ fsync_fname(char *fname, bool isdir)
20042004

20052005
close(fd);
20062006
}
2007+
2008+
/*
2009+
* Hint to the OS that it should get ready to fsync() this file.
2010+
*
2011+
* Adapted from pre_sync_fname in initdb.c
2012+
*/
2013+
void
2014+
pre_sync_fname(char *fname, bool isdir)
2015+
{
2016+
int fd;
2017+
2018+
fd = open(fname, O_RDONLY | PG_BINARY);
2019+
2020+
/*
2021+
* Some OSs don't allow us to open directories at all (Windows returns
2022+
* EACCES)
2023+
*/
2024+
if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
2025+
return;
2026+
2027+
if (fd < 0)
2028+
ereport(FATAL,
2029+
(errmsg("could not open file \"%s\" before fsync",
2030+
fname)));
2031+
2032+
pg_flush_data(fd, 0, 0);
2033+
2034+
close(fd);
2035+
}
2036+
2037+
/*
2038+
* walkdir: recursively walk a directory, applying the action to each
2039+
* regular file and directory (including the named directory itself)
2040+
* and following symbolic links.
2041+
*
2042+
* NB: There is another version of walkdir in initdb.c, but that version
2043+
* behaves differently with respect to symbolic links. Caveat emptor!
2044+
*/
2045+
void
2046+
walkdir(char *path, void (*action) (char *fname, bool isdir))
2047+
{
2048+
DIR *dir;
2049+
struct dirent *de;
2050+
2051+
dir = AllocateDir(path);
2052+
while ((de = ReadDir(dir, path)) != NULL)
2053+
{
2054+
char subpath[MAXPGPATH];
2055+
struct stat fst;
2056+
2057+
CHECK_FOR_INTERRUPTS();
2058+
2059+
if (strcmp(de->d_name, ".") == 0 ||
2060+
strcmp(de->d_name, "..") == 0)
2061+
continue;
2062+
2063+
snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name);
2064+
2065+
if (lstat(subpath, &fst) < 0)
2066+
ereport(ERROR,
2067+
(errcode_for_file_access(),
2068+
errmsg("could not stat file \"%s\": %m", subpath)));
2069+
2070+
if (S_ISREG(fst.st_mode))
2071+
(*action) (subpath, false);
2072+
else if (S_ISDIR(fst.st_mode))
2073+
walkdir(subpath, action);
2074+
#ifndef WIN32
2075+
else if (S_ISLNK(fst.st_mode))
2076+
#else
2077+
else if (pg_win32_is_junction(subpath))
2078+
#endif
2079+
{
2080+
#if defined(HAVE_READLINK) || defined(WIN32)
2081+
char linkpath[MAXPGPATH];
2082+
int len;
2083+
struct stat lst;
2084+
2085+
len = readlink(subpath, linkpath, sizeof(linkpath)-1);
2086+
if (len < 0)
2087+
ereport(ERROR,
2088+
(errcode_for_file_access(),
2089+
errmsg("could not read symbolic link \"%s\": %m",
2090+
subpath)));
2091+
2092+
if (len >= sizeof(linkpath)-1)
2093+
ereport(ERROR,
2094+
(errmsg("symbolic link \"%s\" target is too long",
2095+
subpath)));
2096+
2097+
linkpath[len] = '\0';
2098+
2099+
if (lstat(linkpath, &lst) == 0)
2100+
{
2101+
if (S_ISREG(lst.st_mode))
2102+
(*action) (linkpath, false);
2103+
else if (S_ISDIR(lst.st_mode))
2104+
walkdir(subpath, action);
2105+
}
2106+
else if (errno != ENOENT)
2107+
ereport(ERROR,
2108+
(errcode_for_file_access(),
2109+
errmsg("could not stat file \"%s\": %m", linkpath)));
2110+
#else
2111+
ereport(WARNING,
2112+
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
2113+
errmsg("this platform does not support symbolic links; ignoring \"%s\"",
2114+
subpath)));
2115+
#endif
2116+
}
2117+
}
2118+
FreeDir(dir);
2119+
2120+
(*action) (path, true);
2121+
}

src/backend/utils/misc/guc.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,7 @@ int trace_recovery_messages = LOG;
377377

378378
int num_temp_buffers = 1000;
379379

380+
char *data_directory;
380381
char *ConfigFileName;
381382
char *HbaFileName;
382383
char *IdentFileName;
@@ -414,7 +415,6 @@ static char *timezone_string;
414415
static char *log_timezone_string;
415416
static char *timezone_abbreviations_string;
416417
static char *XactIsoLevel_string;
417-
static char *data_directory;
418418
static char *custom_variable_classes;
419419
static int max_function_args;
420420
static int max_index_keys;

src/include/storage/fd.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,8 @@ extern int pg_fsync_writethrough(int fd);
100100
extern int pg_fdatasync(int fd);
101101
extern int pg_flush_data(int fd, off_t offset, off_t amount);
102102
extern void fsync_fname(char *fname, bool isdir);
103+
extern void pre_sync_fname(char *fname, bool isdir);
104+
extern void walkdir(char *path, void (*action) (char *fname, bool isdir));
103105

104106
/* Filename components for OpenTemporaryFile */
105107
#define PG_TEMP_FILES_DIR "pgsql_tmp"

src/include/utils/guc.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ extern int log_temp_files;
176176

177177
extern int num_temp_buffers;
178178

179+
extern char *data_directory;
179180
extern char *ConfigFileName;
180181
extern char *HbaFileName;
181182
extern char *IdentFileName;

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy