Skip to content

Commit c34bb00

Browse files
committed
Use O_DIRECT if available when using O_SYNC for wal_sync_method.
Also, write multiple WAL buffers out in one write() operation.

ITAGAKI Takahiro

---------------------------------------------------------------------------

> If we disable writeback-cache and use open_sync, the per-page writing
> behavior in WAL module will show up as bad result. O_DIRECT is similar
> to O_DSYNC (at least on linux), so that the benefit of it will disappear
> behind the slow disk revolution.
>
> In the current source, WAL is written as:
>     for (i = 0; i < N; i++) { write(&buffers[i], BLCKSZ); }
> Is this intentional? Can we rewrite it as follows?
>     write(&buffers[0], N * BLCKSZ);
>
> In order to achieve it, I wrote a 'gather-write' patch (xlog.gw.diff).
> Aside from this, I'll also send the fixed direct io patch (xlog.dio.diff).
> These two patches are independent, so they can be applied either or both.
>
>
> I tested them on my machine and the results as follows. It shows that
> direct-io and gather-write is the best choice when writeback-cache is off.
> Are these two patches worth trying if they are used together?
>
>
>             | writeback | fsync= | fdata | open_ | fsync_ | open_
> patch       | cache     | false  | sync  | sync  | direct | direct
> ------------+-----------+--------+-------+-------+--------+---------
> direct io   | off       | 124.2  | 105.7 | 48.3  | 48.3   | 48.2
> direct io   | on        | 129.1  | 112.3 | 114.1 | 142.9  | 144.5
> gather-write| off       | 124.3  | 108.7 | 105.4 | (N/A)  | (N/A)
> both        | off       | 131.5  | 115.5 | 114.4 | 145.4  | 145.2
>
> - 20runs * pgbench -s 100 -c 50 -t 200
> - with tuning (wal_buffers=64, commit_delay=500, checkpoint_segments=8)
> - using 2 ATA disks:
>   - hda(reiserfs) includes system and wal.
>   - hdc(jfs) includes database files. writeback-cache is always on.
>
> ---
> ITAGAKI Takahiro
1 parent 722f31f commit c34bb00

File tree

1 file changed

+149
-48
lines changed
  • src/backend/access/transam

1 file changed

+149
-48
lines changed

src/backend/access/transam/xlog.c

Lines changed: 149 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.210 2005/07/23 15:31:16 momjian Exp $
10+
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.211 2005/07/29 03:22:33 momjian Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -47,31 +47,71 @@
4747
#include "utils/relcache.h"
4848

4949

50+
/*
51+
* Because O_DIRECT bypasses the kernel buffers, and because we never
52+
* read those buffers except during crash recovery, it is a win to use
53+
* it in all cases where we sync on each write(). We could allow O_DIRECT
54+
* with fsync(), but because skipping the kernel buffer forces writes out
55+
* quickly, it seems best just to use it for O_SYNC. It is hard to imagine
56+
* how fsync() could be a win for O_DIRECT compared to O_SYNC and O_DIRECT.
57+
*/
58+
#ifdef O_DIRECT
59+
#define PG_O_DIRECT O_DIRECT
60+
#else
61+
#define PG_O_DIRECT 0
62+
#endif
63+
5064
/*
5165
* This chunk of hackery attempts to determine which file sync methods
5266
* are available on the current platform, and to choose an appropriate
5367
* default method. We assume that fsync() is always available, and that
5468
* configure determined whether fdatasync() is.
5569
*/
5670
#if defined(O_SYNC)
57-
#define OPEN_SYNC_FLAG O_SYNC
71+
#define CMP_OPEN_SYNC_FLAG O_SYNC
5872
#else
5973
#if defined(O_FSYNC)
60-
#define OPEN_SYNC_FLAG O_FSYNC
74+
#define CMP_OPEN_SYNC_FLAG O_FSYNC
6175
#endif
6276
#endif
77+
#define OPEN_SYNC_FLAG (CMP_OPEN_SYNC_FLAG | PG_O_DIRECT)
6378

6479
#if defined(O_DSYNC)
6580
#if defined(OPEN_SYNC_FLAG)
66-
#if O_DSYNC != OPEN_SYNC_FLAG
67-
#define OPEN_DATASYNC_FLAG O_DSYNC
81+
#if O_DSYNC != CMP_OPEN_SYNC_FLAG
82+
#define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT)
6883
#endif
6984
#else /* !defined(OPEN_SYNC_FLAG) */
7085
/* Win32 only has O_DSYNC */
71-
#define OPEN_DATASYNC_FLAG O_DSYNC
86+
#define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT)
7287
#endif
7388
#endif
7489

90+
/*
91+
* The buffer alignment required for direct I/O depends on the OS and filesystem,
92+
* but BLCKSZ is assumed to be enough for it.
93+
*/
94+
#ifdef O_DIRECT
95+
#define ALIGNOF_XLOG_BUFFER BLCKSZ
96+
#else
97+
#define ALIGNOF_XLOG_BUFFER MAXIMUM_ALIGNOF
98+
#endif
99+
100+
/*
101+
* Switch the alignment routine because ShmemAlloc() returns a max-aligned
102+
* buffer and ALIGNOF_XLOG_BUFFER may be greater than MAXIMUM_ALIGNOF.
103+
*/
104+
#if ALIGNOF_XLOG_BUFFER <= MAXIMUM_ALIGNOF
105+
#define XLOG_BUFFER_ALIGN(LEN) MAXALIGN((LEN))
106+
#else
107+
#define XLOG_BUFFER_ALIGN(LEN) ((LEN) + (ALIGNOF_XLOG_BUFFER))
108+
#endif
109+
/* assume sizeof(ptrdiff_t) == sizeof(void*) */
110+
#define POINTERALIGN(ALIGNVAL,PTR) \
111+
((char *)(((ptrdiff_t) (PTR) + (ALIGNVAL-1)) & ~((ptrdiff_t) (ALIGNVAL-1))))
112+
#define XLOG_BUFFER_POINTERALIGN(PTR) \
113+
POINTERALIGN((ALIGNOF_XLOG_BUFFER), (PTR))
114+
75115
#if defined(OPEN_DATASYNC_FLAG)
76116
#define DEFAULT_SYNC_METHOD_STR "open_datasync"
77117
#define DEFAULT_SYNC_METHOD SYNC_METHOD_OPEN
@@ -469,6 +509,17 @@ static void ReadControlFile(void);
469509
static char *str_time(time_t tnow);
470510
static void issue_xlog_fsync(void);
471511

512+
/* XLog gather-write stuff */
513+
typedef struct XLogPages
514+
{
515+
char *head; /* Head of first page */
516+
int size; /* Total bytes of pages == count(pages) * BLCKSZ */
517+
int offset; /* Offset in xlog segment file */
518+
} XLogPages;
519+
static void XLogPageReset(XLogPages *pages);
520+
static void XLogPageWrite(XLogPages *pages, int index);
521+
static void XLogPageFlush(XLogPages *pages, int index);
522+
472523
#ifdef WAL_DEBUG
473524
static void xlog_outrec(char *buf, XLogRecord *record);
474525
#endif
@@ -1245,9 +1296,10 @@ static void
12451296
XLogWrite(XLogwrtRqst WriteRqst)
12461297
{
12471298
XLogCtlWrite *Write = &XLogCtl->Write;
1248-
char *from;
12491299
bool ispartialpage;
12501300
bool use_existent;
1301+
int currentIndex = Write->curridx;
1302+
XLogPages pages;
12511303

12521304
/* We should always be inside a critical section here */
12531305
Assert(CritSectionCount > 0);
@@ -1258,6 +1310,8 @@ XLogWrite(XLogwrtRqst WriteRqst)
12581310
*/
12591311
LogwrtResult = Write->LogwrtResult;
12601312

1313+
XLogPageReset(&pages);
1314+
12611315
while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
12621316
{
12631317
/*
@@ -1266,21 +1320,22 @@ XLogWrite(XLogwrtRqst WriteRqst)
12661320
* end of the last page that's been initialized by
12671321
* AdvanceXLInsertBuffer.
12681322
*/
1269-
if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[Write->curridx]))
1323+
if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[currentIndex]))
12701324
elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
12711325
LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
1272-
XLogCtl->xlblocks[Write->curridx].xlogid,
1273-
XLogCtl->xlblocks[Write->curridx].xrecoff);
1326+
XLogCtl->xlblocks[currentIndex].xlogid,
1327+
XLogCtl->xlblocks[currentIndex].xrecoff);
12741328

12751329
/* Advance LogwrtResult.Write to end of current buffer page */
1276-
LogwrtResult.Write = XLogCtl->xlblocks[Write->curridx];
1330+
LogwrtResult.Write = XLogCtl->xlblocks[currentIndex];
12771331
ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);
12781332

12791333
if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
12801334
{
12811335
/*
12821336
* Switch to new logfile segment.
12831337
*/
1338+
XLogPageFlush(&pages, currentIndex);
12841339
if (openLogFile >= 0)
12851340
{
12861341
if (close(openLogFile))
@@ -1354,31 +1409,8 @@ XLogWrite(XLogwrtRqst WriteRqst)
13541409
openLogOff = 0;
13551410
}
13561411

1357-
/* Need to seek in the file? */
1358-
if (openLogOff != (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize)
1359-
{
1360-
openLogOff = (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize;
1361-
if (lseek(openLogFile, (off_t) openLogOff, SEEK_SET) < 0)
1362-
ereport(PANIC,
1363-
(errcode_for_file_access(),
1364-
errmsg("could not seek in log file %u, segment %u to offset %u: %m",
1365-
openLogId, openLogSeg, openLogOff)));
1366-
}
1367-
1368-
/* OK to write the page */
1369-
from = XLogCtl->pages + Write->curridx * BLCKSZ;
1370-
errno = 0;
1371-
if (write(openLogFile, from, BLCKSZ) != BLCKSZ)
1372-
{
1373-
/* if write didn't set errno, assume problem is no disk space */
1374-
if (errno == 0)
1375-
errno = ENOSPC;
1376-
ereport(PANIC,
1377-
(errcode_for_file_access(),
1378-
errmsg("could not write to log file %u, segment %u at offset %u: %m",
1379-
openLogId, openLogSeg, openLogOff)));
1380-
}
1381-
openLogOff += BLCKSZ;
1412+
/* Add a page to buffer */
1413+
XLogPageWrite(&pages, currentIndex);
13821414

13831415
/*
13841416
* If we just wrote the whole last page of a logfile segment,
@@ -1390,8 +1422,9 @@ XLogWrite(XLogwrtRqst WriteRqst)
13901422
* This is also the right place to notify the Archiver that the
13911423
* segment is ready to copy to archival storage.
13921424
*/
1393-
if (openLogOff >= XLogSegSize && !ispartialpage)
1425+
if (openLogOff + pages.size >= XLogSegSize && !ispartialpage)
13941426
{
1427+
XLogPageFlush(&pages, currentIndex);
13951428
issue_xlog_fsync();
13961429
LogwrtResult.Flush = LogwrtResult.Write; /* end of current page */
13971430

@@ -1405,8 +1438,9 @@ XLogWrite(XLogwrtRqst WriteRqst)
14051438
LogwrtResult.Write = WriteRqst.Write;
14061439
break;
14071440
}
1408-
Write->curridx = NextBufIdx(Write->curridx);
1441+
currentIndex = NextBufIdx(currentIndex);
14091442
}
1443+
XLogPageFlush(&pages, currentIndex);
14101444

14111445
/*
14121446
* If asked to flush, do so
@@ -3584,7 +3618,7 @@ XLOGShmemSize(void)
35843618
if (XLOGbuffers < MinXLOGbuffers)
35853619
XLOGbuffers = MinXLOGbuffers;
35863620

3587-
return MAXALIGN(sizeof(XLogCtlData) + sizeof(XLogRecPtr) * XLOGbuffers)
3621+
return XLOG_BUFFER_ALIGN(sizeof(XLogCtlData) + sizeof(XLogRecPtr) * XLOGbuffers)
35883622
+ BLCKSZ * XLOGbuffers +
35893623
MAXALIGN(sizeof(ControlFileData));
35903624
}
@@ -3601,7 +3635,7 @@ XLOGShmemInit(void)
36013635

36023636
XLogCtl = (XLogCtlData *)
36033637
ShmemInitStruct("XLOG Ctl",
3604-
MAXALIGN(sizeof(XLogCtlData) +
3638+
XLOG_BUFFER_ALIGN(sizeof(XLogCtlData) +
36053639
sizeof(XLogRecPtr) * XLOGbuffers)
36063640
+ BLCKSZ * XLOGbuffers,
36073641
&foundXLog);
@@ -3630,9 +3664,9 @@ XLOGShmemInit(void)
36303664
* Here, on the other hand, we must MAXALIGN to ensure the page
36313665
* buffers have worst-case alignment.
36323666
*/
3633-
XLogCtl->pages =
3634-
((char *) XLogCtl) + MAXALIGN(sizeof(XLogCtlData) +
3635-
sizeof(XLogRecPtr) * XLOGbuffers);
3667+
XLogCtl->pages = XLOG_BUFFER_POINTERALIGN(
3668+
((char *) XLogCtl)
3669+
+ sizeof(XLogCtlData) + sizeof(XLogRecPtr) * XLOGbuffers);
36363670
memset(XLogCtl->pages, 0, BLCKSZ * XLOGbuffers);
36373671

36383672
/*
@@ -3690,10 +3724,9 @@ BootStrapXLOG(void)
36903724
/* First timeline ID is always 1 */
36913725
ThisTimeLineID = 1;
36923726

3693-
/* Use malloc() to ensure buffer is MAXALIGNED */
3694-
buffer = (char *) malloc(BLCKSZ);
3695-
page = (XLogPageHeader) buffer;
3696-
memset(buffer, 0, BLCKSZ);
3727+
buffer = (char *) malloc(BLCKSZ + ALIGNOF_XLOG_BUFFER);
3728+
page = (XLogPageHeader) XLOG_BUFFER_POINTERALIGN(buffer);
3729+
memset(page, 0, BLCKSZ);
36973730

36983731
/* Set up information for the initial checkpoint record */
36993732
checkPoint.redo.xlogid = 0;
@@ -3745,7 +3778,7 @@ BootStrapXLOG(void)
37453778

37463779
/* Write the first page with the initial record */
37473780
errno = 0;
3748-
if (write(openLogFile, buffer, BLCKSZ) != BLCKSZ)
3781+
if (write(openLogFile, page, BLCKSZ) != BLCKSZ)
37493782
{
37503783
/* if write didn't set errno, assume problem is no disk space */
37513784
if (errno == 0)
@@ -5837,3 +5870,71 @@ remove_backup_label(void)
58375870
errmsg("could not remove file \"%s\": %m",
58385871
BACKUP_LABEL_FILE)));
58395872
}
5873+
5874+
5875+
/* XLog gather-write stuff */
5876+
5877+
static void
5878+
XLogPageReset(XLogPages *pages)
5879+
{
5880+
memset(pages, 0, sizeof(*pages));
5881+
}
5882+
5883+
static void
5884+
XLogPageWrite(XLogPages *pages, int index)
5885+
{
5886+
char *page = XLogCtl->pages + index * BLCKSZ;
5887+
int size = BLCKSZ;
5888+
int offset = (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize;
5889+
5890+
if (pages->head + pages->size == page
5891+
&& pages->offset + pages->size == offset)
5892+
{ /* Pages are continuous. Append new page. */
5893+
pages->size += size;
5894+
}
5895+
else
5896+
{ /* Pages are not continuous. Flush and clear. */
5897+
XLogPageFlush(pages, PrevBufIdx(index));
5898+
pages->head = page;
5899+
pages->size = size;
5900+
pages->offset = offset;
5901+
}
5902+
}
5903+
5904+
static void
5905+
XLogPageFlush(XLogPages *pages, int index)
5906+
{
5907+
if (!pages->head)
5908+
{ /* No need to write pages. */
5909+
XLogCtl->Write.curridx = index;
5910+
return;
5911+
}
5912+
5913+
/* Need to seek in the file? */
5914+
if (openLogOff != pages->offset)
5915+
{
5916+
openLogOff = pages->offset;
5917+
if (lseek(openLogFile, (off_t) openLogOff, SEEK_SET) < 0)
5918+
ereport(PANIC,
5919+
(errcode_for_file_access(),
5920+
errmsg("could not seek in log file %u, segment %u to offset %u: %m",
5921+
openLogId, openLogSeg, openLogOff)));
5922+
}
5923+
5924+
/* OK to write the page */
5925+
errno = 0;
5926+
if (write(openLogFile, pages->head, pages->size) != pages->size)
5927+
{
5928+
/* if write didn't set errno, assume problem is no disk space */
5929+
if (errno == 0)
5930+
errno = ENOSPC;
5931+
ereport(PANIC,
5932+
(errcode_for_file_access(),
5933+
errmsg("could not write to log file %u, segment %u at offset %u: %m",
5934+
openLogId, openLogSeg, openLogOff)));
5935+
}
5936+
5937+
openLogOff += pages->size;
5938+
XLogCtl->Write.curridx = index;
5939+
XLogPageReset(pages);
5940+
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy