Skip to content

Commit faeedbc

Browse files
committed
Introduce PG_IO_ALIGN_SIZE and align all I/O buffers.
In order to have the option to use O_DIRECT/FILE_FLAG_NO_BUFFERING in a later commit, we need the addresses of user space buffers to be well aligned. The exact requirements vary by OS and file system (typically sectors and/or memory pages). The address alignment size is set to 4096, which is enough for currently known systems: it matches modern sectors and common memory page size. There is no standard governing O_DIRECT's requirements so we might eventually have to reconsider this with more information from the field or future systems. Aligning I/O buffers on memory pages is also known to improve regular buffered I/O performance. Three classes of I/O buffers for regular data pages are adjusted: (1) Heap buffers are now allocated with the new palloc_aligned() or MemoryContextAllocAligned() functions introduced by commit 439f617. (2) Stack buffers now use a new struct PGIOAlignedBlock to respect PG_IO_ALIGN_SIZE, if possible with this compiler. (3) The buffer pool is also aligned in shared memory. WAL buffers were already aligned on XLOG_BLCKSZ. It's possible for XLOG_BLCKSZ to be configured smaller than PG_IO_ALIGN_SIZE and thus for O_DIRECT WAL writes to fail to be well aligned, but that's a pre-existing condition and will be addressed by a later commit. BufFiles are not yet addressed (there's no current plan to use O_DIRECT for those, but they could potentially get some incidental speedup even in plain buffered I/O operations through better alignment). If we can't align stack objects suitably using the compiler extensions we know about, we disable the use of O_DIRECT by setting PG_O_DIRECT to 0. This avoids the need to consider systems that have O_DIRECT but can't align stack objects the way we want; such systems could in theory be supported with more work but we don't currently know of any such machines, so it's easier to pretend there is no O_DIRECT support instead. That's an existing and tested class of system.
Add assertions that all buffers passed into smgrread(), smgrwrite() and smgrextend() are correctly aligned, unless PG_O_DIRECT is 0 (= stack alignment tricks may be unavailable) or the block size has been set too small to allow arrays of buffers to be all aligned. Author: Thomas Munro <thomas.munro@gmail.com> Author: Andres Freund <andres@anarazel.de> Reviewed-by: Justin Pryzby <pryzby@telsasoft.com> Discussion: https://postgr.es/m/CA+hUKGK1X532hYqJ_MzFWt0n1zt8trz980D79WbjwnT-yYLZpg@mail.gmail.com
1 parent d73c285 commit faeedbc

File tree

26 files changed

+108
-45
lines changed

26 files changed

+108
-45
lines changed

contrib/bloom/blinsert.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ blbuildempty(Relation index)
166166
Page metapage;
167167

168168
/* Construct metapage. */
169-
metapage = (Page) palloc(BLCKSZ);
169+
metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
170170
BloomFillMetapage(index, metapage);
171171

172172
/*

contrib/pg_prewarm/pg_prewarm.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ typedef enum
3636
PREWARM_BUFFER
3737
} PrewarmType;
3838

39-
static PGAlignedBlock blockbuffer;
39+
static PGIOAlignedBlock blockbuffer;
4040

4141
/*
4242
* pg_prewarm(regclass, mode text, fork text,

src/backend/access/gist/gistbuild.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -415,7 +415,7 @@ gist_indexsortbuild(GISTBuildState *state)
415415
* Write an empty page as a placeholder for the root page. It will be
416416
* replaced with the real root page at the end.
417417
*/
418-
page = palloc0(BLCKSZ);
418+
page = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO);
419419
smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO,
420420
page, true);
421421
state->pages_allocated++;
@@ -509,7 +509,8 @@ gist_indexsortbuild_levelstate_add(GISTBuildState *state,
509509
levelstate->current_page++;
510510

511511
if (levelstate->pages[levelstate->current_page] == NULL)
512-
levelstate->pages[levelstate->current_page] = palloc(BLCKSZ);
512+
levelstate->pages[levelstate->current_page] =
513+
palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
513514

514515
newPage = levelstate->pages[levelstate->current_page];
515516
gistinitpage(newPage, old_page_flags);
@@ -579,7 +580,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
579580

580581
/* Create page and copy data */
581582
data = (char *) (dist->list);
582-
target = palloc0(BLCKSZ);
583+
target = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO);
583584
gistinitpage(target, isleaf ? F_LEAF : 0);
584585
for (int i = 0; i < dist->block.num; i++)
585586
{
@@ -630,7 +631,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
630631
if (parent == NULL)
631632
{
632633
parent = palloc0(sizeof(GistSortedBuildLevelState));
633-
parent->pages[0] = (Page) palloc(BLCKSZ);
634+
parent->pages[0] = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
634635
parent->parent = NULL;
635636
gistinitpage(parent->pages[0], 0);
636637

src/backend/access/hash/hashpage.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -992,7 +992,7 @@ static bool
992992
_hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
993993
{
994994
BlockNumber lastblock;
995-
PGAlignedBlock zerobuf;
995+
PGIOAlignedBlock zerobuf;
996996
Page page;
997997
HashPageOpaque ovflopaque;
998998

src/backend/access/heap/rewriteheap.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,7 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
255255

256256
state->rs_old_rel = old_heap;
257257
state->rs_new_rel = new_heap;
258-
state->rs_buffer = (Page) palloc(BLCKSZ);
258+
state->rs_buffer = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
259259
/* new_heap needn't be empty, just locked */
260260
state->rs_blockno = RelationGetNumberOfBlocks(new_heap);
261261
state->rs_buffer_valid = false;

src/backend/access/nbtree/nbtree.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ btbuildempty(Relation index)
154154
Page metapage;
155155

156156
/* Construct metapage. */
157-
metapage = (Page) palloc(BLCKSZ);
157+
metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
158158
_bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
159159

160160
/*

src/backend/access/nbtree/nbtsort.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -619,7 +619,7 @@ _bt_blnewpage(uint32 level)
619619
Page page;
620620
BTPageOpaque opaque;
621621

622-
page = (Page) palloc(BLCKSZ);
622+
page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
623623

624624
/* Zero the page and set up standard page header info */
625625
_bt_pageinit(page, BLCKSZ);
@@ -660,7 +660,9 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
660660
while (blkno > wstate->btws_pages_written)
661661
{
662662
if (!wstate->btws_zeropage)
663-
wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
663+
wstate->btws_zeropage = (Page) palloc_aligned(BLCKSZ,
664+
PG_IO_ALIGN_SIZE,
665+
MCXT_ALLOC_ZERO);
664666
/* don't set checksum for all-zero page */
665667
smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM,
666668
wstate->btws_pages_written++,
@@ -1170,7 +1172,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
11701172
* set to point to "P_NONE"). This changes the index to the "valid" state
11711173
* by filling in a valid magic number in the metapage.
11721174
*/
1173-
metapage = (Page) palloc(BLCKSZ);
1175+
metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
11741176
_bt_initmetapage(metapage, rootblkno, rootlevel,
11751177
wstate->inskey->allequalimage);
11761178
_bt_blwritepage(wstate, metapage, BTREE_METAPAGE);

src/backend/access/spgist/spginsert.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ spgbuildempty(Relation index)
158158
Page page;
159159

160160
/* Construct metapage. */
161-
page = (Page) palloc(BLCKSZ);
161+
page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
162162
SpGistInitMetapage(page);
163163

164164
/*

src/backend/access/transam/generic_xlog.c

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,14 +58,17 @@ typedef struct
5858
char delta[MAX_DELTA_SIZE]; /* delta between page images */
5959
} PageData;
6060

61-
/* State of generic xlog record construction */
61+
/*
62+
* State of generic xlog record construction. Must be allocated at an I/O
63+
* aligned address.
64+
*/
6265
struct GenericXLogState
6366
{
67+
/* Page images (properly aligned, must be first) */
68+
PGIOAlignedBlock images[MAX_GENERIC_XLOG_PAGES];
6469
/* Info about each page, see above */
6570
PageData pages[MAX_GENERIC_XLOG_PAGES];
6671
bool isLogged;
67-
/* Page images (properly aligned) */
68-
PGAlignedBlock images[MAX_GENERIC_XLOG_PAGES];
6972
};
7073

7174
static void writeFragment(PageData *pageData, OffsetNumber offset,
@@ -269,7 +272,9 @@ GenericXLogStart(Relation relation)
269272
GenericXLogState *state;
270273
int i;
271274

272-
state = (GenericXLogState *) palloc(sizeof(GenericXLogState));
275+
state = (GenericXLogState *) palloc_aligned(sizeof(GenericXLogState),
276+
PG_IO_ALIGN_SIZE,
277+
0);
273278
state->isLogged = RelationNeedsWAL(relation);
274279

275280
for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++)

src/backend/access/transam/xlog.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4506,7 +4506,7 @@ XLOGShmemSize(void)
45064506
/* xlblocks array */
45074507
size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
45084508
/* extra alignment padding for XLOG I/O buffers */
4509-
size = add_size(size, XLOG_BLCKSZ);
4509+
size = add_size(size, Max(XLOG_BLCKSZ, PG_IO_ALIGN_SIZE));
45104510
/* and the buffers themselves */
45114511
size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
45124512

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy