Skip to content

Commit 896ddf9

Browse files
committed
Avoid fragmentation of logical tapes when writing concurrently.
Disk-based HashAgg relies on writing to multiple tapes concurrently. Avoid fragmentation of the tapes' blocks by preallocating many blocks for a tape at once. No file operations are performed during preallocation; only the block numbers are reserved.

Reviewed-by: Tomas Vondra
Discussion: https://postgr.es/m/20200519151202.u2p2gpiawoaznsv2%40development
1 parent 49223e1 commit 896ddf9

File tree

1 file changed

+77
-3
lines changed

1 file changed

+77
-3
lines changed

src/backend/utils/sort/logtape.c

Lines changed: 77 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -110,6 +110,18 @@ typedef struct TapeBlockTrailer
110110
#define TapeBlockSetNBytes(buf, nbytes) \
111111
(TapeBlockGetTrailer(buf)->next = -(nbytes))
112112

113+
/*
114+
* When multiple tapes are being written to concurrently (as in HashAgg),
115+
* avoid excessive fragmentation by preallocating block numbers to individual
116+
* tapes. Each successive preallocation doubles in size, starting at
117+
* TAPE_WRITE_PREALLOC_MIN blocks and capped at TAPE_WRITE_PREALLOC_MAX blocks.
118+
*
119+
* No filesystem operations are performed for preallocation; only the block
120+
* numbers are reserved. This may lead to sparse writes, which will cause
121+
* ltsWriteBlock() to fill in holes with zeros.
122+
*/
123+
#define TAPE_WRITE_PREALLOC_MIN 8
124+
#define TAPE_WRITE_PREALLOC_MAX 128
113125

114126
/*
115127
* This data structure represents a single "logical tape" within the set
@@ -151,6 +163,15 @@ typedef struct LogicalTape
151163
int max_size; /* highest useful, safe buffer_size */
152164
int pos; /* next read/write position in buffer */
153165
int nbytes; /* total # of valid bytes in buffer */
166+
167+
/*
168+
* Preallocated block numbers are held in an array sorted in descending
169+
* order; blocks are consumed from the end of the array (lowest block
170+
* numbers first).
171+
*/
172+
long *prealloc;
173+
int nprealloc; /* number of elements in list */
174+
int prealloc_size; /* number of elements list can hold */
154175
} LogicalTape;
155176

156177
/*
@@ -198,6 +219,7 @@ struct LogicalTapeSet
198219
static void ltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer);
199220
static void ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer);
200221
static long ltsGetFreeBlock(LogicalTapeSet *lts);
222+
static long ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt);
201223
static void ltsReleaseBlock(LogicalTapeSet *lts, long blocknum);
202224
static void ltsConcatWorkerTapes(LogicalTapeSet *lts, TapeShare *shared,
203225
SharedFileSet *fileset);
@@ -397,6 +419,45 @@ ltsGetFreeBlock(LogicalTapeSet *lts)
397419
return blocknum;
398420
}
399421

422+
/*
423+
* Return the lowest free block number from the tape's preallocation list.
424+
* Refill the preallocation list if necessary.
425+
*/
426+
static long
427+
ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt)
428+
{
429+
/* sorted in descending order, so return the last element */
430+
if (lt->nprealloc > 0)
431+
return lt->prealloc[--lt->nprealloc];
432+
433+
if (lt->prealloc == NULL)
434+
{
435+
lt->prealloc_size = TAPE_WRITE_PREALLOC_MIN;
436+
lt->prealloc = (long *) palloc(sizeof(long) * lt->prealloc_size);
437+
}
438+
else if (lt->prealloc_size < TAPE_WRITE_PREALLOC_MAX)
439+
{
440+
/* when the preallocation list runs out, double the size */
441+
lt->prealloc_size *= 2;
442+
if (lt->prealloc_size > TAPE_WRITE_PREALLOC_MAX)
443+
lt->prealloc_size = TAPE_WRITE_PREALLOC_MAX;
444+
lt->prealloc = (long *) repalloc(lt->prealloc,
445+
sizeof(long) * lt->prealloc_size);
446+
}
447+
448+
/* refill preallocation list */
449+
lt->nprealloc = lt->prealloc_size;
450+
for (int i = lt->nprealloc; i > 0; i--)
451+
{
452+
lt->prealloc[i - 1] = ltsGetFreeBlock(lts);
453+
454+
/* verify descending order */
455+
Assert(i == lt->nprealloc || lt->prealloc[i - 1] > lt->prealloc[i]);
456+
}
457+
458+
return lt->prealloc[--lt->nprealloc];
459+
}
460+
400461
/*
401462
* Return a block# to the freelist.
402463
*/
@@ -557,6 +618,9 @@ ltsInitTape(LogicalTape *lt)
557618
lt->max_size = MaxAllocSize;
558619
lt->pos = 0;
559620
lt->nbytes = 0;
621+
lt->prealloc = NULL;
622+
lt->nprealloc = 0;
623+
lt->prealloc_size = 0;
560624
}
561625

562626
/*
@@ -709,7 +773,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
709773
Assert(lt->firstBlockNumber == -1);
710774
Assert(lt->pos == 0);
711775

712-
lt->curBlockNumber = ltsGetFreeBlock(lts);
776+
lt->curBlockNumber = ltsGetPreallocBlock(lts, lt);
713777
lt->firstBlockNumber = lt->curBlockNumber;
714778

715779
TapeBlockGetTrailer(lt->buffer)->prev = -1L;
@@ -733,7 +797,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
733797
* First allocate the next block, so that we can store it in the
734798
* 'next' pointer of this block.
735799
*/
736-
nextBlockNumber = ltsGetFreeBlock(lts);
800+
nextBlockNumber = ltsGetPreallocBlock(lts, lt);
737801

738802
/* set the next-pointer and dump the current block. */
739803
TapeBlockGetTrailer(lt->buffer)->next = nextBlockNumber;
@@ -835,13 +899,23 @@ LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size)
835899
Assert(lt->frozen);
836900
}
837901

838-
/* Allocate a read buffer (unless the tape is empty) */
839902
if (lt->buffer)
840903
pfree(lt->buffer);
841904

842905
/* the buffer is lazily allocated, but set the size here */
843906
lt->buffer = NULL;
844907
lt->buffer_size = buffer_size;
908+
909+
/* free the preallocation list, releasing unused block numbers to the freelist */
910+
if (lt->prealloc != NULL)
911+
{
912+
for (int i = lt->nprealloc; i > 0; i--)
913+
ltsReleaseBlock(lts, lt->prealloc[i - 1]);
914+
pfree(lt->prealloc);
915+
lt->prealloc = NULL;
916+
lt->nprealloc = 0;
917+
lt->prealloc_size = 0;
918+
}
845919
}
846920

847921
/*

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy