Skip to content

Commit 1aba62e

Browse files
committed
Allow per-tablespace effective_io_concurrency
Per discussion, nowadays it is possible to have tablespaces that have wildly different I/O characteristics from others. Setting different effective_io_concurrency parameters for those has been measured to improve performance. Author: Julien Rouhaud Reviewed by: Andres Freund
1 parent 665a00c commit 1aba62e

File tree

12 files changed

+145
-63
lines changed

12 files changed

+145
-63
lines changed

doc/src/sgml/config.sgml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1901,7 +1901,10 @@ include_dir 'conf.d'
19011901
</para>
19021902

19031903
<para>
1904-
The default is 1 on supported systems, otherwise 0.
1904+
The default is 1 on supported systems, otherwise 0. This value can
1905+
be overridden for tables in a particular tablespace by setting the
1906+
tablespace parameter of the same name (see
1907+
<xref linkend="sql-altertablespace">).
19051908
</para>
19061909
</listitem>
19071910
</varlistentry>

doc/src/sgml/ref/create_tablespace.sgml

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -104,14 +104,15 @@ CREATE TABLESPACE <replaceable class="parameter">tablespace_name</replaceable>
104104
<listitem>
105105
<para>
106106
A tablespace parameter to be set or reset. Currently, the only
107-
available parameters are <varname>seq_page_cost</> and
108-
<varname>random_page_cost</>. Setting either value for a particular
109-
tablespace will override the planner's usual estimate of the cost of
110-
reading pages from tables in that tablespace, as established by
111-
the configuration parameters of the same name (see
112-
<xref linkend="guc-seq-page-cost">,
113-
<xref linkend="guc-random-page-cost">). This may be useful if one
114-
tablespace is located on a disk which is faster or slower than the
107+
available parameters are <varname>seq_page_cost</>,
108+
<varname>random_page_cost</> and <varname>effective_io_concurrency</>.
109+
Setting any of these values for a particular tablespace will override the
110+
planner's usual estimate of the cost of reading pages from tables in
111+
that tablespace, as established by the configuration parameters of the
112+
same name (see <xref linkend="guc-seq-page-cost">,
113+
<xref linkend="guc-random-page-cost">,
114+
<xref linkend="guc-effective-io-concurrency">). This may be useful if
115+
one tablespace is located on a disk which is faster or slower than the
115116
remainder of the I/O subsystem.
116117
</para>
117118
</listitem>

src/backend/access/common/reloptions.c

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,19 @@ static relopt_int intRelOpts[] =
254254
},
255255
-1, 64, MAX_KILOBYTES
256256
},
257+
{
258+
{
259+
"effective_io_concurrency",
260+
"Number of simultaneous requests that can be handled efficiently by the disk subsystem.",
261+
RELOPT_KIND_TABLESPACE,
262+
AccessExclusiveLock
263+
},
264+
#ifdef USE_PREFETCH
265+
-1, 0, MAX_IO_CONCURRENCY
266+
#else
267+
0, 0, 0
268+
#endif
269+
},
257270

258271
/* list terminator */
259272
{{NULL}}
@@ -1438,7 +1451,8 @@ tablespace_reloptions(Datum reloptions, bool validate)
14381451
int numoptions;
14391452
static const relopt_parse_elt tab[] = {
14401453
{"random_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, random_page_cost)},
1441-
{"seq_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, seq_page_cost)}
1454+
{"seq_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, seq_page_cost)},
1455+
{"effective_io_concurrency", RELOPT_TYPE_INT, offsetof(TableSpaceOpts, effective_io_concurrency)}
14421456
};
14431457

14441458
options = parseRelOptions(reloptions, validate, RELOPT_KIND_TABLESPACE,

src/backend/executor/nodeBitmapHeapscan.c

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
#include "storage/predicate.h"
4545
#include "utils/memutils.h"
4646
#include "utils/rel.h"
47+
#include "utils/spccache.h"
4748
#include "utils/snapmgr.h"
4849
#include "utils/tqual.h"
4950

@@ -95,9 +96,8 @@ BitmapHeapNext(BitmapHeapScanState *node)
9596
* prefetching. node->prefetch_pages tracks exactly how many pages ahead
9697
* the prefetch iterator is. Also, node->prefetch_target tracks the
9798
* desired prefetch distance, which starts small and increases up to the
98-
* GUC-controlled maximum, target_prefetch_pages. This is to avoid doing
99-
* a lot of prefetching in a scan that stops after a few tuples because of
100-
* a LIMIT.
99+
* node->prefetch_maximum. This is to avoid doing a lot of prefetching in
100+
* a scan that stops after a few tuples because of a LIMIT.
101101
*/
102102
if (tbm == NULL)
103103
{
@@ -111,7 +111,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
111111
node->tbmres = tbmres = NULL;
112112

113113
#ifdef USE_PREFETCH
114-
if (target_prefetch_pages > 0)
114+
if (node->prefetch_maximum > 0)
115115
{
116116
node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm);
117117
node->prefetch_pages = 0;
@@ -188,10 +188,10 @@ BitmapHeapNext(BitmapHeapScanState *node)
188188
* page/tuple, then to one after the second tuple is fetched, then
189189
* it doubles as later pages are fetched.
190190
*/
191-
if (node->prefetch_target >= target_prefetch_pages)
191+
if (node->prefetch_target >= node->prefetch_maximum)
192192
/* don't increase any further */ ;
193-
else if (node->prefetch_target >= target_prefetch_pages / 2)
194-
node->prefetch_target = target_prefetch_pages;
193+
else if (node->prefetch_target >= node->prefetch_maximum / 2)
194+
node->prefetch_target = node->prefetch_maximum;
195195
else if (node->prefetch_target > 0)
196196
node->prefetch_target *= 2;
197197
else
@@ -211,7 +211,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
211211
* Try to prefetch at least a few pages even before we get to the
212212
* second page if we don't stop reading after the first tuple.
213213
*/
214-
if (node->prefetch_target < target_prefetch_pages)
214+
if (node->prefetch_target < node->prefetch_maximum)
215215
node->prefetch_target++;
216216
#endif /* USE_PREFETCH */
217217
}
@@ -539,6 +539,7 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
539539
{
540540
BitmapHeapScanState *scanstate;
541541
Relation currentRelation;
542+
int io_concurrency;
542543

543544
/* check for unsupported flags */
544545
Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
@@ -564,6 +565,8 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
564565
scanstate->prefetch_iterator = NULL;
565566
scanstate->prefetch_pages = 0;
566567
scanstate->prefetch_target = 0;
568+
/* may be updated below */
569+
scanstate->prefetch_maximum = target_prefetch_pages;
567570

568571
/*
569572
* Miscellaneous initialization
@@ -598,6 +601,22 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
598601
*/
599602
currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
600603

604+
/*
605+
* Determine the maximum for prefetch_target. If the tablespace has a
606+
* specific IO concurrency set, use that to compute the corresponding
607+
* maximum value; otherwise, we already initialized to the value computed
608+
* by the GUC machinery.
609+
*/
610+
io_concurrency =
611+
get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace);
612+
if (io_concurrency != effective_io_concurrency)
613+
{
614+
double maximum;
615+
616+
if (ComputeIoConcurrency(io_concurrency, &maximum))
617+
scanstate->prefetch_maximum = rint(maximum);
618+
}
619+
601620
scanstate->ss.ss_currentRelation = currentRelation;
602621

603622
/*

src/backend/storage/buffer/bufmgr.c

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,11 +80,14 @@ bool zero_damaged_pages = false;
8080
int bgwriter_lru_maxpages = 100;
8181
double bgwriter_lru_multiplier = 2.0;
8282
bool track_io_timing = false;
83+
int effective_io_concurrency = 0;
8384

8485
/*
8586
* How many buffers PrefetchBuffer callers should try to stay ahead of their
8687
* ReadBuffer calls by. This is maintained by the assign hook for
87-
* effective_io_concurrency. Zero means "never prefetch".
88+
* effective_io_concurrency. Zero means "never prefetch". This value is
89+
* only used for buffers not belonging to tablespaces that have their
90+
* effective_io_concurrency parameter set.
8891
*/
8992
int target_prefetch_pages = 0;
9093

@@ -415,6 +418,64 @@ static void CheckForBufferLeaks(void);
415418
static int rnode_comparator(const void *p1, const void *p2);
416419

417420

421+
/*
422+
* ComputeIoConcurrency -- get the number of pages to prefetch for a given
423+
* number of spindles.
424+
*/
425+
bool
426+
ComputeIoConcurrency(int io_concurrency, double *target)
427+
{
428+
double new_prefetch_pages = 0.0;
429+
int i;
430+
431+
/*
432+
* Make sure the io_concurrency value is within valid range; it may have
433+
* been forced with a manual pg_tablespace update.
434+
*/
435+
io_concurrency = Min(Max(io_concurrency, 0), MAX_IO_CONCURRENCY);
436+
437+
/*----------
438+
* The user-visible GUC parameter is the number of drives (spindles),
439+
* which we need to translate to a number-of-pages-to-prefetch target.
440+
* The target value is stashed in *extra and then assigned to the actual
441+
* variable by assign_effective_io_concurrency.
442+
*
443+
* The expected number of prefetch pages needed to keep N drives busy is:
444+
*
445+
* drives | I/O requests
446+
* -------+----------------
447+
* 1 | 1
448+
* 2 | 2/1 + 2/2 = 3
449+
* 3 | 3/1 + 3/2 + 3/3 = 5 1/2
450+
* 4 | 4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
451+
* n | n * H(n)
452+
*
453+
* This is called the "coupon collector problem" and H(n) is called the
454+
* harmonic series. This could be approximated by n * ln(n), but for
455+
* reasonable numbers of drives we might as well just compute the series.
456+
*
457+
* Alternatively we could set the target to the number of pages necessary
458+
* so that the expected number of active spindles is some arbitrary
459+
* percentage of the total. This sounds the same but is actually slightly
460+
* different. The result ends up being ln(1-P)/ln((n-1)/n) where P is
461+
* that desired fraction.
462+
*
463+
* Experimental results show that both of these formulas aren't aggressive
464+
* enough, but we don't really have any better proposals.
465+
*
466+
* Note that if io_concurrency = 0 (disabled), we must set target = 0.
467+
*----------
468+
*/
469+
470+
for (i = 1; i <= io_concurrency; i++)
471+
new_prefetch_pages += (double) io_concurrency / (double) i;
472+
473+
*target = new_prefetch_pages;
474+
475+
/* This range check shouldn't fail, but let's be paranoid */
476+
return (new_prefetch_pages > 0.0 && new_prefetch_pages < (double) INT_MAX);
477+
}
478+
418479
/*
419480
* PrefetchBuffer -- initiate asynchronous read of a block of a relation
420481
*

src/backend/utils/cache/spccache.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "commands/tablespace.h"
2424
#include "miscadmin.h"
2525
#include "optimizer/cost.h"
26+
#include "storage/bufmgr.h"
2627
#include "utils/catcache.h"
2728
#include "utils/hsearch.h"
2829
#include "utils/inval.h"
@@ -198,3 +199,14 @@ get_tablespace_page_costs(Oid spcid,
198199
*spc_seq_page_cost = spc->opts->seq_page_cost;
199200
}
200201
}
202+
203+
int
204+
get_tablespace_io_concurrency(Oid spcid)
205+
{
206+
TableSpaceCacheEntry *spc = get_tablespace(spcid);
207+
208+
if (!spc->opts || spc->opts->effective_io_concurrency < 0)
209+
return effective_io_concurrency;
210+
else
211+
return spc->opts->effective_io_concurrency;
212+
}

src/backend/utils/misc/guc.c

Lines changed: 3 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -490,7 +490,6 @@ static int wal_block_size;
490490
static bool data_checksums;
491491
static int wal_segment_size;
492492
static bool integer_datetimes;
493-
static int effective_io_concurrency;
494493
static bool assert_enabled;
495494

496495
/* should be static, but commands/variable.c needs to get at this */
@@ -2352,7 +2351,7 @@ static struct config_int ConfigureNamesInt[] =
23522351
},
23532352
&effective_io_concurrency,
23542353
#ifdef USE_PREFETCH
2355-
1, 0, 1000,
2354+
1, 0, MAX_IO_CONCURRENCY,
23562355
#else
23572356
0, 0, 0,
23582357
#endif
@@ -9986,47 +9985,9 @@ static bool
99869985
check_effective_io_concurrency(int *newval, void **extra, GucSource source)
99879986
{
99889987
#ifdef USE_PREFETCH
9989-
double new_prefetch_pages = 0.0;
9990-
int i;
9991-
9992-
/*----------
9993-
* The user-visible GUC parameter is the number of drives (spindles),
9994-
* which we need to translate to a number-of-pages-to-prefetch target.
9995-
* The target value is stashed in *extra and then assigned to the actual
9996-
* variable by assign_effective_io_concurrency.
9997-
*
9998-
* The expected number of prefetch pages needed to keep N drives busy is:
9999-
*
10000-
* drives | I/O requests
10001-
* -------+----------------
10002-
* 1 | 1
10003-
* 2 | 2/1 + 2/2 = 3
10004-
* 3 | 3/1 + 3/2 + 3/3 = 5 1/2
10005-
* 4 | 4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
10006-
* n | n * H(n)
10007-
*
10008-
* This is called the "coupon collector problem" and H(n) is called the
10009-
* harmonic series. This could be approximated by n * ln(n), but for
10010-
* reasonable numbers of drives we might as well just compute the series.
10011-
*
10012-
* Alternatively we could set the target to the number of pages necessary
10013-
* so that the expected number of active spindles is some arbitrary
10014-
* percentage of the total. This sounds the same but is actually slightly
10015-
* different. The result ends up being ln(1-P)/ln((n-1)/n) where P is
10016-
* that desired fraction.
10017-
*
10018-
* Experimental results show that both of these formulas aren't aggressive
10019-
* enough, but we don't really have any better proposals.
10020-
*
10021-
* Note that if *newval = 0 (disabled), we must set target = 0.
10022-
*----------
10023-
*/
10024-
10025-
for (i = 1; i <= *newval; i++)
10026-
new_prefetch_pages += (double) *newval / (double) i;
9988+
double new_prefetch_pages;
100279989

10028-
/* This range check shouldn't fail, but let's be paranoid */
10029-
if (new_prefetch_pages >= 0.0 && new_prefetch_pages < (double) INT_MAX)
9990+
if (ComputeIoConcurrency(*newval, &new_prefetch_pages))
100309991
{
100319992
int *myextra = (int *) guc_malloc(ERROR, sizeof(int));
100329993

src/bin/psql/tab-complete.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1885,7 +1885,7 @@ psql_completion(const char *text, int start, int end)
18851885
pg_strcasecmp(prev_wd, "(") == 0)
18861886
{
18871887
static const char *const list_TABLESPACEOPTIONS[] =
1888-
{"seq_page_cost", "random_page_cost", NULL};
1888+
{"seq_page_cost", "random_page_cost", "effective_io_concurrency", NULL};
18891889

18901890
COMPLETE_WITH_LIST(list_TABLESPACEOPTIONS);
18911891
}

src/include/commands/tablespace.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ typedef struct TableSpaceOpts
3939
int32 vl_len_; /* varlena header (do not touch directly!) */
4040
float8 random_page_cost;
4141
float8 seq_page_cost;
42+
int effective_io_concurrency;
4243
} TableSpaceOpts;
4344

4445
extern Oid CreateTableSpace(CreateTableSpaceStmt *stmt);

src/include/nodes/execnodes.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1424,7 +1424,8 @@ typedef struct BitmapIndexScanState
14241424
* lossy_pages total number of lossy pages retrieved
14251425
* prefetch_iterator iterator for prefetching ahead of current page
14261426
* prefetch_pages # pages prefetch iterator is ahead of current
1427-
* prefetch_target target prefetch distance
1427+
* prefetch_target current target prefetch distance
1428+
* prefetch_maximum maximum value for prefetch_target
14281429
* ----------------
14291430
*/
14301431
typedef struct BitmapHeapScanState
@@ -1439,6 +1440,7 @@ typedef struct BitmapHeapScanState
14391440
TBMIterator *prefetch_iterator;
14401441
int prefetch_pages;
14411442
int prefetch_target;
1443+
int prefetch_maximum;
14421444
} BitmapHeapScanState;
14431445

14441446
/* ----------------

src/include/storage/bufmgr.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,11 +58,17 @@ extern int target_prefetch_pages;
5858
/* in buf_init.c */
5959
extern PGDLLIMPORT char *BufferBlocks;
6060

61+
/* in bufmgr.c */
62+
extern int effective_io_concurrency;
63+
6164
/* in localbuf.c */
6265
extern PGDLLIMPORT int NLocBuffer;
6366
extern PGDLLIMPORT Block *LocalBufferBlockPointers;
6467
extern PGDLLIMPORT int32 *LocalRefCount;
6568

69+
/* upper limit for effective_io_concurrency */
70+
#define MAX_IO_CONCURRENCY 1000
71+
6672
/* special block number for ReadBuffer() */
6773
#define P_NEW InvalidBlockNumber /* grow the file to get a new page */
6874

@@ -144,6 +150,7 @@ extern PGDLLIMPORT int32 *LocalRefCount;
144150
/*
145151
* prototypes for functions in bufmgr.c
146152
*/
153+
extern bool ComputeIoConcurrency(int io_concurrency, double *target);
147154
extern void PrefetchBuffer(Relation reln, ForkNumber forkNum,
148155
BlockNumber blockNum);
149156
extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum);

src/include/utils/spccache.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,5 +15,6 @@
1515

1616
void get_tablespace_page_costs(Oid spcid, float8 *spc_random_page_cost,
1717
float8 *spc_seq_page_cost);
18+
int get_tablespace_io_concurrency(Oid spcid);
1819

1920
#endif /* SPCCACHE_H */

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy