Skip to content

Commit ed802e7

Browse files
committed
pgbench: Allow \setrandom to generate Gaussian/exponential distributions.
Mitsumasa KONDO and Fabien COELHO, with further wordsmithing by me.
1 parent e280c63 commit ed802e7

File tree

2 files changed

+231
-13
lines changed

2 files changed

+231
-13
lines changed

contrib/pgbench/pgbench.c

Lines changed: 173 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,8 @@ static int pthread_join(pthread_t th, void **thread_return);
9898
#define LOG_STEP_SECONDS 5 /* seconds between log messages */
9999
#define DEFAULT_NXACTS 10 /* default nxacts */
100100

101+
#define MIN_GAUSSIAN_THRESHOLD 2.0 /* minimum threshold for gauss */
102+
101103
int nxacts = 0; /* number of transactions per client */
102104
int duration = 0; /* duration in seconds */
103105

@@ -471,6 +473,76 @@ getrand(TState *thread, int64 min, int64 max)
471473
return min + (int64) ((max - min + 1) * pg_erand48(thread->random_state));
472474
}
473475

476+
/*
477+
* random number generator: exponential distribution from min to max inclusive.
478+
* The threshold is chosen so that the probability density at the cut-off
479+
* value max is exp(-threshold).
480+
*/
481+
static int64
482+
getExponentialRand(TState *thread, int64 min, int64 max, double threshold)
483+
{
484+
double cut, uniform, rand;
485+
Assert(threshold > 0.0);
486+
cut = exp(-threshold);
487+
/* erand in [0, 1), uniform in (0, 1] */
488+
uniform = 1.0 - pg_erand48(thread->random_state);
489+
/*
490+
* inner expression in (cut, 1] (if threshold > 0),
491+
* rand in [0, 1)
492+
*/
493+
Assert((1.0 - cut) != 0.0);
494+
rand = - log(cut + (1.0 - cut) * uniform) / threshold;
495+
/* return an int64 random number between min and max inclusive */
496+
return min + (int64)((max - min + 1) * rand);
497+
}
498+
499+
/*
 * random number generator: gaussian distribution from min to max inclusive.
 * Samples outside [-threshold, threshold) are rejected and redrawn; the
 * accepted sample is then scaled linearly onto the min..max range.
 */
500+
static int64
501+
getGaussianRand(TState *thread, int64 min, int64 max, double threshold)
502+
{
503+
double stdev;
504+
double rand;
505+
506+
/*
507+
* Draw a Box-Muller sample in this loop, retrying until
508+
* -threshold <= stdev < threshold
509+
*
510+
* This loop is executed until the number is in the expected range.
511+
*
512+
* As the minimum threshold is 2.0, the probability of looping is low:
513+
* sqrt(-2 ln(r)) <= 2 => r >= e^{-2} ~ 0.135, then when taking the average
514+
* sine multiplier as 2/pi, we have an 8.6% looping probability in the
515+
* worst case. For a 5.0 threshold value, the looping probability
516+
* is about e^{-5} * 2 / pi ~ 0.43%.
517+
*/
518+
do
519+
{
520+
/*
521+
* pg_erand48 generates [0,1), but for the basic version of the
522+
* Box-Muller transform the two uniformly distributed random numbers
523+
* are expected in (0, 1] (see http://en.wikipedia.org/wiki/Box_muller)
524+
*/
525+
double rand1 = 1.0 - pg_erand48(thread->random_state);
526+
double rand2 = 1.0 - pg_erand48(thread->random_state);
527+
528+
/* Box-Muller basic form transform */
529+
double var_sqrt = sqrt(-2.0 * log(rand1));
530+
stdev = var_sqrt * sin(2.0 * M_PI * rand2);
531+
532+
/*
533+
* we may try with cos, but there may be a bias induced if the previous
534+
* value fails the test. To be on the safe side, let us try over.
535+
*/
536+
}
537+
while (stdev < -threshold || stdev >= threshold);
538+
539+
/* stdev is in [-threshold, threshold), normalization to [0,1) */
540+
rand = (stdev + threshold) / (threshold * 2.0);
541+
542+
/* return an int64 random number between min and max inclusive */
543+
return min + (int64)((max - min + 1) * rand);
544+
}
545+
474546
/* call PQexec() and exit() on failure */
475547
static void
476548
executeStatement(PGconn *con, const char *sql)
@@ -1319,6 +1391,7 @@ doCustom(TState *thread, CState *st, instr_time *conn_time, FILE *logfile, AggVa
13191391
char *var;
13201392
int64 min,
13211393
max;
1394+
double threshold = 0;
13221395
char res[64];
13231396

13241397
if (*argv[2] == ':')
@@ -1364,11 +1437,11 @@ doCustom(TState *thread, CState *st, instr_time *conn_time, FILE *logfile, AggVa
13641437
}
13651438

13661439
/*
1367-
* getrand() needs to be able to subtract max from min and add one
1368-
* to the result without overflowing. Since we know max > min, we
1369-
* can detect overflow just by checking for a negative result. But
1370-
* we must check both that the subtraction doesn't overflow, and
1371-
* that adding one to the result doesn't overflow either.
1440+
* The random number generation functions need to be able to subtract
1441+
* min from max and add one to the result without overflowing.
1442+
* Since we know max > min, we can detect overflow just by checking
1443+
* for a negative result. But we must check both that the subtraction
1444+
* doesn't overflow, and that adding one to the result doesn't overflow either.
13721445
*/
13731446
if (max - min < 0 || (max - min) + 1 < 0)
13741447
{
@@ -1377,10 +1450,64 @@ doCustom(TState *thread, CState *st, instr_time *conn_time, FILE *logfile, AggVa
13771450
return true;
13781451
}
13791452

1453+
if (argc == 4 || /* uniform without or with "uniform" keyword */
1454+
(argc == 5 && pg_strcasecmp(argv[4], "uniform") == 0))
1455+
{
1456+
#ifdef DEBUG
1457+
printf("min: " INT64_FORMAT " max: " INT64_FORMAT " random: " INT64_FORMAT "\n", min, max, getrand(thread, min, max));
1458+
#endif
1459+
snprintf(res, sizeof(res), INT64_FORMAT, getrand(thread, min, max));
1460+
}
1461+
else if (argc == 6 &&
1462+
((pg_strcasecmp(argv[4], "gaussian") == 0) ||
1463+
(pg_strcasecmp(argv[4], "exponential") == 0)))
1464+
{
1465+
if (*argv[5] == ':')
1466+
{
1467+
if ((var = getVariable(st, argv[5] + 1)) == NULL)
1468+
{
1469+
fprintf(stderr, "%s: invalid threshold number %s\n", argv[0], argv[5]);
1470+
st->ecnt++;
1471+
return true;
1472+
}
1473+
threshold = strtod(var, NULL);
1474+
}
1475+
else
1476+
threshold = strtod(argv[5], NULL);
1477+
1478+
if (pg_strcasecmp(argv[4], "gaussian") == 0)
1479+
{
1480+
if (threshold < MIN_GAUSSIAN_THRESHOLD)
1481+
{
1482+
fprintf(stderr, "%s: gaussian threshold must be at least %f\n,", argv[5], MIN_GAUSSIAN_THRESHOLD);
1483+
st->ecnt++;
1484+
return true;
1485+
}
1486+
#ifdef DEBUG
1487+
printf("min: " INT64_FORMAT " max: " INT64_FORMAT " random: " INT64_FORMAT "\n", min, max, getGaussianRand(thread, min, max, threshold));
1488+
#endif
1489+
snprintf(res, sizeof(res), INT64_FORMAT, getGaussianRand(thread, min, max, threshold));
1490+
}
1491+
else if (pg_strcasecmp(argv[4], "exponential") == 0)
1492+
{
1493+
if (threshold <= 0.0)
1494+
{
1495+
fprintf(stderr, "%s: exponential threshold must be strictly positive\n,", argv[5]);
1496+
st->ecnt++;
1497+
return true;
1498+
}
13801499
#ifdef DEBUG
1381-
printf("min: " INT64_FORMAT " max: " INT64_FORMAT " random: " INT64_FORMAT "\n", min, max, getrand(thread, min, max));
1500+
printf("min: " INT64_FORMAT " max: " INT64_FORMAT " random: " INT64_FORMAT "\n", min, max, getExponentialRand(thread, min, max, threshold));
13821501
#endif
1383-
snprintf(res, sizeof(res), INT64_FORMAT, getrand(thread, min, max));
1502+
snprintf(res, sizeof(res), INT64_FORMAT, getExponentialRand(thread, min, max, threshold));
1503+
}
1504+
}
1505+
else /* this means an error somewhere in the parsing phase... */
1506+
{
1507+
fprintf(stderr, "%s: unexpected arguments\n", argv[0]);
1508+
st->ecnt++;
1509+
return true;
1510+
}
13841511

13851512
if (!putVariable(st, argv[0], argv[1], res))
13861513
{
@@ -1914,15 +2041,51 @@ process_commands(char *buf)
19142041

19152042
if (pg_strcasecmp(my_commands->argv[0], "setrandom") == 0)
19162043
{
2044+
/* parsing:
2045+
* \setrandom variable min max [uniform]
2046+
* \setrandom variable min max (gaussian|exponential) threshold
2047+
*/
2048+
19172049
if (my_commands->argc < 4)
19182050
{
19192051
fprintf(stderr, "%s: missing argument\n", my_commands->argv[0]);
19202052
exit(1);
19212053
}
2054+
/* argc >= 4 */
19222055

1923-
for (j = 4; j < my_commands->argc; j++)
1924-
fprintf(stderr, "%s: extra argument \"%s\" ignored\n",
1925-
my_commands->argv[0], my_commands->argv[j]);
2056+
if (my_commands->argc == 4 || /* uniform without/with "uniform" keyword */
2057+
(my_commands->argc == 5 &&
2058+
pg_strcasecmp(my_commands->argv[4], "uniform") == 0))
2059+
{
2060+
/* nothing to do */
2061+
}
2062+
else if (/* argc >= 5 */
2063+
(pg_strcasecmp(my_commands->argv[4], "gaussian") == 0) ||
2064+
(pg_strcasecmp(my_commands->argv[4], "exponential") == 0))
2065+
{
2066+
if (my_commands->argc < 6)
2067+
{
2068+
fprintf(stderr, "%s(%s): missing threshold argument\n", my_commands->argv[0], my_commands->argv[4]);
2069+
exit(1);
2070+
}
2071+
else if (my_commands->argc > 6)
2072+
{
2073+
fprintf(stderr, "%s(%s): too many arguments (extra:",
2074+
my_commands->argv[0], my_commands->argv[4]);
2075+
for (j = 6; j < my_commands->argc; j++)
2076+
fprintf(stderr, " %s", my_commands->argv[j]);
2077+
fprintf(stderr, ")\n");
2078+
exit(1);
2079+
}
2080+
}
2081+
else /* cannot parse, unexpected arguments */
2082+
{
2083+
fprintf(stderr, "%s: unexpected arguments (bad:", my_commands->argv[0]);
2084+
for (j = 4; j < my_commands->argc; j++)
2085+
fprintf(stderr, " %s", my_commands->argv[j]);
2086+
fprintf(stderr, ")\n");
2087+
exit(1);
2088+
}
19262089
}
19272090
else if (pg_strcasecmp(my_commands->argv[0], "set") == 0)
19282091
{

doc/src/sgml/pgbench.sgml

Lines changed: 58 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -748,8 +748,8 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
748748

749749
<varlistentry>
750750
<term>
751-
<literal>\setrandom <replaceable>varname</> <replaceable>min</> <replaceable>max</></literal>
752-
</term>
751+
<literal>\setrandom <replaceable>varname</> <replaceable>min</> <replaceable>max</> [ uniform | [ { gaussian | exponential } <replaceable>threshold</> ] ]</literal>
752+
</term>
753753

754754
<listitem>
755755
<para>
@@ -760,10 +760,65 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
760760
having an integer value.
761761
</para>
762762

763+
<para>
764+
By default, or when <literal>uniform</> is specified, all values in the
765+
range are drawn with equal probability. Specifying <literal>gaussian</>
766+
or <literal>exponential</> options modifies this behavior; each
767+
requires a mandatory threshold which determines the precise shape of the
768+
distribution.
769+
</para>
770+
771+
<para>
772+
For a Gaussian distribution, the interval is mapped onto a standard
773+
normal distribution (the classical bell-shaped Gaussian curve) truncated
774+
at <literal>-threshold</> on the left and <literal>+threshold</>
775+
on the right.
776+
To be precise, if <literal>PHI(x)</> is the cumulative distribution
777+
function of the standard normal distribution, and <literal>mu</> is the
778+
midpoint of the interval, <literal>(max + min) / 2.0</>, then value <replaceable>i</>
779+
between <replaceable>min</> and <replaceable>max</> inclusive is drawn
780+
with probability:
781+
<literal>
782+
(PHI(2.0 * threshold * (i - mu + 0.5) / (max - min + 1)) -
783+
PHI(2.0 * threshold * (i - mu - 0.5) / (max - min + 1))) /
784+
(2.0 * PHI(threshold) - 1.0)
785+
</>
786+
Intuitively, the larger the <replaceable>threshold</>, the more
787+
frequently values close to the middle of the interval are drawn, and the
788+
less frequently values close to the <replaceable>min</> and
789+
<replaceable>max</> bounds.
790+
About 67% of values are drawn from the middle <literal>1.0 / threshold</>
791+
and 95% in the middle <literal>2.0 / threshold</>; for instance, if
792+
<replaceable>threshold</> is 4.0, 67% of values are drawn from the middle
793+
quarter and 95% from the middle half of the interval.
794+
The minimum <replaceable>threshold</> is 2.0 for performance of
795+
the Box-Muller transform.
796+
</para>
797+
798+
<para>
799+
For an exponential distribution, the <replaceable>threshold</>
800+
parameter controls the distribution by truncating a quickly-decreasing
801+
exponential distribution at <replaceable>threshold</>, and then
802+
projecting onto integers between the bounds.
803+
To be precise, value <replaceable>i</> between <replaceable>min</> and
804+
<replaceable>max</> inclusive is drawn with probability:
805+
<literal>(exp(-threshold*(i-min)/(max+1-min)) -
806+
exp(-threshold*(i+1-min)/(max+1-min))) / (1.0 - exp(-threshold))</>.
807+
Intuitively, the larger the <replaceable>threshold</>, the more
808+
frequently values close to <replaceable>min</> are accessed, and the
809+
less frequently values close to <replaceable>max</> are accessed.
810+
The closer to 0 the threshold, the flatter (more uniform) the access
811+
distribution.
812+
A crude approximation of the distribution is that the most frequent 1% of
813+
values in the range, close to <replaceable>min</>, are drawn
814+
<replaceable>threshold</>% of the time.
815+
The <replaceable>threshold</> value must be strictly positive.
816+
</para>
817+
763818
<para>
764819
Example:
765820
<programlisting>
766-
\setrandom aid 1 :naccounts
821+
\setrandom aid 1 :naccounts gaussian 5.0
767822
</programlisting></para>
768823
</listitem>
769824
</varlistentry>

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy