From 42625ecda28458e7446a83130135fdd061f21832 Mon Sep 17 00:00:00 2001
From: Joe Conway
Date: Sun, 29 Jun 2025 23:00:00 -0400
Subject: [PATCH 001/138] Adapt REL_18_STABLE to its new status as a stable
 branch

Per the checklist in RELEASE_CHANGES for the creation of a new stable
branch, this commit does the following things:

- Arm gen_node_support.pl's nodetag ABI stability, based on the
  contents of nodetags.h.

- Update URLs of top-level README and Makefile to point to the new
  stable version.
---
 Makefile                              | 2 +-
 README.md                             | 4 ++--
 src/backend/nodes/gen_node_support.pl | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index 8a2ec9396b6b4..b363b2f24766b 100644
--- a/Makefile
+++ b/Makefile
@@ -20,7 +20,7 @@ all:
 all check install installdirs installcheck installcheck-parallel uninstall clean distclean maintainer-clean dist distcheck world check-world install-world installcheck-world:
 	@if [ ! -f GNUmakefile ] ; then \
 	   echo "You need to run the 'configure' program first. Please see"; \
-	   echo "<https://www.postgresql.org/docs/devel/installation.html>" ; \
+	   echo "<https://www.postgresql.org/docs/18/installation.html>" ; \
 	   false ; \
 	 fi
 	@IFS=':' ; \
diff --git a/README.md b/README.md
index f6104c038b3d5..7352a90a72373 100644
--- a/README.md
+++ b/README.md
@@ -12,9 +12,9 @@ and functions. This distribution also contains C language bindings.
 Copyright and license information can be found in the file COPYRIGHT.
 
 General documentation about this version of PostgreSQL can be found at
-<https://www.postgresql.org/docs/devel/>. In particular, information
+<https://www.postgresql.org/docs/18/>. In particular, information
 about building PostgreSQL from the source code can be found at
-<https://www.postgresql.org/docs/devel/installation.html>.
+<https://www.postgresql.org/docs/18/installation.html>.
 
 The latest version of this software, and related software, may be
 obtained at <https://www.postgresql.org/download/>. For more information
diff --git a/src/backend/nodes/gen_node_support.pl b/src/backend/nodes/gen_node_support.pl
index 9ecddb1423143..214d2c70d60db 100644
--- a/src/backend/nodes/gen_node_support.pl
+++ b/src/backend/nodes/gen_node_support.pl
@@ -107,8 +107,8 @@ sub elem
 # In HEAD, these variables should be left undef, since we don't promise
 # ABI stability during development.
-my $last_nodetag = undef;
-my $last_nodetag_no = undef;
+my $last_nodetag = 'WindowObjectData';
+my $last_nodetag_no = 479;
 
 # output file names
 my @output_files;

From b2a57747ba047844c0e7bf0acb7c75f84fc34989 Mon Sep 17 00:00:00 2001
From: Daniel Gustafsson
Date: Mon, 30 Jun 2025 10:12:31 +0200
Subject: [PATCH 002/138] doc: Fix typo in pg_sync_replication_slots
 documentation

Commit 1546e17f9d0 accidentally misspelled additionally as
additionaly.

Backpatch to v17 to match where the original commit was backpatched.

Author: Daniel Gustafsson
Backpatch-through: 17
---
 doc/src/sgml/func.sgml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 224d4fe5a9f95..298791858be30 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -29981,7 +29981,7 @@ postgres=# SELECT '0/0'::pg_lsn + pd.segment_number * ps.setting::int + :offset
     logical decoding and must be dropped after promotion. See
     for details.
     Note that this function is primarily intended for testing and
-    debugging purposes and should be used with caution. Additionaly,
Additionally, this function cannot be executed if sync_replication_slots is enabled and the slotsync From 95163cbe111cd75121482281492f8db5df78d31f Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Mon, 30 Jun 2025 10:20:14 -0400 Subject: [PATCH 003/138] aio: Fix reference to outdated name Reported-by: Antonin Houska Author: Antonin Houska Discussion: https://postgr.es/m/5250.1751266701@localhost Backpatch-through: 18, where da7226993fd4 introduced this --- src/include/storage/aio_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/storage/aio_types.h b/src/include/storage/aio_types.h index 181833660778e..afee85c787b44 100644 --- a/src/include/storage/aio_types.h +++ b/src/include/storage/aio_types.h @@ -107,7 +107,7 @@ typedef struct PgAioResult /* of type PgAioResultStatus, see above */ uint32 status:PGAIO_RESULT_STATUS_BITS; - /* meaning defined by callback->error */ + /* meaning defined by callback->report */ uint32 error_data:PGAIO_RESULT_ERROR_BITS; int32 result; From eb37fe716a477ee10434c320d7e7c23b79337922 Mon Sep 17 00:00:00 2001 From: Amit Langote Date: Tue, 1 Jul 2025 13:13:40 +0900 Subject: [PATCH 004/138] Fix typos in comments Commit 19d8e2308bc added enum values with the prefix TU_, but a few comments still referred to TUUI_, which was used in development versions of the patches committed as 19d8e2308bc. Author: Yugo Nagata Discussion: https://postgr.es/m/20250701110216.8ac8a9e4c6f607f1d954f44a@sraoss.co.jp Backpatch-through: 16 --- src/backend/executor/execIndexing.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index bdf862b24062e..ca33a854278ed 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -279,7 +279,7 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo) * executor is performing an UPDATE that could not use an * optimization like heapam's HOT (in more general terms a * call to table_tuple_update() took place and set - * 'update_indexes' to TUUI_All). Receiving this hint makes + * 'update_indexes' to TU_All). Receiving this hint makes * us consider if we should pass down the 'indexUnchanged' * hint in turn. That's something that we figure out for * each index_insert() call iff 'update' is true. @@ -290,7 +290,7 @@ ExecCloseIndices(ResultRelInfo *resultRelInfo) * HOT has been applied and any updated columns are indexed * only by summarizing indexes (or in more general terms a * call to table_tuple_update() took place and set - * 'update_indexes' to TUUI_Summarizing). We can (and must) + * 'update_indexes' to TU_Summarizing). We can (and must) * therefore only update the indexes that have * 'amsummarizing' = true. * From 45879f48f140538f95794c1537390a058c5ebe47 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Tue, 1 Jul 2025 12:02:31 +0200 Subject: [PATCH 005/138] Limit the size of numa_move_pages requests There's a kernel bug in do_pages_stat(), affecting systems combining 64-bit kernel and 32-bit user space. The function splits the request into chunks of 16 pointers, but forgets the pointers are 32-bit when advancing to the next chunk. Some of the pointers get skipped, and memory after the array is interpreted as pointers. The result is that the produced status of memory pages is mostly bogus. Systems combining 64-bit and 32-bit environments like this might seem rare, but that's not the case - all 32-bit Debian packages are built in a 32-bit chroot on a system with a 64-bit kernel. 
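
As an aside (an illustration, not part of the patch), the arithmetic
behind the bug can be sketched in a few lines of C. The kernel walks
the user-supplied array with a stride computed from its own 8-byte
pointers, while a 32-bit process supplied 4-byte pointers:

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical simplification of the do_pages_stat() cursor bug. */
    int
    main(void)
    {
        /* One "16 pointer" step, measured in 8-byte kernel pointers,
         * spans 32 of the 4-byte pointers a 32-bit process passed in,
         * so half of every chunk is skipped and subsequent reads run
         * past the end of the array. */
        printf("user entries skipped per chunk: %zu\n",
               16 * sizeof(uint64_t) / sizeof(uint32_t));
        return 0;
    }
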
This is a long-standing kernel bug (since 2010), affecting pretty much
all kernels, so it'll take time until all systems get a fixed kernel.

Luckily, we can work around the issue by chunking the requests the
same way do_pages_stat() does, at least on affected systems. We don't
know what kernel a 32-bit build will run on, so all 32-bit builds use
chunks of 16 elements (the largest chunk before hitting the issue).

64-bit builds are not affected by this issue, and so could work
without the chunking. But chunking has other advantages, so we apply
chunking even for 64-bit builds, with chunks of 1024 elements.

Reported-by: Christoph Berg
Author: Christoph Berg
Author: Bertrand Drouvot
Discussion: https://postgr.es/m/aEtDozLmtZddARdB@msg.df7cb.de
Context: https://marc.info/?l=linux-mm&m=175077821909222&w=2
Backpatch-through: 18
---
 src/port/pg_numa.c | 50 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/src/port/pg_numa.c b/src/port/pg_numa.c
index 4b487a2a4e814..d5935207d0a13 100644
--- a/src/port/pg_numa.c
+++ b/src/port/pg_numa.c
@@ -29,6 +29,19 @@
 #include <numa.h>
 #include <numaif.h>
 
+/*
+ * numa_move_pages() chunk size, has to be <= 16 to work around a kernel bug
+ * in do_pages_stat() (chunked by DO_PAGES_STAT_CHUNK_NR). By using the same
+ * chunk size, we make it work even on unfixed kernels.
+ *
+ * 64-bit systems are not affected by the bug, and so use much larger chunks.
+ */
+#if SIZEOF_SIZE_T == 4
+#define NUMA_QUERY_CHUNK_SIZE 16
+#else
+#define NUMA_QUERY_CHUNK_SIZE 1024
+#endif
+
 /* libnuma requires initialization as per numa(3) on Linux */
 int
 pg_numa_init(void)
@@ -42,11 +55,46 @@ pg_numa_init(void)
  * We use move_pages(2) syscall here - instead of get_mempolicy(2) - as the
  * first one allows us to batch and query about many memory pages in one single
  * giant system call that is way faster.
+ *
+ * We call numa_move_pages() for smaller chunks of the whole array. The first
+ * reason is to work around a kernel bug, but also to allow interrupting the
+ * query between the calls (for many pointers processing the whole array can
+ * take a lot of time).
  */
 int
 pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status)
 {
-	return numa_move_pages(pid, count, pages, NULL, status, 0);
+	unsigned long next = 0;
+	int			ret = 0;
+
+	/*
+	 * Chunk pointers passed to numa_move_pages to NUMA_QUERY_CHUNK_SIZE
+	 * items, to work around a kernel bug in do_pages_stat().
+	 */
+	while (next < count)
+	{
+		unsigned long count_chunk = Min(count - next,
+										NUMA_QUERY_CHUNK_SIZE);
+
+		/*
+		 * Bail out if any of the chunks errors out (ret<0). We ignore
+		 * (ret>0) which is used to return number of nonmigrated pages,
+		 * but we're not migrating any pages here.
+		 */
+		ret = numa_move_pages(pid, count_chunk, &pages[next], NULL, &status[next], 0);
+		if (ret < 0)
+		{
+			/* plain error, return as is */
+			return ret;
+		}
+
+		next += count_chunk;
+	}
+
+	/* should have consumed the input array exactly */
+	Assert(next == count);
+
+	return 0;
 }
 
 int

From 14e52227e57885d1a983d9f2515b569d3180c93d Mon Sep 17 00:00:00 2001
From: Tomas Vondra
Date: Tue, 1 Jul 2025 12:32:23 +0200
Subject: [PATCH 006/138] Silence valgrind about pg_numa_touch_mem_if_required

When querying NUMA status of pages in shared memory, we need to touch
the memory first to get valid results. This may trigger valgrind
reports, because some of the memory (e.g. unpinned buffers) may be
marked as noaccess.

Solved by adding a valgrind suppression.
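
For background on the fix chosen here (an illustration with
hypothetical names, not patch code): Memcheck suppressions match
"fun:" frames from the error's backtrace, so an access made inside a
named function can be suppressed precisely, while a macro expands into
its caller and leaves no frame of its own:

    #include <stdint.h>

    /* As a macro, the flagged load is attributed to whichever caller
     * expands it, so each caller would need its own suppression: */
    #define TOUCH_MEM(var, ptr)  ((var) = *(volatile uint64_t *) (ptr))

    /* As a function, the load owns a stack frame, and one suppression
     * rule of the form "fun:touch_mem" matches it and nothing else: */
    static inline void
    touch_mem(const void *ptr)
    {
        volatile uint64_t touch;

        touch = *(const volatile uint64_t *) ptr;  /* fault the page in */
        (void) touch;
    }
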
An alternative would be to adjust the access/noaccess status before
touching the memory, but that seems far too invasive. It would require
all those places to have detailed knowledge of what the shared memory
stores.

The pg_numa_touch_mem_if_required() macro is replaced with a function.
Macros are invisible to suppressions, so it'd have to suppress reports
for the caller - e.g. pg_get_shmem_allocations_numa(). So we'd
suppress reports for the whole function, and that seems too
heavy-handed. It might easily hide other valid issues.

Reviewed-by: Christoph Berg
Reviewed-by: Bertrand Drouvot
Discussion: https://postgr.es/m/aEtDozLmtZddARdB@msg.df7cb.de
Backpatch-through: 18
---
 contrib/pg_buffercache/pg_buffercache_pages.c |  3 +--
 src/backend/storage/ipc/shmem.c               |  4 +---
 src/include/port/pg_numa.h                    | 10 +++++++---
 src/tools/valgrind.supp                       | 14 ++++++++++++++
 4 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 4b007f6e1b06a..ae0291e6e96df 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -320,7 +320,6 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
 		uint64		os_page_count;
 		int			pages_per_buffer;
 		int			max_entries;
-		volatile uint64 touch pg_attribute_unused();
 		char	   *startptr,
 				   *endptr;
 
@@ -375,7 +374,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
 
 			/* Only need to touch memory once per backend process lifetime */
 			if (firstNumaTouch)
-				pg_numa_touch_mem_if_required(touch, ptr);
+				pg_numa_touch_mem_if_required(ptr);
 		}
 
 		Assert(idx == os_page_count);
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index c9ae3b45b76b1..ca3656fc76f43 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -679,12 +679,10 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
 		 */
 		for (i = 0; i < shm_ent_page_count; i++)
 		{
-			volatile uint64 touch pg_attribute_unused();
-
 			page_ptrs[i] = startptr + (i * os_page_size);
 
 			if (firstNumaTouch)
-				pg_numa_touch_mem_if_required(touch, page_ptrs[i]);
+				pg_numa_touch_mem_if_required(page_ptrs[i]);
 
 			CHECK_FOR_INTERRUPTS();
 		}
diff --git a/src/include/port/pg_numa.h b/src/include/port/pg_numa.h
index 40f1d324dcfe2..6c8b7103cc344 100644
--- a/src/include/port/pg_numa.h
+++ b/src/include/port/pg_numa.h
@@ -24,12 +24,16 @@ extern PGDLLIMPORT int pg_numa_get_max_node(void);
  * This is required on Linux, before pg_numa_query_pages() as we
 * need to page-fault before move_pages(2) syscall returns valid results.
 */
-#define pg_numa_touch_mem_if_required(ro_volatile_var, ptr) \
-	ro_volatile_var = *(volatile uint64 *) ptr
+static inline void
+pg_numa_touch_mem_if_required(void *ptr)
+{
+	volatile uint64 touch pg_attribute_unused();
+	touch = *(volatile uint64 *) ptr;
+}
 
 #else
 
-#define pg_numa_touch_mem_if_required(ro_volatile_var, ptr) \
+#define pg_numa_touch_mem_if_required(ptr) \
 	do {} while(0)
 
 #endif
diff --git a/src/tools/valgrind.supp b/src/tools/valgrind.supp
index 7ea464c809417..2ad5b81526d3f 100644
--- a/src/tools/valgrind.supp
+++ b/src/tools/valgrind.supp
@@ -180,3 +180,17 @@
 	Memcheck:Cond
 	fun:PyObject_Realloc
 }
+
+# NUMA introspection requires touching memory first, and some of it may
+# be marked as noaccess (e.g. unpinned buffers). So just ignore that.
+{ + pg_numa_touch_mem_if_required + Memcheck:Addr4 + fun:pg_numa_touch_mem_if_required +} + +{ + pg_numa_touch_mem_if_required + Memcheck:Addr8 + fun:pg_numa_touch_mem_if_required +} From 54ac4944c36f8f6cfc4deaa3f828118b564e1d3d Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Tue, 1 Jul 2025 12:58:35 +0200 Subject: [PATCH 007/138] Add CHECK_FOR_INTERRUPTS into pg_numa_query_pages Querying the NUMA status can be quite time consuming, especially with large shared buffers. 8cc139bec34a called numa_move_pages() once, for all buffers, and we had to wait for the syscall to complete. But with the chunking, introduced by 7fe2f67c7c to work around a kernel bug, we can do CHECK_FOR_INTERRUPTS() after each chunk, allowing users to abort the execution. Reviewed-by: Christoph Berg Reviewed-by: Bertrand Drouvot Discussion: https://postgr.es/m/aEtDozLmtZddARdB@msg.df7cb.de Backpatch-through: 18 --- src/port/pg_numa.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/port/pg_numa.c b/src/port/pg_numa.c index d5935207d0a13..c65f22020ea51 100644 --- a/src/port/pg_numa.c +++ b/src/port/pg_numa.c @@ -16,6 +16,7 @@ #include "c.h" #include +#include "miscadmin.h" #include "port/pg_numa.h" /* @@ -76,6 +77,8 @@ pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status) unsigned long count_chunk = Min(count - next, NUMA_QUERY_CHUNK_SIZE); + CHECK_FOR_INTERRUPTS(); + /* * Bail out if any of the chunks errors out (ret<0). We ignore * (ret>0) which is used to return number of nonmigrated pages, From 07448b3969d55a2081cdafafc23f68df3392f220 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Tue, 1 Jul 2025 15:20:26 +0200 Subject: [PATCH 008/138] Fix indentation in pg_numa code Broken by commits 7fe2f67c7c9f, 81f287dc923f and bf1119d74a79. Backpatch to 18, same as the offending commits. Backpatch-through: 18 --- src/include/port/pg_numa.h | 1 + src/port/pg_numa.c | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/include/port/pg_numa.h b/src/include/port/pg_numa.h index 6c8b7103cc344..9d1ea6d0db89a 100644 --- a/src/include/port/pg_numa.h +++ b/src/include/port/pg_numa.h @@ -28,6 +28,7 @@ static inline void pg_numa_touch_mem_if_required(void *ptr) { volatile uint64 touch pg_attribute_unused(); + touch = *(volatile uint64 *) ptr; } diff --git a/src/port/pg_numa.c b/src/port/pg_numa.c index c65f22020ea51..3368a43a33826 100644 --- a/src/port/pg_numa.c +++ b/src/port/pg_numa.c @@ -65,8 +65,8 @@ pg_numa_init(void) int pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status) { - unsigned long next = 0; - int ret = 0; + unsigned long next = 0; + int ret = 0; /* * Chunk pointers passed to numa_move_pages to NUMA_QUERY_CHUNK_SIZE @@ -80,9 +80,9 @@ pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status) CHECK_FOR_INTERRUPTS(); /* - * Bail out if any of the chunks errors out (ret<0). We ignore - * (ret>0) which is used to return number of nonmigrated pages, - * but we're not migrating any pages here. + * Bail out if any of the chunks errors out (ret<0). We ignore (ret>0) + * which is used to return number of nonmigrated pages, but we're not + * migrating any pages here. */ ret = numa_move_pages(pid, count_chunk, &pages[next], NULL, &status[next], 0); if (ret < 0) From 45c5276628d129d6adec68b25b61daadf8476783 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 1 Jul 2025 12:08:20 -0400 Subject: [PATCH 009/138] Make safeguard against incorrect flags for fsync more portable. 
The existing code assumed that O_RDONLY is defined as 0, but this is
not required by POSIX and is not true on GNU Hurd. We can avoid the
assumption by relying on O_ACCMODE to mask the fcntl() result.
(Hopefully, all supported platforms define that.)

Author: Michael Banck
Co-authored-by: Samuel Thibault
Reviewed-by: Tom Lane
Discussion: https://postgr.es/m/6862e8d1.050a0220.194b8d.76fa@mx.google.com
Discussion: https://postgr.es/m/68480868.5d0a0220.1e214d.68a6@mx.google.com
Backpatch-through: 13
---
 src/backend/storage/file/fd.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 0e8299dd55646..a4ec7959f31cf 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -400,25 +400,22 @@ pg_fsync(int fd)
 	 * portable, even if it runs ok on the current system.
 	 *
 	 * We assert here that a descriptor for a file was opened with write
-	 * permissions (either O_RDWR or O_WRONLY) and for a directory without
-	 * write permissions (O_RDONLY).
+	 * permissions (i.e., not O_RDONLY) and for a directory without write
+	 * permissions (O_RDONLY). Notice that the assertion check is made even
+	 * if fsync() is disabled.
 	 *
-	 * Ignore any fstat errors and let the follow-up fsync() do its work.
-	 * Doing this sanity check here counts for the case where fsync() is
-	 * disabled.
+	 * If fstat() fails, ignore it and let the follow-up fsync() complain.
 	 */
 	if (fstat(fd, &st) == 0)
 	{
 		int			desc_flags = fcntl(fd, F_GETFL);
 
-		/*
-		 * O_RDONLY is historically 0, so just make sure that for directories
-		 * no write flags are used.
-		 */
+		desc_flags &= O_ACCMODE;
+
 		if (S_ISDIR(st.st_mode))
-			Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
+			Assert(desc_flags == O_RDONLY);
 		else
-			Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
+			Assert(desc_flags != O_RDONLY);
 	}
 	errno = 0;
 #endif

From b71351e1f2cb2687535cdc59370d6b908a842e18 Mon Sep 17 00:00:00 2001
From: Tom Lane
Date: Tue, 1 Jul 2025 12:40:35 -0400
Subject: [PATCH 010/138] Make sure IOV_MAX is defined.

We stopped defining IOV_MAX on non-Windows systems in 75357ab94, on the
assumption that every non-Windows system defines it in <limits.h> as
required by X/Open. GNU Hurd, however, doesn't follow that standard
either. Put back the old logic to assume 16 if it's not defined.

Author: Michael Banck
Co-authored-by: Christoph Berg
Reviewed-by: Tom Lane
Discussion: https://postgr.es/m/6862e8d1.050a0220.194b8d.76fa@mx.google.com
Discussion: https://postgr.es/m/6846e0c3.df0a0220.39ef9b.c60e@mx.google.com
Backpatch-through: 16
---
 src/include/port/pg_iovec.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/include/port/pg_iovec.h b/src/include/port/pg_iovec.h
index df40c7208be48..90be3af449d6f 100644
--- a/src/include/port/pg_iovec.h
+++ b/src/include/port/pg_iovec.h
@@ -21,9 +21,6 @@
 
 #else
 
-/* POSIX requires at least 16 as a maximum iovcnt. */
-#define IOV_MAX 16
-
 /* Define our own POSIX-compatible iovec struct. */
 struct iovec
 {
@@ -33,6 +30,15 @@ struct iovec
 
 #endif
 
+/*
+ * If <limits.h> didn't define IOV_MAX, define our own. X/Open requires at
+ * least 16. (GNU Hurd apparently feels that they're not bound by X/Open,
+ * because they don't define this symbol at all.)
+ */
+#ifndef IOV_MAX
+#define IOV_MAX 16
+#endif
+
 /*
 * Define a reasonable maximum that is safe to use on the stack in arrays of
 * struct iovec and other small types.
The operating system could limit us to From b71351e1f2cb2687535cdc59370d6b908a842e18 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Tue, 1 Jul 2025 20:12:36 +0200 Subject: [PATCH 011/138] Fix outdated comment for IndexInfo Commit 78416235713 removed the ii_OpclassOptions field, but the comment was not updated. Author: Japin Li Reviewed-by: Richard Guo Discussion: https://www.postgresql.org/message-id/flat/ME0P300MB04453E6C7EA635F0ECF41BFCB6832%40ME0P300MB0445.AUSP300.PROD.OUTLOOK.COM --- src/include/nodes/execnodes.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 2492282213ff3..fdf79aa18a237 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -173,7 +173,6 @@ typedef struct ExprState * UniqueProcs * UniqueStrats * Unique is it a unique index? - * OpclassOptions opclass-specific options, or NULL if none * ReadyForInserts is it valid for inserts? * CheckedUnchanged IndexUnchanged status determined yet? * IndexUnchanged aminsert hint, cached for retail inserts From 399997d8ccace888b84887ed1a0242a6e745d1a9 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Tue, 1 Jul 2025 20:37:24 +0200 Subject: [PATCH 012/138] Update comment for IndexInfo.ii_WithoutOverlaps Commit fc0438b4e80 added the ii_WithoutOverlaps field, but the comment was not updated. Author: Japin Li Reviewed-by: Richard Guo Discussion: https://www.postgresql.org/message-id/flat/ME0P300MB04453E6C7EA635F0ECF41BFCB6832%40ME0P300MB0445.AUSP300.PROD.OUTLOOK.COM --- src/include/nodes/execnodes.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index fdf79aa18a237..09ea5998aadfe 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -178,6 +178,7 @@ typedef struct ExprState * IndexUnchanged aminsert hint, cached for retail inserts * Concurrent are we doing a concurrent index build? * BrokenHotChain did we detect any broken HOT chains? + * WithoutOverlaps is it a WITHOUT OVERLAPS index? * Summarizing is it a summarizing index? * ParallelWorkers # of workers requested (excludes leader) * Am Oid of index AM From c8b9f75111aa8dbc3dd6dea672d72d62fd4bf5cf Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Tue, 1 Jul 2025 13:54:38 -0500 Subject: [PATCH 013/138] Document pg_get_multixact_members(). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Oversight in commit 0ac5ad5134. Author: Sami Imseih Co-authored-by: Álvaro Herrera Reviewed-by: Ashutosh Bapat Discussion: https://postgr.es/m/20150619215231.GT133018%40postgresql.org Discussion: https://postgr.es/m/CAA5RZ0sjQDDwJfMRb%3DZ13nDLuRpF13ME2L_BdGxi0op8RKjmDg%40mail.gmail.com Backpatch-through: 13 --- doc/src/sgml/func.sgml | 28 +++++++++++++++++++++++++++- doc/src/sgml/maintenance.sgml | 5 ++++- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 298791858be30..c017e2acfd5be 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -27687,6 +27687,31 @@ acl | {postgres=arwdDxtm/postgres,foo=r/postgres} details. + + + + + pg_get_multixact_members + + pg_get_multixact_members ( multixid xid ) + setof record + ( xid xid, + mode text ) + + + Returns the transaction ID and lock mode for each member of the + specified multixact ID. 
The lock modes forupd, + fornokeyupd, sh, and + keysh correspond to the row-level locks + FOR UPDATE, FOR NO KEY UPDATE, + FOR SHARE, and FOR KEY SHARE, + respectively, as described in . Two + additional modes are specific to multixacts: + nokeyupd, used by updates that do not modify key + columns, and upd, used by updates or deletes that + modify key columns. + + @@ -27695,7 +27720,8 @@ acl | {postgres=arwdDxtm/postgres,foo=r/postgres} The internal transaction ID type xid is 32 bits wide and wraps around every 4 billion transactions. However, the functions shown in , except - age and mxid_age, use a + age, mxid_age, and + pg_get_multixact_members, use a 64-bit type xid8 that does not wrap around during the life of an installation and can be converted to xid by casting if required; see for details. diff --git a/doc/src/sgml/maintenance.sgml b/doc/src/sgml/maintenance.sgml index 600e4b3f2f3b8..e7a9f58c01582 100644 --- a/doc/src/sgml/maintenance.sgml +++ b/doc/src/sgml/maintenance.sgml @@ -779,7 +779,10 @@ HINT: Execute a database-wide VACUUM in that database. careful aging management, storage cleanup, and wraparound handling. There is a separate storage area which holds the list of members in each multixact, which also uses a 32-bit counter and which must also - be managed. + be managed. The system function + pg_get_multixact_members() described in + can be used to examine the + transaction IDs associated with a multixact ID. From 3386b2fe7af98159b0c7bc3f0f03fcb524d98cd6 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Tue, 1 Jul 2025 14:35:59 -0500 Subject: [PATCH 014/138] Add commit 07448b3969 to .git-blame-ignore-revs. --- .git-blame-ignore-revs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 8048afd1a80fa..cf30711d616be 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -14,6 +14,9 @@ # # $ git log --pretty=format:"%H # %cd%n# %s" $PGINDENTGITHASH -1 --date=iso +07448b3969d55a2081cdafafc23f68df3392f220 # 2025-07-01 15:24:19 +0200 +# Fix indentation in pg_numa code + b27644bade0348d0dafd3036c47880a349fe9332 # 2025-06-15 13:04:24 -0400 # Sync typedefs.list with the buildfarm. From b897a58556d8c29366ae980d65bf5e90daf7098e Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Tue, 1 Jul 2025 22:15:26 +0200 Subject: [PATCH 015/138] Update comment for IndexInfo.ii_NullsNotDistinct Commit 7a7b3e11e61 added the ii_NullsNotDistinct field, but the comment was not updated. Author: Japin Li Reviewed-by: Richard Guo Discussion: https://www.postgresql.org/message-id/flat/ME0P300MB04453E6C7EA635F0ECF41BFCB6832%40ME0P300MB0445.AUSP300.PROD.OUTLOOK.COM --- src/include/nodes/execnodes.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 09ea5998aadfe..f1520d1f31a6c 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -173,6 +173,7 @@ typedef struct ExprState * UniqueProcs * UniqueStrats * Unique is it a unique index? + * NullsNotDistinct is NULLS NOT DISTINCT? * ReadyForInserts is it valid for inserts? * CheckedUnchanged IndexUnchanged status determined yet? 
 *		IndexUnchanged		aminsert hint, cached for retail inserts

From d09d1379346eac4b3aa45ee92a5a96bd315bfa4b Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Wed, 2 Jul 2025 13:48:41 +0900
Subject: [PATCH 016/138] Fix bug in archive streamer with LZ4 decompression

When decompressing some input data, the calculations for the initial
starting point and the initial size were incorrect, potentially
leading to failures when decompressing contents with LZ4. These
initialization points are fixed in this commit, bringing the logic
closer to what exists for gzip and zstd.

The contents of the compressed data are unaffected (for example,
backups taken with LZ4 can still be decompressed with a "lz4"
command); only the decompression part reading the input data was
impacted by this issue. This code path impacts pg_basebackup and
pg_verifybackup, which can use the LZ4 decompression routines with an
archive streamer, or any tools that try to use the archive streamers
in src/fe_utils/.

The issue is easier to reproduce with files that have a
low-compression rate, like ones filled with random data, for a size of
at least 512kB, but this could happen with anything as long as it is
stored in a data folder. Some tests are added based on this idea, with
a file filled with random bytes grabbed from the backend, written at
the root of the data folder. This is proving good enough to reproduce
the original problem.

Author: Mikhail Gribkov
Discussion: https://postgr.es/m/CAMEv5_uQS1Hg6KCaEP2JkrTBbZ-nXQhxomWrhYQvbdzR-zy-wA@mail.gmail.com
Backpatch-through: 15
---
 src/bin/pg_verifybackup/t/008_untar.pl        | 22 +++++++++++++++++++
 src/bin/pg_verifybackup/t/010_client_untar.pl | 22 +++++++++++++++++++
 src/fe_utils/astreamer_lz4.c                  |  4 ++--
 3 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/src/bin/pg_verifybackup/t/008_untar.pl b/src/bin/pg_verifybackup/t/008_untar.pl
index deed3ec247d2d..bc3d6b352ad50 100644
--- a/src/bin/pg_verifybackup/t/008_untar.pl
+++ b/src/bin/pg_verifybackup/t/008_untar.pl
@@ -16,6 +16,22 @@
 $primary->init(allows_streaming => 1);
 $primary->start;
 
+# Create file with some random data and an arbitrary size, useful to check
+# the solidity of the compression and decompression logic. The size of the
+# file is chosen to be around 640kB. This has proven to be large enough to
+# detect some issues related to LZ4, and low enough to not impact the runtime
+# of the test significantly.
+my $junk_data = $primary->safe_psql(
+	'postgres', qq(
+		SELECT string_agg(encode(sha256(i::bytea), 'hex'), '')
+		FROM generate_series(1, 10240) s(i);));
+my $data_dir = $primary->data_dir;
+my $junk_file = "$data_dir/junk";
+open my $jf, '>', $junk_file
+  or die "Could not create junk file: $!";
+print $jf $junk_data;
+close $jf;
+
 # Create a tablespace directory.
my $source_ts_path = PostgreSQL::Test::Utils::tempdir_short(); @@ -52,6 +68,12 @@ 'backup_archive' => [ 'base.tar.lz4', "$tsoid.tar.lz4" ], 'enabled' => check_pg_config("#define USE_LZ4 1") }, + { + 'compression_method' => 'lz4', + 'backup_flags' => [ '--compress', 'server-lz4:5' ], + 'backup_archive' => [ 'base.tar.lz4', "$tsoid.tar.lz4" ], + 'enabled' => check_pg_config("#define USE_LZ4 1") + }, { 'compression_method' => 'zstd', 'backup_flags' => [ '--compress', 'server-zstd' ], diff --git a/src/bin/pg_verifybackup/t/010_client_untar.pl b/src/bin/pg_verifybackup/t/010_client_untar.pl index d8d2b06c7ee86..b62faeb5acfab 100644 --- a/src/bin/pg_verifybackup/t/010_client_untar.pl +++ b/src/bin/pg_verifybackup/t/010_client_untar.pl @@ -15,6 +15,22 @@ $primary->init(allows_streaming => 1); $primary->start; +# Create file with some random data and an arbitrary size, useful to check +# the solidity of the compression and decompression logic. The size of the +# file is chosen to be around 640kB. This has proven to be large enough to +# detect some issues related to LZ4, and low enough to not impact the runtime +# of the test significantly. +my $junk_data = $primary->safe_psql( + 'postgres', qq( + SELECT string_agg(encode(sha256(i::bytea), 'hex'), '') + FROM generate_series(1, 10240) s(i);)); +my $data_dir = $primary->data_dir; +my $junk_file = "$data_dir/junk"; +open my $jf, '>', $junk_file + or die "Could not create junk file: $!"; +print $jf $junk_data; +close $jf; + my $backup_path = $primary->backup_dir . '/client-backup'; my $extract_path = $primary->backup_dir . '/extracted-backup'; @@ -37,6 +53,12 @@ 'backup_archive' => 'base.tar.lz4', 'enabled' => check_pg_config("#define USE_LZ4 1") }, + { + 'compression_method' => 'lz4', + 'backup_flags' => [ '--compress', 'client-lz4:1' ], + 'backup_archive' => 'base.tar.lz4', + 'enabled' => check_pg_config("#define USE_LZ4 1") + }, { 'compression_method' => 'zstd', 'backup_flags' => [ '--compress', 'client-zstd:5' ], diff --git a/src/fe_utils/astreamer_lz4.c b/src/fe_utils/astreamer_lz4.c index 781aaf99f38fe..5f581d1de3769 100644 --- a/src/fe_utils/astreamer_lz4.c +++ b/src/fe_utils/astreamer_lz4.c @@ -322,9 +322,9 @@ astreamer_lz4_decompressor_content(astreamer *streamer, mystreamer = (astreamer_lz4_frame *) streamer; next_in = (uint8 *) data; - next_out = (uint8 *) mystreamer->base.bbs_buffer.data; + next_out = (uint8 *) mystreamer->base.bbs_buffer.data + mystreamer->bytes_written; avail_in = len; - avail_out = mystreamer->base.bbs_buffer.maxlen; + avail_out = mystreamer->base.bbs_buffer.maxlen - mystreamer->bytes_written; while (avail_in > 0) { From 3e73d8735371fc20a13fb4b68fc643909749dde0 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Wed, 2 Jul 2025 11:51:10 +0700 Subject: [PATCH 017/138] Remove implicit cast from 'void *' Commit e2809e3a101 added code to a header which assigns a pointer to void to a pointer to unsigned char. This causes build errors for extensions written in C++. Fix by adding an explicit cast. 
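
The language rule behind this fix can be shown in isolation; this toy
function (hypothetical, not from the tree) is valid C, but a C++
compiler rejects the first, commented-out assignment because C++ has
no implicit conversion from 'void *' to other object pointer types:

    #include <stddef.h>

    static size_t
    first_byte(const void *data)
    {
        /* const unsigned char *p = data;    accepted by C, error in C++ */
        const unsigned char *p = (const unsigned char *) data;  /* both */

        return (size_t) p[0];
    }
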
Reviewed-by: Tom Lane Discussion: https://postgr.es/m/CANWCAZaCq9AHBuhs%3DMx7Gg_0Af9oRU7iAqr0itJCtfmsWwVmnQ%40mail.gmail.com Backpatch-through: 18 --- src/include/port/pg_crc32c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 82313bb7fcfee..ae008118ea818 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -72,7 +72,7 @@ pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len) { if (__builtin_constant_p(len) && len < 32) { - const unsigned char *p = data; + const unsigned char *p = (const unsigned char *) data; /* * For small constant inputs, inline the computation to avoid a From 7c6ededac82baa0c334c11a36addd48522ec433d Mon Sep 17 00:00:00 2001 From: Masahiko Sawada Date: Tue, 1 Jul 2025 23:25:17 -0700 Subject: [PATCH 018/138] Fix missing FSM vacuum opportunities on tables without indexes. Commit c120550edb86 optimized the vacuuming of relations without indexes (a.k.a. one-pass strategy) by directly marking dead item IDs as LP_UNUSED. However, the periodic FSM vacuum was still checking if dead item IDs had been marked as LP_DEAD when attempting to vacuum the FSM every VACUUM_FSM_EVERY_PAGES blocks. This condition was never met due to the optimization, resulting in missed FSM vacuum opportunities. This commit modifies the periodic FSM vacuum condition to use the number of tuples deleted during HOT pruning. This count includes items marked as either LP_UNUSED or LP_REDIRECT, both of which are expected to result in new free space to report. Back-patch to v17 where the vacuum optimization for tables with no indexes was introduced. Reviewed-by: Melanie Plageman Discussion: https://postgr.es/m/CAD21AoBL8m6B9GSzQfYxVaEgvD7-Kr3AJaS-hJPHC+avm-29zw@mail.gmail.com Backpatch-through: 17 --- src/backend/access/heap/vacuumlazy.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 4111a8996b5a1..0fef8e49e2b64 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -431,7 +431,7 @@ static void find_next_unskippable_block(LVRelState *vacrel, bool *skipsallvis); static bool lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno, Page page, bool sharelock, Buffer vmbuffer); -static void lazy_scan_prune(LVRelState *vacrel, Buffer buf, +static int lazy_scan_prune(LVRelState *vacrel, Buffer buf, BlockNumber blkno, Page page, Buffer vmbuffer, bool all_visible_according_to_vm, bool *has_lpdead_items, bool *vm_page_frozen); @@ -1245,6 +1245,7 @@ lazy_scan_heap(LVRelState *vacrel) Buffer buf; Page page; uint8 blk_info = 0; + int ndeleted = 0; bool has_lpdead_items; void *per_buffer_data = NULL; bool vm_page_frozen = false; @@ -1387,10 +1388,10 @@ lazy_scan_heap(LVRelState *vacrel) * line pointers previously marked LP_DEAD. */ if (got_cleanup_lock) - lazy_scan_prune(vacrel, buf, blkno, page, - vmbuffer, - blk_info & VAC_BLK_ALL_VISIBLE_ACCORDING_TO_VM, - &has_lpdead_items, &vm_page_frozen); + ndeleted = lazy_scan_prune(vacrel, buf, blkno, page, + vmbuffer, + blk_info & VAC_BLK_ALL_VISIBLE_ACCORDING_TO_VM, + &has_lpdead_items, &vm_page_frozen); /* * Count an eagerly scanned page as a failure or a success. @@ -1481,7 +1482,7 @@ lazy_scan_heap(LVRelState *vacrel) * table has indexes. There will only be newly-freed space if we * held the cleanup lock and lazy_scan_prune() was called. 
 	 */
-	if (got_cleanup_lock && vacrel->nindexes == 0 && has_lpdead_items &&
+	if (got_cleanup_lock && vacrel->nindexes == 0 && ndeleted > 0 &&
 		blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES)
 	{
 		FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
@@ -1936,8 +1937,10 @@ cmpOffsetNumbers(const void *a, const void *b)
 * *vm_page_frozen is set to true if the page is newly set all-frozen in the
 * VM. The caller currently only uses this for determining whether an eagerly
 * scanned page was successfully set all-frozen.
+ *
+ * Returns the number of tuples deleted from the page during HOT pruning.
 */
-static void
+static int
 lazy_scan_prune(LVRelState *vacrel,
 				Buffer buf,
 				BlockNumber blkno,
@@ -2208,6 +2211,8 @@ lazy_scan_prune(LVRelState *vacrel,
 			*vm_page_frozen = true;
 		}
 	}
+
+	return presult.ndeleted;
 }
 
 /*

From 87f0d3cd8db54f616d3e65f6cb6fb54428b852df Mon Sep 17 00:00:00 2001
From: Daniel Gustafsson
Date: Wed, 2 Jul 2025 11:42:36 +0200
Subject: [PATCH 019/138] doc: pg_buffercache documentation wordsmithing

A few words seemed to have gone missing in the leading paragraphs.

Author: Bertrand Drouvot
Co-authored-by: Daniel Gustafsson
Discussion: https://postgr.es/m/aGTQYZz9L0bjlzVL@ip-10-97-1-34.eu-west-3.compute.internal
Backpatch-through: 18
---
 doc/src/sgml/pgbuffercache.sgml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml
index 537d601494242..546ace8369e28 100644
--- a/doc/src/sgml/pgbuffercache.sgml
+++ b/doc/src/sgml/pgbuffercache.sgml
@@ -37,12 +37,12 @@
 
 This module provides the pg_buffercache_pages()
- function (wrapped in the pg_buffercache view),
+ function (wrapped in the pg_buffercache view),
 the pg_buffercache_numa_pages() function (wrapped in the
 pg_buffercache_numa view), the
 pg_buffercache_summary() function, the
 pg_buffercache_usage_counts() function, the
- pg_buffercache_evict(), the
+ pg_buffercache_evict() function, the
 pg_buffercache_evict_relation() function and the
 pg_buffercache_evict_all() function.
 
@@ -55,7 +55,7 @@
 
- The pg_buffercache_numa_pages() provides
+ The pg_buffercache_numa_pages() function provides
 NUMA node mappings for shared buffer entries. This information is
 not part of pg_buffercache_pages() itself, as it
 is much slower to retrieve.

From 7c365eb504297408673938087a15cac22a8c7d01 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan
Date: Wed, 2 Jul 2025 09:40:48 -0400
Subject: [PATCH 020/138] Make handling of redundant nbtree keys more robust.

nbtree preprocessing's handling of redundant (and contradictory) keys
created problems for scans with = arrays. It was just about possible
for a scan with an = array key and one or more redundant keys (keys
that preprocessing could not eliminate due to an incomplete opfamily
and a cross-type key) to get stuck. Testing has shown that infinite
cycling where the scan never manages to make forward progress was
possible. This could happen when the scan's arrays were reset in
_bt_readpage's forcenonrequired=true path (added by bugfix commit
5f4d98d4) when the arrays weren't at least advanced up to the same
point that they were in at the start of the _bt_readpage call.
Earlier redundant keys prevented the finaltup call to
_bt_advance_array_keys from reaching lower-order keys that needed to
be used to sufficiently advance the scan's arrays.
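
To make the redundancy problem concrete (a toy sketch under simplified
assumptions, not the actual nbtree data structures): given a qual like
"x > 4::int AND x > 10::bigint", preprocessing normally compares the
two bounds and keeps only the tighter one; when no cross-type
comparator exists, the fix below instead guarantees that exactly one
of them stays marked required, roughly:

    #include <stdbool.h>

    typedef struct LowerBound
    {
        long        arg;        /* bound value of a ">" key */
        bool        required;   /* may drive initial positioning? */
    } LowerBound;

    /* Keep exactly one ">" bound required per column; the looser key
     * stays, but can no longer block access to later required keys. */
    static void
    pick_required_lower_bound(LowerBound *a, LowerBound *b)
    {
        if (a->arg >= b->arg)
            b->required = false;
        else
            a->required = false;
    }
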
To fix, make preprocessing leave the scan's keys in a state that is as
close as possible to how it'll usually leave them (in the common case
where there's no redundant keys that preprocessing failed to
eliminate). Now nbtree preprocessing _reliably_ leaves behind at most
one required >/>= key per index column, and at most one required </<=
key per index column. Any keys that become (or started out)
nonrequired are relocated to the end of the scan's so->keyData[] array
as needed. That way they'll always be evaluated after the scan's
required keys, and so cannot prevent code in places like
_bt_advance_array_keys and _bt_first from reaching a required key.

Also teach _bt_first to decide which initial positioning keys to use
based on the same requiredness markings that have long been used by
_bt_checkkeys/_bt_advance_array_keys. This is a necessary condition
for reliably avoiding infinite cycling. _bt_advance_array_keys expects
to be able to reason about what'll happen in the next _bt_first call
should it start another primitive index scan, by evaluating inequality
keys that were marked required in the opposite-to-scan scan direction
only. Now everybody (_bt_first, _bt_checkkeys, and
_bt_advance_array_keys) will always agree on which exact key will be
used on each index column to start and/or end the scan (except when
row compare keys are involved, which have similar problems not
addressed by this commit).

An upcoming commit will finish off the work started by this commit by
harmonizing how _bt_first, _bt_checkkeys, and _bt_advance_array_keys
apply row compare keys to start and end scans.

This fixes what was arguably an oversight in either commit 5f4d98d4 or
commit 8a510275.

Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas
Discussion: https://postgr.es/m/CAH2-Wz=ds4M+3NXMgwxYxqU8MULaLf696_v5g=9WNmWL2=Uo2A@mail.gmail.com
Backpatch-through: 18
---
 src/backend/access/nbtree/nbtpreprocesskeys.c | 384 +++++++++++++++---
 src/backend/access/nbtree/nbtsearch.c         | 204 +++++-----
 src/backend/access/nbtree/nbtutils.c          | 136 +------
 3 files changed, 455 insertions(+), 269 deletions(-)

diff --git a/src/backend/access/nbtree/nbtpreprocesskeys.c b/src/backend/access/nbtree/nbtpreprocesskeys.c
index a136e4bbfdfb5..36813a96fff57 100644
--- a/src/backend/access/nbtree/nbtpreprocesskeys.c
+++ b/src/backend/access/nbtree/nbtpreprocesskeys.c
@@ -16,6 +16,7 @@
 #include "postgres.h"
 
 #include "access/nbtree.h"
+#include "common/int.h"
 #include "lib/qunique.h"
 #include "utils/array.h"
 #include "utils/lsyscache.h"
@@ -56,6 +57,8 @@ static void _bt_skiparray_strat_decrement(IndexScanDesc scan, ScanKey arraysk,
 										  BTArrayKeyInfo *array);
 static void _bt_skiparray_strat_increment(IndexScanDesc scan, ScanKey arraysk,
 										  BTArrayKeyInfo *array);
+static void _bt_unmark_keys(IndexScanDesc scan, int *keyDataMap);
+static int	_bt_reorder_array_cmp(const void *a, const void *b);
 static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys);
 static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap);
 static int	_bt_num_array_keys(IndexScanDesc scan, Oid *skip_eq_ops_out,
@@ -96,7 +99,7 @@ static int	_bt_compare_array_elements(const void *a, const void *b, void *arg);
 * incomplete sets of cross-type operators, we may fail to detect redundant
 * or contradictory keys, but we can survive that.)
 *
- * The output keys must be sorted by index attribute. Presently we expect
+ * Required output keys are sorted by index attribute. Presently we expect
 * (but verify) that the input keys are already so sorted --- this is done
 * by match_clauses_to_index() in indxpath.c. Some reordering of the keys
Some reordering of the keys * within each attribute may be done as a byproduct of the processing here. @@ -127,29 +130,36 @@ static int _bt_compare_array_elements(const void *a, const void *b, void *arg); * This has the potential to be much more efficient than a full index scan * (though it behaves like a full scan when there's many distinct "x" values). * - * If possible, redundant keys are eliminated: we keep only the tightest + * Typically, redundant keys are eliminated: we keep only the tightest * >/>= bound and the tightest />= or both - * 4::int AND x > 10::bigint", and we are unable to determine - * which key is more restrictive for lack of a suitable cross-type operator. - * _bt_first will arbitrarily pick one of the keys to do the initial - * positioning with. If it picks x > 4, then the x > 10 condition will fail - * until we reach index entries > 10; but we can't stop the scan just because - * x > 10 is failing. On the other hand, if we are scanning backwards, then - * failure of either key is indeed enough to stop the scan. (In general, when - * inequality keys are present, the initial-positioning code only promises to - * position before the first possible match, not exactly at the first match, - * for a forward scan; or after the last match for a backward scan.) + * we cannot eliminate either key. + * + * When all redundant keys could not be eliminated, we'll output a key array + * that can more or less be treated as if it had no redundant keys. Suppose + * we have "x > 4::int AND x > 10::bigint AND x < 70", and we are unable to + * determine which > key is more restrictive for lack of a suitable cross-type + * operator. We'll arbitrarily pick one of the > keys; the other > key won't + * be marked required. Obviously, the scan will be less efficient if we + * choose x > 4 over x > 10 -- but it can still largely proceed as if there + * was only a single > condition. "x > 10" will be placed at the end of the + * so->keyData[] output array. It'll always be evaluated last, after the keys + * that could be marked required in the usual way (after "x > 4 AND x < 70"). + * This can sometimes result in so->keyData[] keys that aren't even in index + * attribute order (if the qual involves multiple attributes). The scan's + * required keys will still be in attribute order, though, so it can't matter. + * + * This scheme ensures that _bt_first always uses the same set of keys at the + * start of a forwards scan as those _bt_checkkeys uses to determine when to + * end a similar backwards scan (and vice-versa). _bt_advance_array_keys + * depends on this: it expects to be able to reliably predict what the next + * _bt_first call will do by testing whether _bt_checkkeys' routines report + * that the final tuple on the page is past the end of matches for the scan's + * keys with the scan direction flipped. If it is (if continuescan=false), + * then it follows that calling _bt_first will, at a minimum, relocate the + * scan to the very next leaf page (in the current scan direction). * * As a byproduct of this work, we can detect contradictory quals such * as "x = 1 AND x > 2". 
If we see that, we return so->qual_ok = false, @@ -188,7 +198,8 @@ _bt_preprocess_keys(IndexScanDesc scan) int numberOfEqualCols; ScanKey inkeys; BTScanKeyPreproc xform[BTMaxStrategyNumber]; - bool test_result; + bool test_result, + redundant_key_kept = false; AttrNumber attno; ScanKey arrayKeyData; int *keyDataMap = NULL; @@ -388,7 +399,8 @@ _bt_preprocess_keys(IndexScanDesc scan) xform[j].inkey = NULL; xform[j].inkeyi = -1; } - /* else, cannot determine redundancy, keep both keys */ + else + redundant_key_kept = true; } /* track number of attrs for which we have "=" keys */ numberOfEqualCols++; @@ -409,6 +421,8 @@ _bt_preprocess_keys(IndexScanDesc scan) else xform[BTLessStrategyNumber - 1].inkey = NULL; } + else + redundant_key_kept = true; } /* try to keep only one of >, >= */ @@ -426,6 +440,8 @@ _bt_preprocess_keys(IndexScanDesc scan) else xform[BTGreaterStrategyNumber - 1].inkey = NULL; } + else + redundant_key_kept = true; } /* @@ -466,25 +482,6 @@ _bt_preprocess_keys(IndexScanDesc scan) /* check strategy this key's operator corresponds to */ j = inkey->sk_strategy - 1; - /* if row comparison, push it directly to the output array */ - if (inkey->sk_flags & SK_ROW_HEADER) - { - ScanKey outkey = &so->keyData[new_numberOfKeys++]; - - memcpy(outkey, inkey, sizeof(ScanKeyData)); - if (arrayKeyData) - keyDataMap[new_numberOfKeys - 1] = i; - if (numberOfEqualCols == attno - 1) - _bt_mark_scankey_required(outkey); - - /* - * We don't support RowCompare using equality; such a qual would - * mess up the numberOfEqualCols tracking. - */ - Assert(j != (BTEqualStrategyNumber - 1)); - continue; - } - if (inkey->sk_strategy == BTEqualStrategyNumber && (inkey->sk_flags & SK_SEARCHARRAY)) { @@ -593,9 +590,8 @@ _bt_preprocess_keys(IndexScanDesc scan) * the new scan key. * * Note: We do things this way around so that our arrays are - * always in the same order as their corresponding scan keys, - * even with incomplete opfamilies. _bt_advance_array_keys - * depends on this. + * always in the same order as their corresponding scan keys. + * _bt_preprocess_array_keys_final expects this. */ ScanKey outkey = &so->keyData[new_numberOfKeys++]; @@ -607,6 +603,7 @@ _bt_preprocess_keys(IndexScanDesc scan) xform[j].inkey = inkey; xform[j].inkeyi = i; xform[j].arrayidx = arrayidx; + redundant_key_kept = true; } } } @@ -622,6 +619,15 @@ _bt_preprocess_keys(IndexScanDesc scan) if (arrayKeyData) _bt_preprocess_array_keys_final(scan, keyDataMap); + /* + * If there are remaining redundant inequality keys, we must make sure + * that each index attribute has no more than one required >/>= key, and + * no more than one required qual_ok) + _bt_unmark_keys(scan, keyDataMap); + /* Could pfree arrayKeyData/keyDataMap now, but not worth the cycles */ } @@ -847,8 +853,7 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, cmp_op; StrategyNumber strat; - Assert(!((leftarg->sk_flags | rightarg->sk_flags) & - (SK_ROW_HEADER | SK_ROW_MEMBER))); + Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_ROW_MEMBER)); /* * First, deal with cases where one or both args are NULL. 
This should @@ -924,6 +929,16 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, return true; } + /* + * We don't yet know how to determine redundancy when it involves a row + * compare key (barring simple cases involving IS NULL/IS NOT NULL) + */ + if ((leftarg->sk_flags | rightarg->sk_flags) & SK_ROW_HEADER) + { + Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_BT_SKIP)); + return false; + } + /* * If either leftarg or rightarg are equality-type array scankeys, we need * specialized handling (since by now we know that IS NULL wasn't used) @@ -1467,6 +1482,283 @@ _bt_skiparray_strat_increment(IndexScanDesc scan, ScanKey arraysk, } } +/* + * _bt_unmark_keys() -- make superfluous required keys nonrequired after all + * + * When _bt_preprocess_keys fails to eliminate one or more redundant keys, it + * calls here to make sure that no index attribute has more than one > or >= + * key marked required, and no more than one required < or <= key. Attributes + * with = keys will always get one = key as their required key. All other + * keys that were initially marked required get "unmarked" here. That way, + * _bt_first and _bt_checkkeys will reliably agree on which keys to use to + * start and/or to end the scan. + * + * We also relocate keys that become/started out nonrequired to the end of + * so->keyData[]. That way, _bt_first and _bt_checkkeys cannot fail to reach + * a required key due to some earlier nonrequired key getting in the way. + * + * Only call here when _bt_compare_scankey_args returned false at least once + * (otherwise, calling here will just waste cycles). + */ +static void +_bt_unmark_keys(IndexScanDesc scan, int *keyDataMap) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + AttrNumber attno; + bool *unmarkikey; + int nunmark, + nunmarked, + nkept, + firsti; + ScanKey keepKeys, + unmarkKeys; + FmgrInfo *keepOrderProcs = NULL, + *unmarkOrderProcs = NULL; + bool haveReqEquals, + haveReqForward, + haveReqBackward; + + /* + * Do an initial pass over so->keyData[] that determines which keys to + * keep as required. We expect so->keyData[] to still be in attribute + * order when we're called (though we don't expect any particular order + * among each attribute's keys). + * + * When both equality and inequality keys remain on a single attribute, we + * *must* make sure that exactly one of the equalities remains required. + * Any requiredness markings that we might leave on later keys/attributes + * are predicated on there being required = keys on all prior columns. + */ + unmarkikey = palloc0(so->numberOfKeys * sizeof(bool)); + nunmark = 0; + + /* Set things up for first key's attribute */ + attno = so->keyData[0].sk_attno; + firsti = 0; + haveReqEquals = false; + haveReqForward = false; + haveReqBackward = false; + for (int i = 0; i < so->numberOfKeys; i++) + { + ScanKey origkey = &so->keyData[i]; + + if (origkey->sk_attno != attno) + { + /* Reset for next attribute */ + attno = origkey->sk_attno; + firsti = i; + + haveReqEquals = false; + haveReqForward = false; + haveReqBackward = false; + } + + /* Equalities get priority over inequalities */ + if (haveReqEquals) + { + /* + * We already found the first "=" key for this attribute. We've + * already decided that all its other keys will be unmarked. + */ + Assert(!(origkey->sk_flags & SK_SEARCHNULL)); + unmarkikey[i] = true; + nunmark++; + continue; + } + else if ((origkey->sk_flags & SK_BT_REQFWD) && + (origkey->sk_flags & SK_BT_REQBKWD)) + { + /* + * Found the first "=" key for attno. 
All other attno keys will + * be unmarked. + */ + Assert(origkey->sk_strategy == BTEqualStrategyNumber); + + haveReqEquals = true; + for (int j = firsti; j < i; j++) + { + /* Unmark any prior inequality keys on attno after all */ + if (!unmarkikey[j]) + { + unmarkikey[j] = true; + nunmark++; + } + } + continue; + } + + /* Deal with inequalities next */ + if ((origkey->sk_flags & SK_BT_REQFWD) && !haveReqForward) + { + haveReqForward = true; + continue; + } + else if ((origkey->sk_flags & SK_BT_REQBKWD) && !haveReqBackward) + { + haveReqBackward = true; + continue; + } + + /* + * We have either a redundant inequality key that will be unmarked, or + * we have a key that wasn't marked required in the first place + */ + unmarkikey[i] = true; + nunmark++; + } + + /* Should only be called when _bt_compare_scankey_args reported failure */ + Assert(nunmark > 0); + + /* + * Next, allocate temp arrays: one for required keys that'll remain + * required, the other for all remaining keys + */ + unmarkKeys = palloc(nunmark * sizeof(ScanKeyData)); + keepKeys = palloc((so->numberOfKeys - nunmark) * sizeof(ScanKeyData)); + nunmarked = 0; + nkept = 0; + if (so->numArrayKeys) + { + unmarkOrderProcs = palloc(nunmark * sizeof(FmgrInfo)); + keepOrderProcs = palloc((so->numberOfKeys - nunmark) * sizeof(FmgrInfo)); + } + + /* + * Next, copy the contents of so->keyData[] into the appropriate temp + * array. + * + * Scans with = array keys need us to maintain invariants around the order + * of so->orderProcs[] and so->arrayKeys[] relative to so->keyData[]. See + * _bt_preprocess_array_keys_final for a full explanation. + */ + for (int i = 0; i < so->numberOfKeys; i++) + { + ScanKey origkey = &so->keyData[i]; + ScanKey unmark; + + if (!unmarkikey[i]) + { + /* + * Key gets to keep its original requiredness markings. + * + * Key will stay in its original position, unless we're going to + * unmark an earlier key (in which case this key gets moved back). + */ + memcpy(keepKeys + nkept, origkey, sizeof(ScanKeyData)); + + if (so->numArrayKeys) + { + keyDataMap[i] = nkept; + memcpy(keepOrderProcs + nkept, &so->orderProcs[i], + sizeof(FmgrInfo)); + } + + nkept++; + continue; + } + + /* + * Key will be unmarked as needed, and moved to the end of the array, + * next to other keys that will become (or always were) nonrequired + */ + unmark = unmarkKeys + nunmarked; + memcpy(unmark, origkey, sizeof(ScanKeyData)); + + if (so->numArrayKeys) + { + keyDataMap[i] = (so->numberOfKeys - nunmark) + nunmarked; + memcpy(&unmarkOrderProcs[nunmarked], &so->orderProcs[i], + sizeof(FmgrInfo)); + } + + /* + * Preprocessing only generates skip arrays when it knows that they'll + * be the only required = key on the attr. We'll never unmark them. + */ + Assert(!(unmark->sk_flags & SK_BT_SKIP)); + + /* + * Also shouldn't have to unmark an IS NULL or an IS NOT NULL key. + * They aren't cross-type, so an incomplete opfamily can't matter. 
+ */ + Assert(!(unmark->sk_flags & SK_ISNULL) || + !(unmark->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))); + + /* Clear requiredness flags on redundant key (and on any subkeys) */ + unmark->sk_flags &= ~(SK_BT_REQFWD | SK_BT_REQBKWD); + if (unmark->sk_flags & SK_ROW_HEADER) + { + ScanKey subkey = (ScanKey) DatumGetPointer(unmark->sk_argument); + + Assert(subkey->sk_strategy == unmark->sk_strategy); + for (;;) + { + Assert(subkey->sk_flags & SK_ROW_MEMBER); + subkey->sk_flags &= ~(SK_BT_REQFWD | SK_BT_REQBKWD); + if (subkey->sk_flags & SK_ROW_END) + break; + subkey++; + } + } + + nunmarked++; + } + + /* Copy both temp arrays back into so->keyData[] to reorder */ + Assert(nkept == so->numberOfKeys - nunmark); + Assert(nunmarked == nunmark); + memcpy(so->keyData, keepKeys, sizeof(ScanKeyData) * nkept); + memcpy(so->keyData + nkept, unmarkKeys, sizeof(ScanKeyData) * nunmarked); + + /* Done with temp arrays */ + pfree(unmarkikey); + pfree(keepKeys); + pfree(unmarkKeys); + + /* + * Now copy so->orderProcs[] temp entries needed by scans with = array + * keys back (just like with the so->keyData[] temp arrays) + */ + if (so->numArrayKeys) + { + memcpy(so->orderProcs, keepOrderProcs, sizeof(FmgrInfo) * nkept); + memcpy(so->orderProcs + nkept, unmarkOrderProcs, + sizeof(FmgrInfo) * nunmarked); + + /* Also fix-up array->scan_key references */ + for (int arridx = 0; arridx < so->numArrayKeys; arridx++) + { + BTArrayKeyInfo *array = &so->arrayKeys[arridx]; + + array->scan_key = keyDataMap[array->scan_key]; + } + + /* + * Sort so->arrayKeys[] based on its new BTArrayKeyInfo.scan_key + * offsets, so that its order matches so->keyData[] order as expected + */ + qsort(so->arrayKeys, so->numArrayKeys, sizeof(BTArrayKeyInfo), + _bt_reorder_array_cmp); + + /* Done with temp arrays */ + pfree(unmarkOrderProcs); + pfree(keepOrderProcs); + } +} + +/* + * qsort comparator for reordering so->arrayKeys[] BTArrayKeyInfo entries + */ +static int +_bt_reorder_array_cmp(const void *a, const void *b) +{ + BTArrayKeyInfo *arraya = (BTArrayKeyInfo *) a; + BTArrayKeyInfo *arrayb = (BTArrayKeyInfo *) b; + + return pg_cmp_s32(arraya->scan_key, arrayb->scan_key); +} + /* * _bt_preprocess_array_keys() -- Preprocess SK_SEARCHARRAY scan keys * diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 36544ecfd5878..9846ef6db53ae 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -960,46 +960,51 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) /*---------- * Examine the scan keys to discover where we need to start the scan. + * The selected scan keys (at most one per index column) are remembered by + * storing their addresses into the local startKeys[] array. The final + * startKeys[] entry's strategy is set in strat_total. (Actually, there + * are a couple of cases where we force a less/more restrictive strategy.) * - * We want to identify the keys that can be used as starting boundaries; - * these are =, >, or >= keys for a forward scan or =, <, <= keys for - * a backwards scan. We can use keys for multiple attributes so long as - * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept - * a > or < boundary or find an attribute with no boundary (which can be - * thought of as the same as "> -infinity"), we can't use keys for any - * attributes to its right, because it would break our simplistic notion - * of what initial positioning strategy to use. 
+ * We must use the key that was marked required (in the direction opposite + * our own scan's) during preprocessing. Each index attribute can only + * have one such required key. In general, the keys that we use to find + * an initial position when scanning forwards are the same keys that end + * the scan on the leaf level when scanning backwards (and vice-versa). * * When the scan keys include cross-type operators, _bt_preprocess_keys - * may not be able to eliminate redundant keys; in such cases we will - * arbitrarily pick a usable one for each attribute. This is correct - * but possibly not optimal behavior. (For example, with keys like - * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when - * x=5 would be more efficient.) Since the situation only arises given - * a poorly-worded query plus an incomplete opfamily, live with it. + * may not be able to eliminate redundant keys; in such cases it will + * arbitrarily pick a usable key for each attribute (and scan direction), + * ensuring that there is no more than one key required in each direction. + * We stop considering further keys once we reach the first nonrequired + * key (which must come after all required keys), so this can't affect us. + * + * The required keys that we use as starting boundaries have to be =, >, + * or >= keys for a forward scan or =, <, <= keys for a backwards scan. + * We can use keys for multiple attributes so long as the prior attributes + * had only =, >= (resp. =, <=) keys. These rules are very similar to the + * rules that preprocessing used to determine which keys to mark required. + * We cannot always use every required key as a positioning key, though. + * Skip arrays necessitate independently applying our own rules here. + * Skip arrays are always generally considered = array keys, but we'll + * nevertheless treat them as inequalities at certain points of the scan. + * When that happens, it _might_ have implications for the number of + * required keys that we can safely use for initial positioning purposes. * - * When both equality and inequality keys appear for a single attribute - * (again, only possible when cross-type operators appear), we *must* - * select one of the equality keys for the starting point, because - * _bt_checkkeys() will stop the scan as soon as an equality qual fails. - * For example, if we have keys like "x >= 4 AND x = 10" and we elect to - * start at x=4, we will fail and stop before reaching x=10. If multiple - * equality quals survive preprocessing, however, it doesn't matter which - * one we use --- by definition, they are either redundant or - * contradictory. + * For example, a forward scan with a skip array on its leading attribute + * (with no low_compare/high_compare) will have at least two required scan + * keys, but we won't use any of them as boundary keys during the scan's + * initial call here. Our positioning key during the first call here can + * be thought of as representing "> -infinity". Similarly, if such a skip + * array's low_compare is "a > 'foo'", then we position using "a > 'foo'" + * during the scan's initial call here; a lower-order key such as "b = 42" + * can't be used until the "a" array advances beyond MINVAL/low_compare. * - * In practice we rarely see any "attribute boundary key gaps" here. - * Preprocessing can usually backfill skip array keys for any attributes - * that were omitted from the original scan->keyData[] input keys. 
All - * array keys are always considered = keys, but we'll sometimes need to - * treat the current key value as if we were using an inequality strategy. - * This happens with range skip arrays, which store inequality keys in the - * array's low_compare/high_compare fields (used to find the first/last - * set of matches, when = key will lack a usable sk_argument value). - * These are always preferred over any redundant "standard" inequality - * keys on the same column (per the usual rule about preferring = keys). - * Note also that any column with an = skip array key can never have an - * additional, contradictory = key. + * On the other hand, if such a skip array's low_compare was "a >= 'foo'", + * then we _can_ use "a >= 'foo' AND b = 42" during the initial call here. + * A subsequent call here might have us use "a = 'fop' AND b = 42". Note + * that we treat = and >= as equivalent when scanning forwards (just as we + * treat = and <= as equivalent when scanning backwards). We effectively + * do the same thing (though with a distinct "a" element/value) each time. * * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP * array keys whose array is "null_elem=true") imply a NOT NULL qualifier. @@ -1014,18 +1019,17 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * first (leftmost) columns. We'll add on lower-order columns of the row * comparison below, if possible. * - * The selected scan keys (at most one per index column) are remembered by - * storing their addresses into the local startKeys[] array. - * - * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start - * the next primitive index scan (for scans with array keys) based in part - * on an understanding of how it'll enable us to reposition the scan. - * They're directly aware of how we'll sometimes cons up an explicit - * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a - * symmetric "deduce NOT NULL" rule of their own. This allows top-level - * scans to skip large groups of NULLs through repeated deductions about - * key strictness (for a required inequality key) and whether NULLs in the - * key's index column are stored last or first (relative to non-NULLs). + * _bt_advance_array_keys needs to know exactly how we'll reposition the + * scan (should it opt to schedule another primitive index scan). It is + * critical that primscans only be scheduled when they'll definitely make + * some useful progress. _bt_advance_array_keys does this by calling + * _bt_checkkeys routines that report whether a tuple is past the end of + * matches for the scan's keys (given the scan's current array elements). + * If the page's final tuple is "after the end of matches" for a scan that + * uses the *opposite* scan direction, then it must follow that it's also + * "before the start of matches" for the actual current scan direction. + * It is therefore essential that all of our initial positioning rules are + * symmetric with _bt_checkkeys's corresponding continuescan=false rule. * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might * need to be kept in sync. *---------- @@ -1034,18 +1038,17 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) if (so->numberOfKeys > 0) { AttrNumber curattr; - ScanKey chosen; + ScanKey bkey; ScanKey impliesNN; ScanKey cur; /* - * chosen is the so-far-chosen key for the current attribute, if any. - * We don't cast the decision in stone until we reach keys for the - * next attribute. 
+ * bkey will be set to the key that preprocessing left behind as the + * boundary key for this attribute, in this scan direction (if any) */ cur = so->keyData; curattr = 1; - chosen = NULL; + bkey = NULL; /* Also remember any scankey that implies a NOT NULL constraint */ impliesNN = NULL; @@ -1058,23 +1061,29 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) { if (i >= so->numberOfKeys || cur->sk_attno != curattr) { + /* Done looking for the curattr boundary key */ + Assert(bkey == NULL || + (bkey->sk_attno == curattr && + (bkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))); + Assert(impliesNN == NULL || + (impliesNN->sk_attno == curattr && + (impliesNN->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))); + /* - * Done looking at keys for curattr. - * * If this is a scan key for a skip array whose current * element is MINVAL, choose low_compare (when scanning * backwards it'll be MAXVAL, and we'll choose high_compare). * - * Note: if the array's low_compare key makes 'chosen' NULL, + * Note: if the array's low_compare key makes 'bkey' NULL, * then we behave as if the array's first element is -inf, * except when !array->null_elem implies a usable NOT NULL * constraint. */ - if (chosen != NULL && - (chosen->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))) + if (bkey != NULL && + (bkey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))) { - int ikey = chosen - so->keyData; - ScanKey skipequalitykey = chosen; + int ikey = bkey - so->keyData; + ScanKey skipequalitykey = bkey; BTArrayKeyInfo *array = NULL; for (int arridx = 0; arridx < so->numArrayKeys; arridx++) @@ -1087,35 +1096,35 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) if (ScanDirectionIsForward(dir)) { Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL)); - chosen = array->low_compare; + bkey = array->low_compare; } else { Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL)); - chosen = array->high_compare; + bkey = array->high_compare; } - Assert(chosen == NULL || - chosen->sk_attno == skipequalitykey->sk_attno); + Assert(bkey == NULL || + bkey->sk_attno == skipequalitykey->sk_attno); if (!array->null_elem) impliesNN = skipequalitykey; else - Assert(chosen == NULL && impliesNN == NULL); + Assert(bkey == NULL && impliesNN == NULL); } /* * If we didn't find a usable boundary key, see if we can * deduce a NOT NULL key */ - if (chosen == NULL && impliesNN != NULL && + if (bkey == NULL && impliesNN != NULL && ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? ScanDirectionIsForward(dir) : ScanDirectionIsBackward(dir))) { /* Yes, so build the key in notnullkeys[keysz] */ - chosen = ¬nullkeys[keysz]; - ScanKeyEntryInitialize(chosen, + bkey = ¬nullkeys[keysz]; + ScanKeyEntryInitialize(bkey, (SK_SEARCHNOTNULL | SK_ISNULL | (impliesNN->sk_flags & (SK_BT_DESC | SK_BT_NULLS_FIRST))), @@ -1130,12 +1139,12 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } /* - * If we still didn't find a usable boundary key, quit; else - * save the boundary key pointer in startKeys. 
+ * If preprocessing didn't leave a usable boundary key, quit; + * else save the boundary key pointer in startKeys[] */ - if (chosen == NULL) + if (bkey == NULL) break; - startKeys[keysz++] = chosen; + startKeys[keysz++] = bkey; /* * We can only consider adding more boundary keys when the one @@ -1143,7 +1152,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * (during backwards scans we can only do so when the key that * we just added to startKeys[] uses the = or <= strategy) */ - strat_total = chosen->sk_strategy; + strat_total = bkey->sk_strategy; if (strat_total == BTGreaterStrategyNumber || strat_total == BTLessStrategyNumber) break; @@ -1154,19 +1163,19 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * make strat_total > or < (and stop adding boundary keys). * This can only happen with opclasses that lack skip support. */ - if (chosen->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR)) + if (bkey->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR)) { - Assert(chosen->sk_flags & SK_BT_SKIP); + Assert(bkey->sk_flags & SK_BT_SKIP); Assert(strat_total == BTEqualStrategyNumber); if (ScanDirectionIsForward(dir)) { - Assert(!(chosen->sk_flags & SK_BT_PRIOR)); + Assert(!(bkey->sk_flags & SK_BT_PRIOR)); strat_total = BTGreaterStrategyNumber; } else { - Assert(!(chosen->sk_flags & SK_BT_NEXT)); + Assert(!(bkey->sk_flags & SK_BT_NEXT)); strat_total = BTLessStrategyNumber; } @@ -1180,24 +1189,30 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) /* * Done if that was the last scan key output by preprocessing. - * Also done if there is a gap index attribute that lacks a - * usable key (only possible when preprocessing was unable to - * generate a skip array key to "fill in the gap"). + * Also done if we've now examined all keys marked required. */ if (i >= so->numberOfKeys || - cur->sk_attno != curattr + 1) + !(cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) break; /* * Reset for next attr. */ + Assert(cur->sk_attno == curattr + 1); curattr = cur->sk_attno; - chosen = NULL; + bkey = NULL; impliesNN = NULL; } /* - * Can we use this key as a starting boundary for this attr? + * If we've located the starting boundary key for curattr, we have + * no interest in curattr's other required key + */ + if (bkey != NULL) + continue; + + /* + * Is this key the starting boundary key for curattr? * * If not, does it imply a NOT NULL constraint? 
(Because * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber, @@ -1207,27 +1222,20 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) { case BTLessStrategyNumber: case BTLessEqualStrategyNumber: - if (chosen == NULL) - { - if (ScanDirectionIsBackward(dir)) - chosen = cur; - else - impliesNN = cur; - } + if (ScanDirectionIsBackward(dir)) + bkey = cur; + else if (impliesNN == NULL) + impliesNN = cur; break; case BTEqualStrategyNumber: - /* override any non-equality choice */ - chosen = cur; + bkey = cur; break; case BTGreaterEqualStrategyNumber: case BTGreaterStrategyNumber: - if (chosen == NULL) - { - if (ScanDirectionIsForward(dir)) - chosen = cur; - else - impliesNN = cur; - } + if (ScanDirectionIsForward(dir)) + bkey = cur; + else if (impliesNN == NULL) + impliesNN = cur; break; } } diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index c71d1b6f2e1e0..eb6dbfda33c6e 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -44,7 +44,6 @@ static bool _bt_array_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *arra static bool _bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array); static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir, bool *skip_array_set); -static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir); static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, IndexTuple tuple, TupleDesc tupdesc, int tupnatts, bool readpagetup, int sktrig, bool *scanBehind); @@ -52,7 +51,6 @@ static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, int sktrig, bool sktrig_required); #ifdef USE_ASSERT_CHECKING -static bool _bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir); static bool _bt_verify_keys_with_arraykeys(IndexScanDesc scan); #endif static bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, @@ -1034,73 +1032,6 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir, return false; } -/* - * _bt_rewind_nonrequired_arrays() -- Rewind SAOP arrays not marked required - * - * Called when _bt_advance_array_keys decides to start a new primitive index - * scan on the basis of the current scan position being before the position - * that _bt_first is capable of repositioning the scan to by applying an - * inequality operator required in the opposite-to-scan direction only. - * - * Although equality strategy scan keys (for both arrays and non-arrays alike) - * are either marked required in both directions or in neither direction, - * there is a sense in which non-required arrays behave like required arrays. - * With a qual such as "WHERE a IN (100, 200) AND b >= 3 AND c IN (5, 6, 7)", - * the scan key on "c" is non-required, but nevertheless enables positioning - * the scan at the first tuple >= "(100, 3, 5)" on the leaf level during the - * first descent of the tree by _bt_first. Later on, there could also be a - * second descent, that places the scan right before tuples >= "(200, 3, 5)". - * _bt_first must never be allowed to build an insertion scan key whose "c" - * entry is set to a value other than 5, the "c" array's first element/value. - * (Actually, it's the first in the current scan direction. This example uses - * a forward scan.) - * - * Calling here resets the array scan key elements for the scan's non-required - * arrays. 
This is strictly necessary for correctness in a subset of cases - * involving "required in opposite direction"-triggered primitive index scans. - * Not all callers are at risk of _bt_first using a non-required array like - * this, but advancement always resets the arrays when another primitive scan - * is scheduled, just to keep things simple. Array advancement even makes - * sure to reset non-required arrays during scans that have no inequalities. - * (Advancement still won't call here when there are no inequalities, though - * that's just because it's all handled indirectly instead.) - * - * Note: _bt_verify_arrays_bt_first is called by an assertion to enforce that - * everybody got this right. - * - * Note: In practice almost all SAOP arrays are marked required during - * preprocessing (if necessary by generating skip arrays). It is hardly ever - * truly necessary to call here, but consistently doing so is simpler. - */ -static void -_bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir) -{ - Relation rel = scan->indexRelation; - BTScanOpaque so = (BTScanOpaque) scan->opaque; - int arrayidx = 0; - - for (int ikey = 0; ikey < so->numberOfKeys; ikey++) - { - ScanKey cur = so->keyData + ikey; - BTArrayKeyInfo *array = NULL; - - if (!(cur->sk_flags & SK_SEARCHARRAY) || - cur->sk_strategy != BTEqualStrategyNumber) - continue; - - array = &so->arrayKeys[arrayidx++]; - Assert(array->scan_key == ikey); - - if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) - continue; - - Assert(array->num_elems != -1); /* No non-required skip arrays */ - - _bt_array_set_low_or_high(rel, cur, array, - ScanDirectionIsForward(dir)); - } -} - /* * _bt_tuple_before_array_skeys() -- too early to advance required arrays? * @@ -1380,8 +1311,6 @@ _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir) */ if (so->needPrimScan) { - Assert(_bt_verify_arrays_bt_first(scan, dir)); - /* * Flag was set -- must call _bt_first again, which will reset the * scan's needPrimScan flag @@ -2007,14 +1936,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, */ else if (has_required_opposite_direction_only && pstate->finaltup && unlikely(!_bt_oppodir_checkkeys(scan, dir, pstate->finaltup))) - { - /* - * Make sure that any SAOP arrays that were not marked required by - * preprocessing are reset to their first element for this direction - */ - _bt_rewind_nonrequired_arrays(scan, dir); goto new_prim_scan; - } continue_scan: @@ -2045,8 +1967,6 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, */ so->oppositeDirCheck = has_required_opposite_direction_only; - _bt_rewind_nonrequired_arrays(scan, dir); - /* * skip by setting "look ahead" mechanism's offnum for forwards scans * (backwards scans check scanBehind flag directly instead) @@ -2142,48 +2062,6 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, } #ifdef USE_ASSERT_CHECKING -/* - * Verify that the scan's qual state matches what we expect at the point that - * _bt_start_prim_scan is about to start a just-scheduled new primitive scan. - * - * We enforce a rule against non-required array scan keys: they must start out - * with whatever element is the first for the scan's current scan direction. - * See _bt_rewind_nonrequired_arrays comments for an explanation. 
- */ -static bool -_bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir) -{ - BTScanOpaque so = (BTScanOpaque) scan->opaque; - int arrayidx = 0; - - for (int ikey = 0; ikey < so->numberOfKeys; ikey++) - { - ScanKey cur = so->keyData + ikey; - BTArrayKeyInfo *array = NULL; - int first_elem_dir; - - if (!(cur->sk_flags & SK_SEARCHARRAY) || - cur->sk_strategy != BTEqualStrategyNumber) - continue; - - array = &so->arrayKeys[arrayidx++]; - - if (((cur->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) || - ((cur->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir))) - continue; - - if (ScanDirectionIsForward(dir)) - first_elem_dir = 0; - else - first_elem_dir = array->num_elems - 1; - - if (array->cur_elem != first_elem_dir) - return false; - } - - return _bt_verify_keys_with_arraykeys(scan); -} - /* * Verify that the scan's "so->keyData[]" scan keys are in agreement with * its array key state @@ -2194,6 +2072,7 @@ _bt_verify_keys_with_arraykeys(IndexScanDesc scan) BTScanOpaque so = (BTScanOpaque) scan->opaque; int last_sk_attno = InvalidAttrNumber, arrayidx = 0; + bool nonrequiredseen = false; if (!so->qual_ok) return false; @@ -2217,8 +2096,16 @@ _bt_verify_keys_with_arraykeys(IndexScanDesc scan) if (array->num_elems != -1 && cur->sk_argument != array->elem_values[array->cur_elem]) return false; - if (last_sk_attno > cur->sk_attno) - return false; + if (cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) + { + if (last_sk_attno > cur->sk_attno) + return false; + if (nonrequiredseen) + return false; + } + else + nonrequiredseen = true; + last_sk_attno = cur->sk_attno; } @@ -2551,7 +2438,6 @@ _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate) if (!(key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) { /* Scan key isn't marked required (corner case) */ - Assert(!(key->sk_flags & SK_ROW_HEADER)); break; /* unsafe */ } if (key->sk_flags & SK_ROW_HEADER) From 4cb889d21f3d40844dd04fa0fc65e485c789e74e Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Wed, 2 Jul 2025 09:48:14 -0400 Subject: [PATCH 021/138] Make row compares robust during nbtree array scans. Recent nbtree bugfix commit 5f4d98d4 added a special case to the code that sets up a page-level prefix of keys that are definitely satisfied by every tuple on the page: whenever _bt_set_startikey reached a row compare key, we'd refuse to apply the pstate.forcenonrequired behavior in scans where that usually happens (scans with a higher-order array key). That hack made the scan avoid essentially the same infinite cycling behavior that also affected nbtree scans with redundant keys (keys that preprocessing could not eliminate) prior to commit f09816a0. There are now serious doubts about this row compare workaround. Testing has shown that a scan with a row compare key and an array key could still read the same leaf page twice (without the scan's direction changing), which isn't supposed to be possible following the SAOP enhancements added by Postgres 17 commit 5bf748b8. Also, we still allowed a required row compare key to be used with forcenonrequired mode when its header key happened to be beyond the pstate.ikey set by _bt_set_startikey, which was complicated and brittle. The underlying problem was that row compares had inconsistent rules around how scans start (which keys can be used for initial positioning purposes) and how scans end (which keys can set continuescan=false). Quals with redundant keys that could not be eliminated by preprocessing also had that same quality to them prior to today's bugfix f09816a0. 
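
For illustration, the problem scenario described above can come from a
qual shaped like this sketch (hypothetical table "t" with a multicolumn
index on (a, b)), which combines a row compare key with an array key:

    SELECT * FROM t
    WHERE (a, b) > (5, 10) AND b = ANY ('{1, 2, 3}');

The row compare key determines the scan's initial position, while the
array key can schedule further primitive index scans.
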
It now seems prudent to bring row compare keys in line with the new
charter for required keys, by making the start and end rules symmetric.
This commit fixes two points of disagreement between _bt_first and
_bt_check_rowcompare. Firstly, _bt_check_rowcompare was capable of
ending the scan at the point where it needed to compare an
ISNULL-marked row compare member that came immediately after a required
row compare member. _bt_first now has symmetric handling for NULL row
compares. Secondly, _bt_first had its own ideas about which keys were
safe to use for initial positioning purposes. It could use fewer or
more keys than _bt_check_rowcompare. _bt_first now uses the same
requiredness markings as _bt_check_rowcompare for this.

Now that _bt_first and _bt_check_rowcompare agree on how to start and
end scans, we can get rid of the forcenonrequired special case, without
any risk of infinite cycling. This approach also makes row compare
keys behave more like regular scalar keys, particularly within
_bt_first.

Fixing these inconsistencies necessitates dealing with a related issue
with the way that row compares were marked required by preprocessing:
we didn't mark any lower-order row members required following 2016
bugfix commit a298a1e0. That approach was overly broad. The bug in
question was actually an oversight in how _bt_check_rowcompare dealt
with tuple NULL values that failed to satisfy a scan key marked
required in the opposite scan direction (it was a bug in 2011 commits
6980f817 and 882368e8, not a bug in 2006 commit 3a0a16cb). Go back to
marking row compare members as required using the original 2006 rules,
and fix the 2016 bug in a more principled way: by limiting use of the
"set continuescan=false with a key required in the opposite scan
direction upon encountering a NULL tuple value" optimization to the
first/most significant row member key. While it isn't safe to use an
implied IS NOT NULL qualifier to end the scan when it comes from a
required lower-order row compare member key, it _is_ generally safe for
such a required member key to end the scan -- provided the key is
marked required in the _current_ scan direction.

This fixes what was arguably an oversight in either commit 5f4d98d4 or
commit 8a510275. It is a direct follow-up to today's commit f09816a0.
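
To make the NULL row member rules concrete, consider this sketch
(hypothetical table "t" indexed on (a, b, c)), which mirrors an example
from the code comments added below:

    -- No "a = 1" tuple can ever satisfy this qual (its NULL row member
    -- never matches), so a forwards scan positions itself with "a > 1":
    SELECT * FROM t WHERE (a, b, c) >= (1, NULL, 77);
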
Author: Peter Geoghegan Reviewed-By: Heikki Linnakangas Discussion: https://postgr.es/m/CAH2-Wz=pcijHL_mA0_TJ5LiTB28QpQ0cGtT-ccFV=KzuunNDDQ@mail.gmail.com Backpatch-through: 18 --- src/backend/access/nbtree/nbtpreprocesskeys.c | 19 +- src/backend/access/nbtree/nbtsearch.c | 245 ++++++++++-------- src/backend/access/nbtree/nbtutils.c | 157 ++++++----- src/test/regress/expected/btree_index.out | 101 ++++++-- src/test/regress/sql/btree_index.sql | 65 ++++- 5 files changed, 385 insertions(+), 202 deletions(-) diff --git a/src/backend/access/nbtree/nbtpreprocesskeys.c b/src/backend/access/nbtree/nbtpreprocesskeys.c index 36813a96fff57..8eb4bb8410ea2 100644 --- a/src/backend/access/nbtree/nbtpreprocesskeys.c +++ b/src/backend/access/nbtree/nbtpreprocesskeys.c @@ -792,12 +792,25 @@ _bt_mark_scankey_required(ScanKey skey) if (skey->sk_flags & SK_ROW_HEADER) { ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument); + AttrNumber attno = skey->sk_attno; /* First subkey should be same column/operator as the header */ - Assert(subkey->sk_flags & SK_ROW_MEMBER); - Assert(subkey->sk_attno == skey->sk_attno); + Assert(subkey->sk_attno == attno); Assert(subkey->sk_strategy == skey->sk_strategy); - subkey->sk_flags |= addflags; + + for (;;) + { + Assert(subkey->sk_flags & SK_ROW_MEMBER); + if (subkey->sk_attno != attno) + break; /* non-adjacent key, so not required */ + if (subkey->sk_strategy != skey->sk_strategy) + break; /* wrong direction, so not required */ + subkey->sk_flags |= addflags; + if (subkey->sk_flags & SK_ROW_END) + break; + subkey++; + attno++; + } } } diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 9846ef6db53ae..4af1ff1e9e5e3 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -1016,8 +1016,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * traversing a lot of null entries at the start of the scan. * * In this loop, row-comparison keys are treated the same as keys on their - * first (leftmost) columns. We'll add on lower-order columns of the row - * comparison below, if possible. + * first (leftmost) columns. We'll add all lower-order columns of the row + * comparison that were marked required during preprocessing below. * * _bt_advance_array_keys needs to know exactly how we'll reposition the * scan (should it opt to schedule another primitive index scan). It is @@ -1261,16 +1261,18 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) Assert(keysz <= INDEX_MAX_KEYS); for (int i = 0; i < keysz; i++) { - ScanKey cur = startKeys[i]; + ScanKey bkey = startKeys[i]; - Assert(cur->sk_attno == i + 1); + Assert(bkey->sk_attno == i + 1); - if (cur->sk_flags & SK_ROW_HEADER) + if (bkey->sk_flags & SK_ROW_HEADER) { /* * Row comparison header: look to the first row member instead */ - ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument); + ScanKey subkey = (ScanKey) DatumGetPointer(bkey->sk_argument); + bool loosen_strat = false, + tighten_strat = false; /* * Cannot be a NULL in the first row member: _bt_preprocess_keys @@ -1278,9 +1280,18 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * ever getting this far */ Assert(subkey->sk_flags & SK_ROW_MEMBER); - Assert(subkey->sk_attno == cur->sk_attno); + Assert(subkey->sk_attno == bkey->sk_attno); Assert(!(subkey->sk_flags & SK_ISNULL)); + /* + * This is either a > or >= key (during backwards scans it is + * either < or <=) that was marked required during preprocessing. 
+ * Later so->keyData[] keys can't have been marked required, so + * our row compare header key must be the final startKeys[] entry. + */ + Assert(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)); + Assert(i == keysz - 1); + /* * The member scankeys are already in insertion format (ie, they * have sk_func = 3-way-comparison function) @@ -1288,112 +1299,141 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData)); /* - * If the row comparison is the last positioning key we accepted, - * try to add additional keys from the lower-order row members. - * (If we accepted independent conditions on additional index - * columns, we use those instead --- doesn't seem worth trying to - * determine which is more restrictive.) Note that this is OK - * even if the row comparison is of ">" or "<" type, because the - * condition applied to all but the last row member is effectively - * ">=" or "<=", and so the extra keys don't break the positioning - * scheme. But, by the same token, if we aren't able to use all - * the row members, then the part of the row comparison that we - * did use has to be treated as just a ">=" or "<=" condition, and - * so we'd better adjust strat_total accordingly. + * Now look to later row compare members. + * + * If there's an "index attribute gap" between two row compare + * members, the second member won't have been marked required, and + * so can't be used as a starting boundary key here. The part of + * the row comparison that we do still use has to be treated as a + * ">=" or "<=" condition. For example, a qual "(a, c) > (1, 42)" + * with an omitted intervening index attribute "b" will use an + * insertion scan key "a >= 1". Even the first "a = 1" tuple on + * the leaf level might satisfy the row compare qual. + * + * We're able to use a _more_ restrictive strategy when we reach a + * NULL row compare member, since they're always unsatisfiable. + * For example, a qual "(a, b, c) >= (1, NULL, 77)" will use an + * insertion scan key "a > 1". All tuples where "a = 1" cannot + * possibly satisfy the row compare qual, so this is safe. */ - if (i == keysz - 1) + Assert(!(subkey->sk_flags & SK_ROW_END)); + for (;;) { - bool used_all_subkeys = false; + subkey++; + Assert(subkey->sk_flags & SK_ROW_MEMBER); - Assert(!(subkey->sk_flags & SK_ROW_END)); - for (;;) + if (subkey->sk_flags & SK_ISNULL) { - subkey++; - Assert(subkey->sk_flags & SK_ROW_MEMBER); - if (subkey->sk_attno != keysz + 1) - break; /* out-of-sequence, can't use it */ - if (subkey->sk_strategy != cur->sk_strategy) - break; /* wrong direction, can't use it */ - if (subkey->sk_flags & SK_ISNULL) - break; /* can't use null keys */ - Assert(keysz < INDEX_MAX_KEYS); - memcpy(inskey.scankeys + keysz, subkey, - sizeof(ScanKeyData)); - keysz++; - if (subkey->sk_flags & SK_ROW_END) - { - used_all_subkeys = true; - break; - } + /* + * NULL member key, can only use earlier keys. + * + * We deliberately avoid checking if this key is marked + * required. All earlier keys are required, and this key + * is unsatisfiable either way, so we can't miss anything. 
+ */ + tighten_strat = true; + break; } - if (!used_all_subkeys) + + if (!(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) { - switch (strat_total) - { - case BTLessStrategyNumber: - strat_total = BTLessEqualStrategyNumber; - break; - case BTGreaterStrategyNumber: - strat_total = BTGreaterEqualStrategyNumber; - break; - } + /* nonrequired member key, can only use earlier keys */ + loosen_strat = true; + break; } - break; /* done with outer loop */ + + Assert(subkey->sk_attno == keysz + 1); + Assert(subkey->sk_strategy == bkey->sk_strategy); + Assert(keysz < INDEX_MAX_KEYS); + + memcpy(inskey.scankeys + keysz, subkey, + sizeof(ScanKeyData)); + keysz++; + if (subkey->sk_flags & SK_ROW_END) + break; } - } - else - { - /* - * Ordinary comparison key. Transform the search-style scan key - * to an insertion scan key by replacing the sk_func with the - * appropriate btree comparison function. - * - * If scankey operator is not a cross-type comparison, we can use - * the cached comparison function; otherwise gotta look it up in - * the catalogs. (That can't lead to infinite recursion, since no - * indexscan initiated by syscache lookup will use cross-data-type - * operators.) - * - * We support the convention that sk_subtype == InvalidOid means - * the opclass input type; this is a hack to simplify life for - * ScanKeyInit(). - */ - if (cur->sk_subtype == rel->rd_opcintype[i] || - cur->sk_subtype == InvalidOid) + Assert(!(loosen_strat && tighten_strat)); + if (loosen_strat) { - FmgrInfo *procinfo; - - procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC); - ScanKeyEntryInitializeWithInfo(inskey.scankeys + i, - cur->sk_flags, - cur->sk_attno, - InvalidStrategy, - cur->sk_subtype, - cur->sk_collation, - procinfo, - cur->sk_argument); + /* Use less restrictive strategy (and fewer member keys) */ + switch (strat_total) + { + case BTLessStrategyNumber: + strat_total = BTLessEqualStrategyNumber; + break; + case BTGreaterStrategyNumber: + strat_total = BTGreaterEqualStrategyNumber; + break; + } } - else + if (tighten_strat) { - RegProcedure cmp_proc; - - cmp_proc = get_opfamily_proc(rel->rd_opfamily[i], - rel->rd_opcintype[i], - cur->sk_subtype, - BTORDER_PROC); - if (!RegProcedureIsValid(cmp_proc)) - elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"", - BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype, - cur->sk_attno, RelationGetRelationName(rel)); - ScanKeyEntryInitialize(inskey.scankeys + i, - cur->sk_flags, - cur->sk_attno, - InvalidStrategy, - cur->sk_subtype, - cur->sk_collation, - cmp_proc, - cur->sk_argument); + /* Use more restrictive strategy (and fewer member keys) */ + switch (strat_total) + { + case BTLessEqualStrategyNumber: + strat_total = BTLessStrategyNumber; + break; + case BTGreaterEqualStrategyNumber: + strat_total = BTGreaterStrategyNumber; + break; + } } + + /* done adding to inskey (row comparison keys always come last) */ + break; + } + + /* + * Ordinary comparison key/search-style key. + * + * Transform the search-style scan key to an insertion scan key by + * replacing the sk_func with the appropriate btree 3-way-comparison + * function. + * + * If scankey operator is not a cross-type comparison, we can use the + * cached comparison function; otherwise gotta look it up in the + * catalogs. (That can't lead to infinite recursion, since no + * indexscan initiated by syscache lookup will use cross-data-type + * operators.) 
+ * + * We support the convention that sk_subtype == InvalidOid means the + * opclass input type; this hack simplifies life for ScanKeyInit(). + */ + if (bkey->sk_subtype == rel->rd_opcintype[i] || + bkey->sk_subtype == InvalidOid) + { + FmgrInfo *procinfo; + + procinfo = index_getprocinfo(rel, bkey->sk_attno, BTORDER_PROC); + ScanKeyEntryInitializeWithInfo(inskey.scankeys + i, + bkey->sk_flags, + bkey->sk_attno, + InvalidStrategy, + bkey->sk_subtype, + bkey->sk_collation, + procinfo, + bkey->sk_argument); + } + else + { + RegProcedure cmp_proc; + + cmp_proc = get_opfamily_proc(rel->rd_opfamily[i], + rel->rd_opcintype[i], + bkey->sk_subtype, BTORDER_PROC); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"", + BTORDER_PROC, rel->rd_opcintype[i], bkey->sk_subtype, + bkey->sk_attno, RelationGetRelationName(rel)); + ScanKeyEntryInitialize(inskey.scankeys + i, + bkey->sk_flags, + bkey->sk_attno, + InvalidStrategy, + bkey->sk_subtype, + bkey->sk_collation, + cmp_proc, + bkey->sk_argument); } } @@ -1482,6 +1522,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) if (!BufferIsValid(so->currPos.buf)) { + Assert(!so->needPrimScan); + /* * We only get here if the index is completely empty. Lock relation * because nothing finer to lock exists. Without a buffer lock, it's @@ -1500,7 +1542,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) if (!BufferIsValid(so->currPos.buf)) { - Assert(!so->needPrimScan); _bt_parallel_done(scan); return false; } diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index eb6dbfda33c6e..9aed207995f52 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -2442,32 +2442,8 @@ _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate) } if (key->sk_flags & SK_ROW_HEADER) { - /* - * RowCompare inequality. - * - * Only the first subkey from a RowCompare can ever be marked - * required (that happens when the row header is marked required). - * There is no simple, general way for us to transitively deduce - * whether or not every tuple on the page satisfies a RowCompare - * key based only on firsttup and lasttup -- so we just give up. - */ - if (!start_past_saop_eq && !so->skipScan) - break; /* unsafe to go further */ - - /* - * We have to be even more careful with RowCompares that come - * after an array: we assume it's unsafe to even bypass the array. - * Calling _bt_start_array_keys to recover the scan's arrays - * following use of forcenonrequired mode isn't compatible with - * _bt_check_rowcompare's continuescan=false behavior with NULL - * row compare members. _bt_advance_array_keys must not make a - * decision on the basis of a key not being satisfied in the - * opposite-to-scan direction until the scan reaches a leaf page - * where the same key begins to be satisfied in scan direction. - * The _bt_first !used_all_subkeys behavior makes this limitation - * hard to work around some other way. 
- */ - return; /* completely unsafe to set pstate.startikey */ + /* RowCompare inequalities currently aren't supported */ + break; /* "unsafe" */ } if (key->sk_strategy != BTEqualStrategyNumber) { @@ -2964,6 +2940,31 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, Assert(subkey->sk_flags & SK_ROW_MEMBER); + /* When a NULL row member is compared, the row never matches */ + if (subkey->sk_flags & SK_ISNULL) + { + /* + * Unlike the simple-scankey case, this isn't a disallowed case + * (except when it's the first row element that has the NULL arg). + * But it can never match. If all the earlier row comparison + * columns are required for the scan direction, we can stop the + * scan, because there can't be another tuple that will succeed. + */ + Assert(subkey != (ScanKey) DatumGetPointer(skey->sk_argument)); + subkey--; + if (forcenonrequired) + { + /* treating scan's keys as non-required */ + } + else if ((subkey->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) + *continuescan = false; + else if ((subkey->sk_flags & SK_BT_REQBKWD) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + return false; + } + if (subkey->sk_attno > tupnatts) { /* @@ -2973,11 +2974,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, * attribute passes the qual. */ Assert(BTreeTupleIsPivot(tuple)); - cmpresult = 0; - if (subkey->sk_flags & SK_ROW_END) - break; - subkey++; - continue; + return true; } datum = index_getattr(tuple, @@ -2987,6 +2984,8 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, if (isNull) { + int reqflags; + if (forcenonrequired) { /* treating scan's keys as non-required */ @@ -2997,15 +2996,35 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, * Since NULLs are sorted before non-NULLs, we know we have * reached the lower limit of the range of values for this * index attr. On a backward scan, we can stop if this qual - * is one of the "must match" subset. We can stop regardless - * of whether the qual is > or <, so long as it's required, - * because it's not possible for any future tuples to pass. On - * a forward scan, however, we must keep going, because we may - * have initially positioned to the start of the index. - * (_bt_advance_array_keys also relies on this behavior during - * forward scans.) + * is one of the "must match" subset. However, on a forwards + * scan, we must keep going, because we may have initially + * positioned to the start of the index. + * + * All required NULLS FIRST > row members can use NULL tuple + * values to end backwards scans, just like with other values. + * A qual "WHERE (a, b, c) > (9, 42, 'foo')" can terminate a + * backwards scan upon reaching the index's rightmost "a = 9" + * tuple whose "b" column contains a NULL (if not sooner). + * Since "b" is NULLS FIRST, we can treat its NULLs as "<" 42. + */ + reqflags = SK_BT_REQBKWD; + + /* + * When a most significant required NULLS FIRST < row compare + * member sees NULL tuple values during a backwards scan, it + * signals the end of matches for the whole row compare/scan. + * A qual "WHERE (a, b, c) < (9, 42, 'foo')" will terminate a + * backwards scan upon reaching the rightmost tuple whose "a" + * column has a NULL. The "a" NULL value is "<" 9, and yet + * our < row compare will still end the scan. (This isn't + * safe with later/lower-order row members. Notice that it + * can only happen with an "a" NULL some time after the scan + * completely stops needing to use its "b" and "c" members.) 
*/ - if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + if (subkey == (ScanKey) DatumGetPointer(skey->sk_argument)) + reqflags |= SK_BT_REQFWD; /* safe, first row member */ + + if ((subkey->sk_flags & reqflags) && ScanDirectionIsBackward(dir)) *continuescan = false; } @@ -3015,15 +3034,35 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, * Since NULLs are sorted after non-NULLs, we know we have * reached the upper limit of the range of values for this * index attr. On a forward scan, we can stop if this qual is - * one of the "must match" subset. We can stop regardless of - * whether the qual is > or <, so long as it's required, - * because it's not possible for any future tuples to pass. On - * a backward scan, however, we must keep going, because we - * may have initially positioned to the end of the index. - * (_bt_advance_array_keys also relies on this behavior during - * backward scans.) + * one of the "must match" subset. However, on a backward + * scan, we must keep going, because we may have initially + * positioned to the end of the index. + * + * All required NULLS LAST < row members can use NULL tuple + * values to end forwards scans, just like with other values. + * A qual "WHERE (a, b, c) < (9, 42, 'foo')" can terminate a + * forwards scan upon reaching the index's leftmost "a = 9" + * tuple whose "b" column contains a NULL (if not sooner). + * Since "b" is NULLS LAST, we can treat its NULLs as ">" 42. + */ + reqflags = SK_BT_REQFWD; + + /* + * When a most significant required NULLS LAST > row compare + * member sees NULL tuple values during a forwards scan, it + * signals the end of matches for the whole row compare/scan. + * A qual "WHERE (a, b, c) > (9, 42, 'foo')" will terminate a + * forwards scan upon reaching the leftmost tuple whose "a" + * column has a NULL. The "a" NULL value is ">" 9, and yet + * our > row compare will end the scan. (This isn't safe with + * later/lower-order row members. Notice that it can only + * happen with an "a" NULL some time after the scan completely + * stops needing to use its "b" and "c" members.) */ - if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + if (subkey == (ScanKey) DatumGetPointer(skey->sk_argument)) + reqflags |= SK_BT_REQBKWD; /* safe, first row member */ + + if ((subkey->sk_flags & reqflags) && ScanDirectionIsForward(dir)) *continuescan = false; } @@ -3034,30 +3073,6 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, return false; } - if (subkey->sk_flags & SK_ISNULL) - { - /* - * Unlike the simple-scankey case, this isn't a disallowed case - * (except when it's the first row element that has the NULL arg). - * But it can never match. If all the earlier row comparison - * columns are required for the scan direction, we can stop the - * scan, because there can't be another tuple that will succeed. 
- */ - Assert(subkey != (ScanKey) DatumGetPointer(skey->sk_argument)); - subkey--; - if (forcenonrequired) - { - /* treating scan's keys as non-required */ - } - else if ((subkey->sk_flags & SK_BT_REQFWD) && - ScanDirectionIsForward(dir)) - *continuescan = false; - else if ((subkey->sk_flags & SK_BT_REQBKWD) && - ScanDirectionIsBackward(dir)) - *continuescan = false; - return false; - } - /* Perform the test --- three-way comparison not bool operator */ cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func, subkey->sk_collation, diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out index bfb1a286ea4ad..21dc9b5783a7c 100644 --- a/src/test/regress/expected/btree_index.out +++ b/src/test/regress/expected/btree_index.out @@ -195,54 +195,123 @@ ORDER BY proname DESC, proargtypes DESC, pronamespace DESC LIMIT 1; (1 row) -- --- Add coverage for RowCompare quals whose rhs row has a NULL that ends scan +-- Forwards scan RowCompare qual whose row arg has a NULL that affects our +-- initial positioning strategy -- explain (costs off) SELECT proname, proargtypes, pronamespace FROM pg_proc - WHERE proname = 'abs' AND (proname, proargtypes) < ('abs', NULL) + WHERE (proname, proargtypes) >= ('abs', NULL) AND proname <= 'abs' ORDER BY proname, proargtypes, pronamespace; - QUERY PLAN -------------------------------------------------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- Index Only Scan using pg_proc_proname_args_nsp_index on pg_proc - Index Cond: ((ROW(proname, proargtypes) < ROW('abs'::name, NULL::oidvector)) AND (proname = 'abs'::name)) + Index Cond: ((ROW(proname, proargtypes) >= ROW('abs'::name, NULL::oidvector)) AND (proname <= 'abs'::name)) (2 rows) SELECT proname, proargtypes, pronamespace FROM pg_proc - WHERE proname = 'abs' AND (proname, proargtypes) < ('abs', NULL) + WHERE (proname, proargtypes) >= ('abs', NULL) AND proname <= 'abs' ORDER BY proname, proargtypes, pronamespace; proname | proargtypes | pronamespace ---------+-------------+-------------- (0 rows) -- --- Add coverage for backwards scan RowCompare quals whose rhs row has a NULL --- that ends scan +-- Forwards scan RowCompare quals whose row arg has a NULL that ends scan -- explain (costs off) SELECT proname, proargtypes, pronamespace FROM pg_proc - WHERE proname = 'abs' AND (proname, proargtypes) > ('abs', NULL) + WHERE proname >= 'abs' AND (proname, proargtypes) < ('abs', NULL) +ORDER BY proname, proargtypes, pronamespace; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Index Only Scan using pg_proc_proname_args_nsp_index on pg_proc + Index Cond: ((proname >= 'abs'::name) AND (ROW(proname, proargtypes) < ROW('abs'::name, NULL::oidvector))) +(2 rows) + +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE proname >= 'abs' AND (proname, proargtypes) < ('abs', NULL) +ORDER BY proname, proargtypes, pronamespace; + proname | proargtypes | pronamespace +---------+-------------+-------------- +(0 rows) + +-- +-- Backwards scan RowCompare qual whose row arg has a NULL that affects our +-- initial positioning strategy +-- +explain (costs off) +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE proname >= 'abs' AND (proname, proargtypes) <= ('abs', NULL) +ORDER BY proname DESC, proargtypes DESC, pronamespace DESC; + QUERY PLAN 
+--------------------------------------------------------------------------------------------------------------- + Index Only Scan Backward using pg_proc_proname_args_nsp_index on pg_proc + Index Cond: ((proname >= 'abs'::name) AND (ROW(proname, proargtypes) <= ROW('abs'::name, NULL::oidvector))) +(2 rows) + +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE proname >= 'abs' AND (proname, proargtypes) <= ('abs', NULL) +ORDER BY proname DESC, proargtypes DESC, pronamespace DESC; + proname | proargtypes | pronamespace +---------+-------------+-------------- +(0 rows) + +-- +-- Backwards scan RowCompare qual whose row arg has a NULL that ends scan +-- +explain (costs off) +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE (proname, proargtypes) > ('abs', NULL) AND proname <= 'abs' ORDER BY proname DESC, proargtypes DESC, pronamespace DESC; - QUERY PLAN -------------------------------------------------------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- Index Only Scan Backward using pg_proc_proname_args_nsp_index on pg_proc - Index Cond: ((ROW(proname, proargtypes) > ROW('abs'::name, NULL::oidvector)) AND (proname = 'abs'::name)) + Index Cond: ((ROW(proname, proargtypes) > ROW('abs'::name, NULL::oidvector)) AND (proname <= 'abs'::name)) (2 rows) SELECT proname, proargtypes, pronamespace FROM pg_proc - WHERE proname = 'abs' AND (proname, proargtypes) > ('abs', NULL) + WHERE (proname, proargtypes) > ('abs', NULL) AND proname <= 'abs' ORDER BY proname DESC, proargtypes DESC, pronamespace DESC; proname | proargtypes | pronamespace ---------+-------------+-------------- (0 rows) +-- Makes B-Tree preprocessing deal with unmarking redundant keys that were +-- initially marked required (test case relies on current row compare +-- preprocessing limitations) +explain (costs off) +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE proname = 'zzzzzz' AND (proname, proargtypes) > ('abs', NULL) + AND pronamespace IN (1, 2, 3) AND proargtypes IN ('26 23', '5077') +ORDER BY proname, proargtypes, pronamespace; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Index Only Scan using pg_proc_proname_args_nsp_index on pg_proc + Index Cond: ((ROW(proname, proargtypes) > ROW('abs'::name, NULL::oidvector)) AND (proname = 'zzzzzz'::name) AND (proargtypes = ANY ('{"26 23",5077}'::oidvector[])) AND (pronamespace = ANY ('{1,2,3}'::oid[]))) +(2 rows) + +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE proname = 'zzzzzz' AND (proname, proargtypes) > ('abs', NULL) + AND pronamespace IN (1, 2, 3) AND proargtypes IN ('26 23', '5077') +ORDER BY proname, proargtypes, pronamespace; + proname | proargtypes | pronamespace +---------+-------------+-------------- +(0 rows) + -- --- Add coverage for recheck of > key following array advancement on previous --- (left sibling) page that used a high key whose attribute value corresponding --- to the > key was -inf (due to being truncated when the high key was created). +-- Performs a recheck of > key following array advancement on previous (left +-- sibling) page that used a high key whose attribute value corresponding to +-- the > key was -inf (due to being truncated when the high key was created). 
-- -- XXX This relies on the assumption that tenk1_thous_tenthous has a truncated -- high key "(183, -inf)" on the first page that we'll scan. The test will only diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql index 68c61dbc7d19c..6aaaa386abcec 100644 --- a/src/test/regress/sql/btree_index.sql +++ b/src/test/regress/sql/btree_index.sql @@ -143,38 +143,83 @@ SELECT proname, proargtypes, pronamespace ORDER BY proname DESC, proargtypes DESC, pronamespace DESC LIMIT 1; -- --- Add coverage for RowCompare quals whose rhs row has a NULL that ends scan +-- Forwards scan RowCompare qual whose row arg has a NULL that affects our +-- initial positioning strategy -- explain (costs off) SELECT proname, proargtypes, pronamespace FROM pg_proc - WHERE proname = 'abs' AND (proname, proargtypes) < ('abs', NULL) + WHERE (proname, proargtypes) >= ('abs', NULL) AND proname <= 'abs' ORDER BY proname, proargtypes, pronamespace; SELECT proname, proargtypes, pronamespace FROM pg_proc - WHERE proname = 'abs' AND (proname, proargtypes) < ('abs', NULL) + WHERE (proname, proargtypes) >= ('abs', NULL) AND proname <= 'abs' ORDER BY proname, proargtypes, pronamespace; -- --- Add coverage for backwards scan RowCompare quals whose rhs row has a NULL --- that ends scan +-- Forwards scan RowCompare quals whose row arg has a NULL that ends scan -- explain (costs off) SELECT proname, proargtypes, pronamespace FROM pg_proc - WHERE proname = 'abs' AND (proname, proargtypes) > ('abs', NULL) + WHERE proname >= 'abs' AND (proname, proargtypes) < ('abs', NULL) +ORDER BY proname, proargtypes, pronamespace; + +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE proname >= 'abs' AND (proname, proargtypes) < ('abs', NULL) +ORDER BY proname, proargtypes, pronamespace; + +-- +-- Backwards scan RowCompare qual whose row arg has a NULL that affects our +-- initial positioning strategy +-- +explain (costs off) +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE proname >= 'abs' AND (proname, proargtypes) <= ('abs', NULL) ORDER BY proname DESC, proargtypes DESC, pronamespace DESC; SELECT proname, proargtypes, pronamespace FROM pg_proc - WHERE proname = 'abs' AND (proname, proargtypes) > ('abs', NULL) + WHERE proname >= 'abs' AND (proname, proargtypes) <= ('abs', NULL) ORDER BY proname DESC, proargtypes DESC, pronamespace DESC; -- --- Add coverage for recheck of > key following array advancement on previous --- (left sibling) page that used a high key whose attribute value corresponding --- to the > key was -inf (due to being truncated when the high key was created). 
+-- Backwards scan RowCompare qual whose row arg has a NULL that ends scan +-- +explain (costs off) +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE (proname, proargtypes) > ('abs', NULL) AND proname <= 'abs' +ORDER BY proname DESC, proargtypes DESC, pronamespace DESC; + +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE (proname, proargtypes) > ('abs', NULL) AND proname <= 'abs' +ORDER BY proname DESC, proargtypes DESC, pronamespace DESC; + +-- Makes B-Tree preprocessing deal with unmarking redundant keys that were +-- initially marked required (test case relies on current row compare +-- preprocessing limitations) +explain (costs off) +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE proname = 'zzzzzz' AND (proname, proargtypes) > ('abs', NULL) + AND pronamespace IN (1, 2, 3) AND proargtypes IN ('26 23', '5077') +ORDER BY proname, proargtypes, pronamespace; + +SELECT proname, proargtypes, pronamespace + FROM pg_proc + WHERE proname = 'zzzzzz' AND (proname, proargtypes) > ('abs', NULL) + AND pronamespace IN (1, 2, 3) AND proargtypes IN ('26 23', '5077') +ORDER BY proname, proargtypes, pronamespace; + +-- +-- Performs a recheck of > key following array advancement on previous (left +-- sibling) page that used a high key whose attribute value corresponding to +-- the > key was -inf (due to being truncated when the high key was created). -- -- XXX This relies on the assumption that tenk1_thous_tenthous has a truncated -- high key "(183, -inf)" on the first page that we'll scan. The test will only From e16c9cd331314fcf3f7a8d9e3e20aaee448162e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Wed, 2 Jul 2025 17:02:27 +0200 Subject: [PATCH 022/138] Fix error message for ALTER CONSTRAINT ... NOT VALID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trying to alter a constraint so that it becomes NOT VALID results in an error that assumes the constraint is a foreign key. This is potentially wrong, so give a more generic error message. While at it, give CREATE CONSTRAINT TRIGGER a better error message as well. 
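
For example (taken from the foreign_key regression test updated below):

    ALTER TABLE fktable ALTER CONSTRAINT fktable_fk_fkey NOT VALID;
    ERROR: constraints cannot be altered to be NOT VALID
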
Co-authored-by: jian he Co-authored-by: Fujii Masao Co-authored-by: Álvaro Herrera Co-authored-by: Amul Sul Discussion: https://postgr.es/m/CACJufxHSp2puxP=q8ZtUGL1F+heapnzqFBZy5ZNGUjUgwjBqTQ@mail.gmail.com --- src/backend/parser/gram.y | 6 ++++++ src/test/regress/expected/constraints.out | 5 +++++ src/test/regress/expected/foreign_key.out | 2 +- src/test/regress/sql/constraints.sql | 3 +++ 4 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 50f53159d5819..03c80eaaf22a7 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -2668,6 +2668,12 @@ alter_table_cmd: c->alterDeferrability = true; if ($4 & CAS_NO_INHERIT) c->alterInheritability = true; + /* handle unsupported case with specific error message */ + if ($4 & CAS_NOT_VALID) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("constraints cannot be altered to be NOT VALID"), + parser_errposition(@4)); processCASbits($4, @4, "FOREIGN KEY", &c->deferrable, &c->initdeferred, diff --git a/src/test/regress/expected/constraints.out b/src/test/regress/expected/constraints.out index b5592617d9755..ccea883cffd65 100644 --- a/src/test/regress/expected/constraints.out +++ b/src/test/regress/expected/constraints.out @@ -748,6 +748,11 @@ ALTER TABLE unique_tbl ALTER CONSTRAINT unique_tbl_i_key ENFORCED; ERROR: cannot alter enforceability of constraint "unique_tbl_i_key" of relation "unique_tbl" ALTER TABLE unique_tbl ALTER CONSTRAINT unique_tbl_i_key NOT ENFORCED; ERROR: cannot alter enforceability of constraint "unique_tbl_i_key" of relation "unique_tbl" +-- can't make an existing constraint NOT VALID +ALTER TABLE unique_tbl ALTER CONSTRAINT unique_tbl_i_key NOT VALID; +ERROR: constraints cannot be altered to be NOT VALID +LINE 1: ...ABLE unique_tbl ALTER CONSTRAINT unique_tbl_i_key NOT VALID; + ^ DROP TABLE unique_tbl; -- -- EXCLUDE constraints diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out index 6a8f395934520..f9bd252444f53 100644 --- a/src/test/regress/expected/foreign_key.out +++ b/src/test/regress/expected/foreign_key.out @@ -1359,7 +1359,7 @@ LINE 1: ...e ALTER CONSTRAINT fktable_fk_fkey NOT DEFERRABLE INITIALLY ... ALTER TABLE fktable ALTER CONSTRAINT fktable_fk_fkey NO INHERIT; ERROR: constraint "fktable_fk_fkey" of relation "fktable" is not a not-null constraint ALTER TABLE fktable ALTER CONSTRAINT fktable_fk_fkey NOT VALID; -ERROR: FOREIGN KEY constraints cannot be marked NOT VALID +ERROR: constraints cannot be altered to be NOT VALID LINE 1: ...ER TABLE fktable ALTER CONSTRAINT fktable_fk_fkey NOT VALID; ^ ALTER TABLE fktable ALTER CONSTRAINT fktable_fk_fkey ENFORCED NOT ENFORCED; diff --git a/src/test/regress/sql/constraints.sql b/src/test/regress/sql/constraints.sql index 12668f0e0ce0f..7487723ab8437 100644 --- a/src/test/regress/sql/constraints.sql +++ b/src/test/regress/sql/constraints.sql @@ -537,6 +537,9 @@ CREATE TABLE UNIQUE_NOTEN_TBL(i int UNIQUE NOT ENFORCED); ALTER TABLE unique_tbl ALTER CONSTRAINT unique_tbl_i_key ENFORCED; ALTER TABLE unique_tbl ALTER CONSTRAINT unique_tbl_i_key NOT ENFORCED; +-- can't make an existing constraint NOT VALID +ALTER TABLE unique_tbl ALTER CONSTRAINT unique_tbl_i_key NOT VALID; + DROP TABLE unique_tbl; -- From 4938737d5452fc4975f985a0a6faf43a360ef021 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Wed, 2 Jul 2025 12:36:34 -0400 Subject: [PATCH 023/138] Update obsolete row compare preprocessing comments. 
Restore nbtree preprocessing comments describing how we mark nbtree row compare members required to how they were prior to 2016 bugfix commit a298a1e0. Oversight in commit bd3f59fd, which made nbtree preprocessing revert to the original 2006 rules, but neglected to revert these comments. Backpatch-through: 18 --- src/backend/access/nbtree/nbtpreprocesskeys.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/backend/access/nbtree/nbtpreprocesskeys.c b/src/backend/access/nbtree/nbtpreprocesskeys.c index 8eb4bb8410ea2..21c519cd108ed 100644 --- a/src/backend/access/nbtree/nbtpreprocesskeys.c +++ b/src/backend/access/nbtree/nbtpreprocesskeys.c @@ -752,9 +752,12 @@ _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption) * * Depending on the operator type, the key may be required for both scan * directions or just one. Also, if the key is a row comparison header, - * we have to mark its first subsidiary ScanKey as required. (Subsequent - * subsidiary ScanKeys are normally for lower-order columns, and thus - * cannot be required, since they're after the first non-equality scankey.) + * we have to mark the appropriate subsidiary ScanKeys as required. In such + * cases, the first subsidiary key is required, but subsequent ones are + * required only as long as they correspond to successive index columns and + * match the leading column as to sort direction. Otherwise the row + * comparison ordering is different from the index ordering and so we can't + * stop the scan on the basis of those lower-order columns. * * Note: when we set required-key flag bits in a subsidiary scankey, we are * scribbling on a data structure belonging to the index AM's caller, not on From 5d0800000ed5e4fb5ed010bb4b93f966e08b9fb3 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 2 Jul 2025 15:47:59 -0400 Subject: [PATCH 024/138] Correctly copy the target host identification in PQcancelCreate. PQcancelCreate failed to copy struct pg_conn_host's "type" field, instead leaving it zero (a/k/a CHT_HOST_NAME). This seemingly has no great ill effects if it should have been CHT_UNIX_SOCKET instead, but if it should have been CHT_HOST_ADDRESS then a null-pointer dereference will occur when the cancelConn is used. Bug: #18974 Reported-by: Maxim Boguk Author: Sergei Kornilov Reviewed-by: Tom Lane Discussion: https://postgr.es/m/18974-575f02b2168b36b3@postgresql.org Backpatch-through: 17 --- src/interfaces/libpq/fe-cancel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/interfaces/libpq/fe-cancel.c b/src/interfaces/libpq/fe-cancel.c index cd3102346bfa7..65517c5703bca 100644 --- a/src/interfaces/libpq/fe-cancel.c +++ b/src/interfaces/libpq/fe-cancel.c @@ -137,6 +137,7 @@ PQcancelCreate(PGconn *conn) goto oom_error; originalHost = conn->connhost[conn->whichhost]; + cancelConn->connhost[0].type = originalHost.type; if (originalHost.host) { cancelConn->connhost[0].host = strdup(originalHost.host); From f0151e2a4ef13949b4402b9d7ee0f45209cc0126 Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Thu, 3 Jul 2025 16:03:19 +0900 Subject: [PATCH 025/138] doc: Remove incorrect note about wal_status in pg_replication_slots. The documentation previously stated that the wal_status column is NULL if restart_lsn is NULL in the pg_replication_slots view. This is incorrect, and wal_status can be "lost" even when restart_lsn is NULL. This commit removes the incorrect description. Back-patched to all supported versions. 
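As a sketch, the combination described above can be observed with a query like this (whether any row actually shows it depends on the slot's history):

    SELECT slot_name, restart_lsn, wal_status
      FROM pg_replication_slots;
    -- wal_status can read 'lost' even in rows where restart_lsn IS NULL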
Author: Fujii Masao Reviewed-by: Nisha Moond Discussion: https://postgr.es/m/c9d23cdc-b5dd-455a-8ee9-f1f24d701d89@oss.nttdata.com Backpatch-through: 13 --- doc/src/sgml/system-views.sgml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml index 986ae1f543dbd..82825db03bb2f 100644 --- a/doc/src/sgml/system-views.sgml +++ b/doc/src/sgml/system-views.sgml @@ -2832,8 +2832,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx The last two states are seen only when is - non-negative. If restart_lsn is NULL, this - field is null. + non-negative. From 8af310b331940ff5efd19402a3f8ee5521eb821a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Thu, 3 Jul 2025 11:46:12 +0200 Subject: [PATCH 026/138] Prevent creation of duplicate not-null constraints for domains MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This was previously harmless, but now that we create pg_constraint rows for those, duplicates are not welcome anymore. Backpatch to 18. Co-authored-by: jian he Co-authored-by: Álvaro Herrera Discussion: https://postgr.es/m/CACJufxFSC0mcQ82bSk58sO-WJY4P-o4N6RD2M0D=DD_u_6EzdQ@mail.gmail.com --- src/backend/commands/typecmds.c | 14 +++++++++++--- src/test/regress/expected/domain.out | 5 +++++ src/test/regress/sql/domain.sql | 3 +++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/backend/commands/typecmds.c b/src/backend/commands/typecmds.c index 45ae7472ab5ad..26d985193aea4 100644 --- a/src/backend/commands/typecmds.c +++ b/src/backend/commands/typecmds.c @@ -939,11 +939,19 @@ DefineDomain(ParseState *pstate, CreateDomainStmt *stmt) break; case CONSTR_NOTNULL: - if (nullDefined && !typNotNull) + if (nullDefined) + { + if (!typNotNull) + ereport(ERROR, + errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting NULL/NOT NULL constraints"), + parser_errposition(pstate, constr->location)); + ereport(ERROR, - errcode(ERRCODE_SYNTAX_ERROR), - errmsg("conflicting NULL/NOT NULL constraints"), + errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("redundant NOT NULL constraint definition"), parser_errposition(pstate, constr->location)); + } if (constr->is_no_inherit) ereport(ERROR, errcode(ERRCODE_INVALID_OBJECT_DEFINITION), diff --git a/src/test/regress/expected/domain.out b/src/test/regress/expected/domain.out index ba6f05eeb7df6..b5ea707df3103 100644 --- a/src/test/regress/expected/domain.out +++ b/src/test/regress/expected/domain.out @@ -1019,6 +1019,11 @@ insert into domain_test values (1, 2); -- should fail alter table domain_test add column c str_domain; ERROR: domain str_domain does not allow null values +-- disallow duplicated not-null constraints +create domain int_domain1 as int constraint nn1 not null constraint nn2 not null; +ERROR: redundant NOT NULL constraint definition +LINE 1: ...domain int_domain1 as int constraint nn1 not null constraint... 
+ ^ create domain str_domain2 as text check (value <> 'foo') default 'foo'; -- should fail alter table domain_test add column d str_domain2; diff --git a/src/test/regress/sql/domain.sql b/src/test/regress/sql/domain.sql index b752a63ab5f69..b8f5a6397121a 100644 --- a/src/test/regress/sql/domain.sql +++ b/src/test/regress/sql/domain.sql @@ -602,6 +602,9 @@ insert into domain_test values (1, 2); -- should fail alter table domain_test add column c str_domain; +-- disallow duplicated not-null constraints +create domain int_domain1 as int constraint nn1 not null constraint nn2 not null; + create domain str_domain2 as text check (value <> 'foo') default 'foo'; -- should fail From 0cd7fcaa857400fff28e31f0538d7824814c6863 Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Thu, 3 Jul 2025 23:07:23 +0900 Subject: [PATCH 027/138] doc: Update outdated descriptions of wal_status in pg_replication_slots. The documentation for pg_replication_slots previously mentioned only max_slot_wal_keep_size as a condition under which the wal_status column could show unreserved or lost. However, since commit be87200, replication slots can also be invalidated due to horizon or wal_level, and since commit ac0e33136ab, idle_replication_slot_timeout can also trigger this state. This commit updates the description of the wal_status column to reflect that max_slot_wal_keep_size is not the only cause of the lost state. Back-patched to v16, where the additional invalidation cases were introduced. Author: Fujii Masao Reviewed-by: Hayato Kuroda Reviewed-by: Nisha Moond Discussion: https://postgr.es/m/78b34e84-2195-4f28-a151-5d204a382fdd@oss.nttdata.com Backpatch-through: 16 --- doc/src/sgml/system-views.sgml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml index 82825db03bb2f..e1ac544ee4079 100644 --- a/doc/src/sgml/system-views.sgml +++ b/doc/src/sgml/system-views.sgml @@ -2819,20 +2819,18 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx unreserved means that the slot no longer retains the required WAL files and some of them are to be removed at - the next checkpoint. This state can return + the next checkpoint. This typically occurs when + is set to + a non-negative value. This state can return to reserved or extended. - lost means that some required WAL files have - been removed and this slot is no longer usable. + lost means that this slot is no longer usable. - The last two states are seen only when - is - non-negative. From 3d7a96871c71dc0e6d2614eb4a68179bc7e383db Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 3 Jul 2025 13:46:07 -0400 Subject: [PATCH 028/138] Obtain required table lock during cross-table updates, redux. Commits 8319e5cb5 et al missed the fact that ATPostAlterTypeCleanup contains three calls to ATPostAlterTypeParse, and the other two also need protection against passing a relid that we don't yet have lock on. Add similar logic to those code paths, and add some test cases demonstrating the need for it. In v18 and master, the test cases demonstrate that there's a behavioral discrepancy between stored generated columns and virtual generated columns: we disallow changing the expression of a stored column if it's used in any composite-type columns, but not that of a virtual column. Since the expression isn't actually relevant to either sort of composite-type usage, this prohibition seems unnecessary; but changing it is a matter for separate discussion. 
For now we are just documenting the existing behavior. Reported-by: jian he Author: jian he Reviewed-by: Tom Lane Discussion: CACJufxGKJtGNRRSXfwMW9SqVOPEMdP17BJ7DsBf=tNsv9pWU9g@mail.gmail.com Backpatch-through: 13 --- src/backend/commands/tablecmds.c | 22 +++++++++++++++++++ src/test/regress/expected/alter_table.out | 8 +++++++ .../regress/expected/generated_stored.out | 12 ++++++++++ .../regress/expected/generated_virtual.out | 9 ++++++++ src/test/regress/sql/alter_table.sql | 8 +++++++ src/test/regress/sql/generated_stored.sql | 13 +++++++++++ src/test/regress/sql/generated_virtual.sql | 13 +++++++++++ 7 files changed, 85 insertions(+) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index b8837f26cb4fd..011bb4acddb64 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -15488,6 +15488,14 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) Oid relid; relid = IndexGetRelation(oldId, false); + + /* + * As above, make sure we have lock on the index's table if it's not + * the same table. + */ + if (relid != tab->relid) + LockRelationOid(relid, AccessExclusiveLock); + ATPostAlterTypeParse(oldId, relid, InvalidOid, (char *) lfirst(def_item), wqueue, lockmode, tab->rewrite); @@ -15504,6 +15512,20 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) Oid relid; relid = StatisticsGetRelation(oldId, false); + + /* + * As above, make sure we have lock on the statistics object's table + * if it's not the same table. However, we take + * ShareUpdateExclusiveLock here, aligning with the lock level used in + * CreateStatistics and RemoveStatisticsById. + * + * CAUTION: this should be done after all cases that grab + * AccessExclusiveLock, else we risk causing deadlock due to needing + * to promote our table lock. 
+ */ + if (relid != tab->relid) + LockRelationOid(relid, ShareUpdateExclusiveLock); + ATPostAlterTypeParse(oldId, relid, InvalidOid, (char *) lfirst(def_item), wqueue, lockmode, tab->rewrite); diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 750efc042d8ee..08984dd98f168 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -4750,6 +4750,14 @@ create table attbl(a int); create table atref(b attbl check ((b).a is not null)); alter table attbl alter column a type numeric; -- someday this should work ERROR: cannot alter table "attbl" because column "atref.b" uses its row type +alter table atref drop constraint atref_b_check; +create statistics atref_stat on ((b).a is not null) from atref; +alter table attbl alter column a type numeric; -- someday this should work +ERROR: cannot alter table "attbl" because column "atref.b" uses its row type +drop statistics atref_stat; +create index atref_idx on atref (((b).a)); +alter table attbl alter column a type numeric; -- someday this should work +ERROR: cannot alter table "attbl" because column "atref.b" uses its row type drop table attbl, atref; /* End test case for bug #18970 */ -- Test that ALTER TABLE rewrite preserves a clustered index diff --git a/src/test/regress/expected/generated_stored.out b/src/test/regress/expected/generated_stored.out index 16de30ab1910b..adac2cedfb2a3 100644 --- a/src/test/regress/expected/generated_stored.out +++ b/src/test/regress/expected/generated_stored.out @@ -1313,6 +1313,18 @@ CREATE TABLE gtest31_1 (a int, b text GENERATED ALWAYS AS ('hello') STORED, c te CREATE TABLE gtest31_2 (x int, y gtest31_1); ALTER TABLE gtest31_1 ALTER COLUMN b TYPE varchar; -- fails ERROR: cannot alter table "gtest31_1" because column "gtest31_2.y" uses its row type +-- bug #18970: these cases are unsupported, but make sure they fail cleanly +ALTER TABLE gtest31_2 ADD CONSTRAINT cc CHECK ((y).b IS NOT NULL); +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello1'); +ERROR: cannot alter table "gtest31_1" because column "gtest31_2.y" uses its row type +ALTER TABLE gtest31_2 DROP CONSTRAINT cc; +CREATE STATISTICS gtest31_2_stat ON ((y).b is not null) FROM gtest31_2; +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello2'); +ERROR: cannot alter table "gtest31_1" because column "gtest31_2.y" uses its row type +DROP STATISTICS gtest31_2_stat; +CREATE INDEX gtest31_2_y_idx ON gtest31_2(((y).b)); +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello3'); +ERROR: cannot alter table "gtest31_1" because column "gtest31_2.y" uses its row type DROP TABLE gtest31_1, gtest31_2; -- Check it for a partitioned table, too CREATE TABLE gtest31_1 (a int, b text GENERATED ALWAYS AS ('hello') STORED, c text) PARTITION BY LIST (a); diff --git a/src/test/regress/expected/generated_virtual.out b/src/test/regress/expected/generated_virtual.out index df704b5166fa3..3b40e15a95ad0 100644 --- a/src/test/regress/expected/generated_virtual.out +++ b/src/test/regress/expected/generated_virtual.out @@ -1283,6 +1283,15 @@ CREATE TABLE gtest31_1 (a int, b text GENERATED ALWAYS AS ('hello') VIRTUAL, c t CREATE TABLE gtest31_2 (x int, y gtest31_1); ALTER TABLE gtest31_1 ALTER COLUMN b TYPE varchar; -- fails ERROR: cannot alter table "gtest31_1" because column "gtest31_2.y" uses its row type +-- bug #18970 +ALTER TABLE gtest31_2 ADD CONSTRAINT cc CHECK ((y).b IS NOT NULL); +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello1'); 
+ALTER TABLE gtest31_2 DROP CONSTRAINT cc; +CREATE STATISTICS gtest31_2_stat ON ((y).b is not null) FROM gtest31_2; +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello2'); +DROP STATISTICS gtest31_2_stat; +CREATE INDEX gtest31_2_y_idx ON gtest31_2(((y).b)); +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello3'); DROP TABLE gtest31_1, gtest31_2; -- Check it for a partitioned table, too CREATE TABLE gtest31_1 (a int, b text GENERATED ALWAYS AS ('hello') VIRTUAL, c text) PARTITION BY LIST (a); diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index 41cff198e183c..fc6e36d0e7882 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -3074,6 +3074,14 @@ drop table attbl, atref; create table attbl(a int); create table atref(b attbl check ((b).a is not null)); alter table attbl alter column a type numeric; -- someday this should work +alter table atref drop constraint atref_b_check; + +create statistics atref_stat on ((b).a is not null) from atref; +alter table attbl alter column a type numeric; -- someday this should work +drop statistics atref_stat; + +create index atref_idx on atref (((b).a)); +alter table attbl alter column a type numeric; -- someday this should work drop table attbl, atref; /* End test case for bug #18970 */ diff --git a/src/test/regress/sql/generated_stored.sql b/src/test/regress/sql/generated_stored.sql index 4ec155f2da989..f56fde8d4e5d0 100644 --- a/src/test/regress/sql/generated_stored.sql +++ b/src/test/regress/sql/generated_stored.sql @@ -595,6 +595,19 @@ ALTER TABLE gtest30_1 ALTER COLUMN b DROP EXPRESSION; -- error CREATE TABLE gtest31_1 (a int, b text GENERATED ALWAYS AS ('hello') STORED, c text); CREATE TABLE gtest31_2 (x int, y gtest31_1); ALTER TABLE gtest31_1 ALTER COLUMN b TYPE varchar; -- fails + +-- bug #18970: these cases are unsupported, but make sure they fail cleanly +ALTER TABLE gtest31_2 ADD CONSTRAINT cc CHECK ((y).b IS NOT NULL); +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello1'); +ALTER TABLE gtest31_2 DROP CONSTRAINT cc; + +CREATE STATISTICS gtest31_2_stat ON ((y).b is not null) FROM gtest31_2; +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello2'); +DROP STATISTICS gtest31_2_stat; + +CREATE INDEX gtest31_2_y_idx ON gtest31_2(((y).b)); +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello3'); + DROP TABLE gtest31_1, gtest31_2; -- Check it for a partitioned table, too diff --git a/src/test/regress/sql/generated_virtual.sql b/src/test/regress/sql/generated_virtual.sql index 6fa986515b9e3..e2b31853e0132 100644 --- a/src/test/regress/sql/generated_virtual.sql +++ b/src/test/regress/sql/generated_virtual.sql @@ -646,6 +646,19 @@ ALTER TABLE gtest30_1 ALTER COLUMN b DROP EXPRESSION; -- error CREATE TABLE gtest31_1 (a int, b text GENERATED ALWAYS AS ('hello') VIRTUAL, c text); CREATE TABLE gtest31_2 (x int, y gtest31_1); ALTER TABLE gtest31_1 ALTER COLUMN b TYPE varchar; -- fails + +-- bug #18970 +ALTER TABLE gtest31_2 ADD CONSTRAINT cc CHECK ((y).b IS NOT NULL); +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello1'); +ALTER TABLE gtest31_2 DROP CONSTRAINT cc; + +CREATE STATISTICS gtest31_2_stat ON ((y).b is not null) FROM gtest31_2; +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello2'); +DROP STATISTICS gtest31_2_stat; + +CREATE INDEX gtest31_2_y_idx ON gtest31_2(((y).b)); +ALTER TABLE gtest31_1 ALTER COLUMN b SET EXPRESSION AS ('hello3'); + DROP TABLE gtest31_1, gtest31_2; -- Check it for a 
partitioned table, too From 29a4b63c6bc83a7c21e3ccd0f484eee9a91be7d8 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Fri, 4 Jul 2025 15:10:17 +0900 Subject: [PATCH 029/138] Disable commit timestamps during bootstrap Attempting to use commit timestamps during bootstrapping leads to an assertion failure, that can be reached for example with an initdb -c that enables track_commit_timestamp. It makes little sense to register a commit timestamp for a BootstrapTransactionId, so let's disable the activation of the module in this case. This problem has been independently reported once by each author of this commit. Each author has proposed basically the same patch, relying on IsBootstrapProcessingMode() to skip the use of commit_ts during bootstrap. The test addition is a suggestion by me, and is applied down to v16. Author: Hayato Kuroda Author: Andy Fan Reviewed-by: Bertrand Drouvot Reviewed-by: Fujii Masao Reviewed-by: Michael Paquier Discussion: https://postgr.es/m/OSCPR01MB14966FF9E4C4145F37B937E52F5102@OSCPR01MB14966.jpnprd01.prod.outlook.com Discussion: https://postgr.es/m/87plejmnpy.fsf@163.com Backpatch-through: 13 --- src/backend/access/transam/commit_ts.c | 7 +++++++ src/test/modules/commit_ts/t/001_base.pl | 3 +-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c index 113fae1437ad8..225ff7ca9f2d3 100644 --- a/src/backend/access/transam/commit_ts.c +++ b/src/backend/access/transam/commit_ts.c @@ -707,6 +707,13 @@ ActivateCommitTs(void) TransactionId xid; int64 pageno; + /* + * During bootstrap, we should not register commit timestamps so skip the + * activation in this case. + */ + if (IsBootstrapProcessingMode()) + return; + /* If we've done this already, there's nothing to do */ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); if (commitTsShared->commitTsActive) diff --git a/src/test/modules/commit_ts/t/001_base.pl b/src/test/modules/commit_ts/t/001_base.pl index 1953b18f6b3c3..50e79ce640937 100644 --- a/src/test/modules/commit_ts/t/001_base.pl +++ b/src/test/modules/commit_ts/t/001_base.pl @@ -11,8 +11,7 @@ use PostgreSQL::Test::Cluster; my $node = PostgreSQL::Test::Cluster->new('foxtrot'); -$node->init; -$node->append_conf('postgresql.conf', 'track_commit_timestamp = on'); +$node->init(extra => [ '-c', "track_commit_timestamp=on" ]); $node->start; # Create a table, compare "now()" to the commit TS of its xmin From 07da2985d6bf685c4417b3738babbcac109a4a44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Fri, 4 Jul 2025 18:05:43 +0200 Subject: [PATCH 030/138] pg_upgrade: check for inconsistencies in not-null constraints w/inheritance With tables defined like this, CREATE TABLE ip (id int PRIMARY KEY); CREATE TABLE ic (id int) INHERITS (ip); ALTER TABLE ic ALTER id DROP NOT NULL; pg_upgrade fails during the schema restore phase due to this error: ERROR: column "id" in child table must be marked NOT NULL This can only be fixed by marking the child column as NOT NULL before the upgrade, which could take an arbitrary amount of time (because ic's data must be scanned). Have pg_upgrade's check mode warn if that condition is found, so that users know what to adjust before running the upgrade for real. 
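Following the hint that the new check emits, the remediation for the example above is a sketch like this (it must complete before the upgrade can proceed, and it scans ic's data):

    ALTER TABLE ic ALTER id SET NOT NULL;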
Author: Ali Akbar Reviewed-by: Justin Pryzby Backpatch-through: 13 Discussion: https://postgr.es/m/CACQjQLoMsE+1pyLe98pi0KvPG2jQQ94LWJ+PTiLgVRK4B=i_jg@mail.gmail.com --- src/bin/pg_upgrade/check.c | 96 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c index 81865cd3e4859..ba4b9ff3b148c 100644 --- a/src/bin/pg_upgrade/check.c +++ b/src/bin/pg_upgrade/check.c @@ -23,6 +23,7 @@ static void check_for_isn_and_int8_passing_mismatch(ClusterInfo *cluster); static void check_for_user_defined_postfix_ops(ClusterInfo *cluster); static void check_for_incompatible_polymorphics(ClusterInfo *cluster); static void check_for_tables_with_oids(ClusterInfo *cluster); +static void check_for_not_null_inheritance(ClusterInfo *cluster); static void check_for_pg_role_prefix(ClusterInfo *cluster); static void check_for_new_tablespace_dir(void); static void check_for_user_defined_encoding_conversions(ClusterInfo *cluster); @@ -671,6 +672,14 @@ check_and_dump_old_cluster(void) if (GET_MAJOR_VERSION(old_cluster.major_version) <= 1100) check_for_tables_with_oids(&old_cluster); + /* + * Pre-PG 18 allowed child tables to omit not-null constraints that their + * parents columns have, but schema restore fails for them. Verify there + * are none, iff applicable. + */ + if (GET_MAJOR_VERSION(old_cluster.major_version) <= 1800) + check_for_not_null_inheritance(&old_cluster); + /* * Pre-PG 10 allowed tables with 'unknown' type columns and non WAL logged * hash indexes @@ -1623,6 +1632,93 @@ check_for_tables_with_oids(ClusterInfo *cluster) check_ok(); } +/* + * Callback function for processing results of query for + * check_for_not_null_inheritance. + */ +static void +process_inconsistent_notnull(DbInfo *dbinfo, PGresult *res, void *arg) +{ + UpgradeTaskReport *report = (UpgradeTaskReport *) arg; + int ntups = PQntuples(res); + int i_nspname = PQfnumber(res, "nspname"); + int i_relname = PQfnumber(res, "relname"); + int i_attname = PQfnumber(res, "attname"); + + AssertVariableIsOfType(&process_inconsistent_notnull, + UpgradeTaskProcessCB); + + if (ntups == 0) + return; + + if (report->file == NULL && + (report->file = fopen_priv(report->path, "w")) == NULL) + pg_fatal("could not open file \"%s\": %m", report->path); + + fprintf(report->file, "In database: %s\n", dbinfo->db_name); + + for (int rowno = 0; rowno < ntups; rowno++) + { + fprintf(report->file, " %s.%s.%s\n", + PQgetvalue(res, rowno, i_nspname), + PQgetvalue(res, rowno, i_relname), + PQgetvalue(res, rowno, i_attname)); + } +} + +/* + * check_for_not_null_inheritance() + * + * An attempt to create child tables lacking not-null constraints that are + * present in their parents errors out. This can no longer occur since 18, + * but previously there were various ways for that to happen. Check that + * the cluster to be upgraded doesn't have any of those problems. 
+ */ +static void +check_for_not_null_inheritance(ClusterInfo *cluster) +{ + UpgradeTaskReport report; + UpgradeTask *task; + const char *query; + + prep_status("Checking for not-null constraint inconsistencies"); + + report.file = NULL; + snprintf(report.path, sizeof(report.path), "%s/%s", + log_opts.basedir, + "not_null_inconsistent_columns.txt"); + + query = "SELECT cc.relnamespace::pg_catalog.regnamespace AS nspname, " + " cc.relname, ac.attname " + "FROM pg_catalog.pg_inherits i, pg_catalog.pg_attribute ac, " + " pg_catalog.pg_attribute ap, pg_catalog.pg_class cc " + "WHERE cc.oid = ac.attrelid AND i.inhrelid = ac.attrelid " + " AND i.inhparent = ap.attrelid AND ac.attname = ap.attname " + " AND ap.attnum > 0 and ap.attnotnull AND NOT ac.attnotnull"; + + task = upgrade_task_create(); + upgrade_task_add_step(task, query, + process_inconsistent_notnull, + true, &report); + upgrade_task_run(task, cluster); + upgrade_task_free(task); + + if (report.file) + { + fclose(report.file); + pg_log(PG_REPORT, "fatal"); + pg_fatal("Your installation contains inconsistent NOT NULL constraints.\n" + "If the parent column(s) are NOT NULL, then the child column must\n" + "also be marked NOT NULL, or the upgrade will fail.\n" + "You can fix this by running\n" + " ALTER TABLE tablename ALTER column SET NOT NULL;\n" + "on each column listed in the file:\n" + " %s", report.path); + } + else + check_ok(); +} + /* * check_for_pg_role_prefix() From 5aba3e637d3d874f7bf00318c828249a964c4c3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Fri, 4 Jul 2025 18:31:24 +0200 Subject: [PATCH 031/138] pg_upgrade: Add missing newline in error message Minor oversight in 347758b12063 --- src/bin/pg_upgrade/check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c index ba4b9ff3b148c..5eac4073fc3ff 100644 --- a/src/bin/pg_upgrade/check.c +++ b/src/bin/pg_upgrade/check.c @@ -420,7 +420,7 @@ process_data_type_check(DbInfo *dbinfo, PGresult *res, void *arg) if (!state->result) { pg_log(PG_REPORT, "failed check: %s", _(state->check->status)); - appendPQExpBuffer(*state->report, "\n%s\n%s %s\n", + appendPQExpBuffer(*state->report, "\n%s\n%s\n %s\n", _(state->check->report_text), _("A list of the problem columns is in the file:"), output_path); From 1e007722fa86ff397b0f3d165c89237ab3b05967 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Fri, 4 Jul 2025 21:30:05 +0200 Subject: [PATCH 032/138] Fix new pg_upgrade query not to rely on regnamespace That was invented in 9.5, and pg_upgrade claims to support back to 9.0. But we don't need that with a simple query change, tested by Tom Lane. 
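For context, a minimal sketch of the two equivalent spellings (the cast exists only in 9.5 and later; the explicit join also works on older servers):

    -- requires 9.5+:
    SELECT c.relnamespace::pg_catalog.regnamespace AS nspname, c.relname
      FROM pg_catalog.pg_class c;
    -- portable to pre-9.5 servers:
    SELECT n.nspname, c.relname
      FROM pg_catalog.pg_class c
      JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace;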
Discussion: https://postgr.es/m/202507041645.afjl5rssvrgu@alvherre.pgsql --- src/bin/pg_upgrade/check.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c index 5eac4073fc3ff..bb6d588a2ec6d 100644 --- a/src/bin/pg_upgrade/check.c +++ b/src/bin/pg_upgrade/check.c @@ -1688,12 +1688,13 @@ check_for_not_null_inheritance(ClusterInfo *cluster) log_opts.basedir, "not_null_inconsistent_columns.txt"); - query = "SELECT cc.relnamespace::pg_catalog.regnamespace AS nspname, " - " cc.relname, ac.attname " + query = "SELECT nspname, cc.relname, ac.attname " "FROM pg_catalog.pg_inherits i, pg_catalog.pg_attribute ac, " - " pg_catalog.pg_attribute ap, pg_catalog.pg_class cc " + " pg_catalog.pg_attribute ap, pg_catalog.pg_class cc, " + " pg_catalog.pg_namespace nc " "WHERE cc.oid = ac.attrelid AND i.inhrelid = ac.attrelid " " AND i.inhparent = ap.attrelid AND ac.attname = ap.attname " + " AND cc.relnamespace = nc.oid " " AND ap.attnum > 0 and ap.attnotnull AND NOT ac.attnotnull"; task = upgrade_task_create(); From 8d1071e7da30dc46502c24a18cf61c285f6d6c1b Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Mon, 7 Jul 2025 08:54:30 +0900 Subject: [PATCH 033/138] Fix incompatibility with libxml2 >= 2.14 libxml2 has deprecated the members of xmlBuffer, and it is recommended to access them with dedicated routines. We have only one case in the tree where this shows an impact: xml2/xpath.c where "content" was getting directly accessed. The rest of the code looked fine, checking the PostgreSQL code with libxml2 close to the top of its "2.14" branch. xmlBufferContent() exists since year 2000 based on a check of the upstream libxml2 tree, so let's switch to it. Like 400928b83bd2, backpatch all the way down as this can have an impact on all the branches already released once newer versions of libxml2 get more popular. Reported-by: Walid Ibrahim Reviewed-by: Tom Lane Discussion: https://postgr.es/m/aGdSdcR4QTjEHX6s@paquier.xyz Backpatch-through: 13 --- contrib/xml2/xpath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/xml2/xpath.c b/contrib/xml2/xpath.c index 23d3f332dbaa7..2820874cb5e97 100644 --- a/contrib/xml2/xpath.c +++ b/contrib/xml2/xpath.c @@ -176,7 +176,7 @@ pgxmlNodeSetToText(xmlNodeSetPtr nodeset, xmlBufferWriteCHAR(buf, toptagname); xmlBufferWriteChar(buf, ">"); } - result = xmlStrdup(buf->content); + result = xmlStrdup(xmlBufferContent(buf)); xmlBufferFree(buf); return result; } From 440c5ee202000a30c4e7b27cd952edb2ab16cea8 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 7 Jul 2025 14:33:20 -0400 Subject: [PATCH 034/138] Restore the ability to run pl/pgsql expression queries in parallel. pl/pgsql's notion of an "expression" is very broad, encompassing any SQL SELECT query that returns a single column and no more than one row. So there are cases, for example evaluation of an aggregate function, where the query involves significant work and it'd be useful to run it with parallel workers. This used to be possible, but commits 3eea7a0c9 et al unintentionally disabled it. The simplest fix is to make exec_eval_expr() pass maxtuples = 0 rather than 2 to exec_run_select(). This avoids the new rule that we will never use parallelism when a nonzero "count" limit is passed to ExecutorRun(). (Note that the pre-3eea7a0c9 behavior was indeed unsafe, so reverting that rule is not in the cards.) 
The reason for passing 2 before was that exec_eval_expr() will throw an error if it gets more than one returned row, so we figured that as soon as we have two rows we know that will happen and we might as well stop running the query. That choice was cost-free when it was made; but disabling parallelism is far from cost-free, so now passing 2 amounts to optimizing a failure case at the expense of useful cases.

An expression query that can return more than one row is certainly broken. People might now need to wait a bit longer to discover such breakage; but hopefully few will use enormously expensive cases as their first test of new pl/pgsql logic.

Author: Dipesh Dhameliya
Reviewed-by: Tom Lane
Discussion: https://postgr.es/m/CABgZEgdfbnq9t6xXJnmXbChNTcWFjeM_6nuig41tm327gYi2ig@mail.gmail.com
Backpatch-through: 13
---
 src/pl/plpgsql/src/pl_exec.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/pl/plpgsql/src/pl_exec.c b/src/pl/plpgsql/src/pl_exec.c
index bb99781c56e39..b9acc790dc664 100644
--- a/src/pl/plpgsql/src/pl_exec.c
+++ b/src/pl/plpgsql/src/pl_exec.c
@@ -5703,7 +5703,7 @@ exec_eval_expr(PLpgSQL_execstate *estate,
 	/*
 	 * Else do it the hard way via exec_run_select
 	 */
-	rc = exec_run_select(estate, expr, 2, NULL);
+	rc = exec_run_select(estate, expr, 0, NULL);
 	if (rc != SPI_OK_SELECT)
 		ereport(ERROR,
 				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
@@ -5757,6 +5757,10 @@ exec_eval_expr(PLpgSQL_execstate *estate,
 
 /* ----------
  * exec_run_select			Execute a select query
+ *
+ * Note: passing maxtuples different from 0 ("return all tuples") is
+ * deprecated because it will prevent parallel execution of the query.
+ * However, we retain the parameter in case we need it someday.
  * ----------
  */
 static int

From 3a797c24914af421cf9d4d09bc90024884841dfa Mon Sep 17 00:00:00 2001
From: Jacob Champion
Date: Mon, 7 Jul 2025 11:58:04 -0700
Subject: [PATCH 035/138] oauth: Fix kqueue detection on OpenBSD

In b0635bfda, I added an early header check to the Meson OAuth support, which was intended to duplicate the later checks for HAVE_SYS_[EVENT|EPOLL]_H. However, I implemented the new test via check_header() -- which tries to compile -- rather than has_header(), which just looks for the file's existence. The distinction matters on OpenBSD, where <sys/event.h> can't be compiled without including prerequisite headers, so -Dlibcurl=enabled failed on that platform.

Switch to has_header() to fix this. Note that reviewers expressed concern about the difference between our Autoconf feature tests (which compile headers) and our Meson feature tests (which do not). I'm not opposed to aligning the two, but I want to avoid making bigger changes as part of this fix.

Reviewed-by: Peter Eisentraut
Reviewed-by: Tom Lane
Discussion: https://postgr.es/m/flat/CAOYmi+kdR218ke2zu74oTJvzYJcqV1MN5=mGAPqZQuc79HMSVA@mail.gmail.com
Backpatch-through: 18
---
 meson.build | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/meson.build b/meson.build
index 6ffe7b4727556..692b8b8de0be9 100644
--- a/meson.build
+++ b/meson.build
@@ -943,10 +943,10 @@ if not libcurlopt.disabled()
   # libcurl and one of either epoll or kqueue.
oauth_flow_supported = ( libcurl.found() - and (cc.check_header('sys/event.h', required: false, - args: test_c_args, include_directories: postgres_inc) - or cc.check_header('sys/epoll.h', required: false, - args: test_c_args, include_directories: postgres_inc)) + and (cc.has_header('sys/event.h', + args: test_c_args, include_directories: postgres_inc) + or cc.has_header('sys/epoll.h', + args: test_c_args, include_directories: postgres_inc)) ) if oauth_flow_supported From 9a5334c0b4e9d7269ee9b5e9d08925c8a1e1e01e Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Mon, 7 Jul 2025 21:03:16 -0400 Subject: [PATCH 036/138] aio: Combine io_uring memory mappings, if supported By default io_uring creates a shared memory mapping for each io_uring instance, leading to a large number of memory mappings. Unfortunately a large number of memory mappings slows things down, backend exit is particularly affected. To address that, newer kernels (6.5) support using user-provided memory for the memory. By putting the relevant memory into shared memory we don't need any additional mappings. On a system with a new enough kernel and liburing, there is no discernible overhead when doing a pgbench -S -C anymore. Reported-by: MARK CALLAGHAN Reviewed-by: "Burd, Greg" Reviewed-by: Jim Nasby Discussion: https://postgr.es/m/CAFbpF8OA44_UG+RYJcWH9WjF7E3GA6gka3gvH6nsrSnEe9H0NA@mail.gmail.com Backpatch-through: 18 --- configure | 17 ++ configure.ac | 7 + meson.build | 6 + src/backend/storage/aio/method_io_uring.c | 210 +++++++++++++++++++++- src/include/pg_config.h.in | 3 + src/tools/pgindent/typedefs.list | 1 + 6 files changed, 238 insertions(+), 6 deletions(-) diff --git a/configure b/configure index 3d3d3db97a456..96bba2f3f06b7 100755 --- a/configure +++ b/configure @@ -13309,6 +13309,23 @@ fi fi +if test "$with_liburing" = yes; then + _LIBS="$LIBS" + LIBS="$LIBURING_LIBS $LIBS" + for ac_func in io_uring_queue_init_mem +do : + ac_fn_c_check_func "$LINENO" "io_uring_queue_init_mem" "ac_cv_func_io_uring_queue_init_mem" +if test "x$ac_cv_func_io_uring_queue_init_mem" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_IO_URING_QUEUE_INIT_MEM 1 +_ACEOF + +fi +done + + LIBS="$_LIBS" +fi + if test "$with_lz4" = yes ; then { $as_echo "$as_me:${as_lineno-$LINENO}: checking for LZ4_compress_default in -llz4" >&5 $as_echo_n "checking for LZ4_compress_default in -llz4... 
" >&6; } diff --git a/configure.ac b/configure.ac index 4b8335dc6138e..14f485a453f41 100644 --- a/configure.ac +++ b/configure.ac @@ -1420,6 +1420,13 @@ if test "$with_libxslt" = yes ; then AC_CHECK_LIB(xslt, xsltCleanupGlobals, [], [AC_MSG_ERROR([library 'xslt' is required for XSLT support])]) fi +if test "$with_liburing" = yes; then + _LIBS="$LIBS" + LIBS="$LIBURING_LIBS $LIBS" + AC_CHECK_FUNCS([io_uring_queue_init_mem]) + LIBS="$_LIBS" +fi + if test "$with_lz4" = yes ; then AC_CHECK_LIB(lz4, LZ4_compress_default, [], [AC_MSG_ERROR([library 'lz4' is required for LZ4 support])]) fi diff --git a/meson.build b/meson.build index 692b8b8de0be9..38a9b9bfaca36 100644 --- a/meson.build +++ b/meson.build @@ -990,6 +990,12 @@ liburingopt = get_option('liburing') liburing = dependency('liburing', required: liburingopt) if liburing.found() cdata.set('USE_LIBURING', 1) + + if cc.has_function('io_uring_queue_init_mem', + dependencies: liburing, args: test_c_args) + cdata.set('HAVE_LIBURING_QUEUE_INIT_MEM', 1) + endif + endif diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c index b78048328e113..0a8c054162f06 100644 --- a/src/backend/storage/aio/method_io_uring.c +++ b/src/backend/storage/aio/method_io_uring.c @@ -29,6 +29,9 @@ #ifdef IOMETHOD_IO_URING_ENABLED +#include +#include + #include #include "miscadmin.h" @@ -94,12 +97,32 @@ PgAioUringContext struct io_uring io_uring_ring; } PgAioUringContext; +/* + * Information about the capabilities that io_uring has. + * + * Depending on liburing and kernel version different features are + * supported. At least for the kernel a kernel version check does not suffice + * as various vendors do backport features to older kernels :(. + */ +typedef struct PgAioUringCaps +{ + bool checked; + /* -1 if io_uring_queue_init_mem() is unsupported */ + int mem_init_size; +} PgAioUringCaps; + + /* PgAioUringContexts for all backends */ static PgAioUringContext *pgaio_uring_contexts; /* the current backend's context */ static PgAioUringContext *pgaio_my_uring_context; +static PgAioUringCaps pgaio_uring_caps = +{ + .checked = false, + .mem_init_size = -1, +}; static uint32 pgaio_uring_procs(void) @@ -111,16 +134,145 @@ pgaio_uring_procs(void) return MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS; } -static Size +/* + * Initializes pgaio_uring_caps, unless that's already done. + */ +static void +pgaio_uring_check_capabilities(void) +{ + if (pgaio_uring_caps.checked) + return; + + /* + * By default io_uring creates a shared memory mapping for each io_uring + * instance, leading to a large number of memory mappings. Unfortunately a + * large number of memory mappings slows things down, backend exit is + * particularly affected. To address that, newer kernels (6.5) support + * using user-provided memory for the memory, by putting the relevant + * memory into shared memory we don't need any additional mappings. + * + * To know whether this is supported, we unfortunately need to probe the + * kernel by trying to create a ring with userspace-provided memory. This + * also has a secondary benefit: We can determine precisely how much + * memory we need for each io_uring instance. + */ +#if defined(HAVE_LIBURING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP) + { + struct io_uring test_ring; + size_t ring_size; + void *ring_ptr; + struct io_uring_params p = {0}; + int ret; + + /* + * Liburing does not yet provide an API to query how much memory a + * ring will need. So we over-estimate it here. 
As the memory is freed + * just below that's small temporary waste of memory. + * + * 1MB is more than enough for rings within io_max_concurrency's + * range. + */ + ring_size = 1024 * 1024; + + /* + * Hard to believe a system exists where 1MB would not be a multiple + * of the page size. But it's cheap to ensure... + */ + ring_size -= ring_size % sysconf(_SC_PAGESIZE); + + ring_ptr = mmap(NULL, ring_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (ring_ptr == MAP_FAILED) + elog(ERROR, + "mmap(%zu) to determine io_uring_queue_init_mem() support failed: %m", + ring_size); + + ret = io_uring_queue_init_mem(io_max_concurrency, &test_ring, &p, ring_ptr, ring_size); + if (ret > 0) + { + pgaio_uring_caps.mem_init_size = ret; + + elog(DEBUG1, + "can use combined memory mapping for io_uring, each ring needs %d bytes", + ret); + + /* clean up the created ring, it was just for a test */ + io_uring_queue_exit(&test_ring); + } + else + { + /* + * There are different reasons for ring creation to fail, but it's + * ok to treat that just as io_uring_queue_init_mem() not being + * supported. We'll report a more detailed error in + * pgaio_uring_shmem_init(). + */ + errno = -ret; + elog(DEBUG1, + "cannot use combined memory mapping for io_uring, ring creation failed: %m"); + + } + + if (munmap(ring_ptr, ring_size) != 0) + elog(ERROR, "munmap() failed: %m"); + } +#else + { + elog(DEBUG1, + "can't use combined memory mapping for io_uring, kernel or liburing too old"); + } +#endif + + pgaio_uring_caps.checked = true; +} + +/* + * Memory for all PgAioUringContext instances + */ +static size_t pgaio_uring_context_shmem_size(void) { return mul_size(pgaio_uring_procs(), sizeof(PgAioUringContext)); } +/* + * Memory for the combined memory used by io_uring instances. Returns 0 if + * that is not supported by kernel/liburing. + */ +static size_t +pgaio_uring_ring_shmem_size(void) +{ + size_t sz = 0; + + if (pgaio_uring_caps.mem_init_size > 0) + { + /* + * Memory for rings needs to be allocated to the page boundary, + * reserve space. Luckily it does not need to be aligned to hugepage + * boundaries, even if huge pages are used. + */ + sz = add_size(sz, sysconf(_SC_PAGESIZE)); + sz = add_size(sz, mul_size(pgaio_uring_procs(), + pgaio_uring_caps.mem_init_size)); + } + + return sz; +} + static size_t pgaio_uring_shmem_size(void) { - return pgaio_uring_context_shmem_size(); + size_t sz; + + /* + * Kernel and liburing support for various features influences how much + * shmem we need, perform the necessary checks. + */ + pgaio_uring_check_capabilities(); + + sz = pgaio_uring_context_shmem_size(); + sz = add_size(sz, pgaio_uring_ring_shmem_size()); + + return sz; } static void @@ -128,13 +280,38 @@ pgaio_uring_shmem_init(bool first_time) { int TotalProcs = pgaio_uring_procs(); bool found; + char *shmem; + size_t ring_mem_remain = 0; + char *ring_mem_next = 0; - pgaio_uring_contexts = (PgAioUringContext *) - ShmemInitStruct("AioUring", pgaio_uring_shmem_size(), &found); - + /* + * We allocate memory for all PgAioUringContext instances and, if + * supported, the memory required for each of the io_uring instances, in + * one ShmemInitStruct(). 
+ */ + shmem = ShmemInitStruct("AioUringContext", pgaio_uring_shmem_size(), &found); if (found) return; + pgaio_uring_contexts = (PgAioUringContext *) shmem; + shmem += pgaio_uring_context_shmem_size(); + + /* if supported, handle memory alignment / sizing for io_uring memory */ + if (pgaio_uring_caps.mem_init_size > 0) + { + ring_mem_remain = pgaio_uring_ring_shmem_size(); + ring_mem_next = (char *) shmem; + + /* align to page boundary, see also pgaio_uring_ring_shmem_size() */ + ring_mem_next = (char *) TYPEALIGN(sysconf(_SC_PAGESIZE), ring_mem_next); + + /* account for alignment */ + ring_mem_remain -= ring_mem_next - shmem; + shmem += ring_mem_next - shmem; + + shmem += ring_mem_remain; + } + for (int contextno = 0; contextno < TotalProcs; contextno++) { PgAioUringContext *context = &pgaio_uring_contexts[contextno]; @@ -158,7 +335,28 @@ pgaio_uring_shmem_init(bool first_time) * be worth using that - also need to evaluate if that causes * noticeable additional contention? */ - ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0); + + /* + * If supported (c.f. pgaio_uring_check_capabilities()), create ring + * with its data in shared memory. Otherwise fall back io_uring + * creating a memory mapping for each ring. + */ +#if defined(HAVE_LIBURING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP) + if (pgaio_uring_caps.mem_init_size > 0) + { + struct io_uring_params p = {0}; + + ret = io_uring_queue_init_mem(io_max_concurrency, &context->io_uring_ring, &p, ring_mem_next, ring_mem_remain); + + ring_mem_remain -= ret; + ring_mem_next += ret; + } + else +#endif + { + ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0); + } + if (ret < 0) { char *hint = NULL; diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 726a7c1be1f4d..c4dc5d72bdb78 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -229,6 +229,9 @@ /* Define to 1 if you have the global variable 'int timezone'. */ #undef HAVE_INT_TIMEZONE +/* Define to 1 if you have the `io_uring_queue_init_mem' function. */ +#undef HAVE_IO_URING_QUEUE_INIT_MEM + /* Define to 1 if __builtin_constant_p(x) implies "i"(x) acceptance. */ #undef HAVE_I_CONSTRAINT__BUILTIN_CONSTANT_P diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 32d6e718adca0..474bb639289dc 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2176,6 +2176,7 @@ PgAioReturn PgAioTargetData PgAioTargetID PgAioTargetInfo +PgAioUringCaps PgAioUringContext PgAioWaitRef PgArchData From 330db576f8c37479d472c3e9b0eb6d47ba1d97f4 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Tue, 8 Jul 2025 13:48:52 +0900 Subject: [PATCH 037/138] pg_walsummary: Improve stability of test checking statistics Per buildfarm member culicidae, the query checking for stats reported by the WAL summarizer related to WAL reads is proving to be unstable. Instead of a one-time query, this commit replaces the logic with a polling query checking for the WAL read stats, making the test more reliable on machines that could be slow with the stats reports. This test has been introduced in f4694e0f35b2, so backpatch down to v18. 
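For reference, a sketch of the statistic being polled; a one-shot run of this query can see no reads if the summarizer has not yet flushed its stats, which is why the test now polls:

    SELECT sum(reads) > 0
      FROM pg_stat_io
     WHERE backend_type = 'walsummarizer' AND object = 'wal';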
Reported-by: Alexander Lakhin Reviewed-by: Alexander Lakhin Discussion: https://postgr.es/m/f35ba3db-fca7-4693-bc35-6db64488e4b1@gmail.com Backpatch-through: 18 --- src/bin/pg_walsummary/t/002_blocks.pl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/bin/pg_walsummary/t/002_blocks.pl b/src/bin/pg_walsummary/t/002_blocks.pl index 270332780a453..0f98c7df82e6c 100644 --- a/src/bin/pg_walsummary/t/002_blocks.pl +++ b/src/bin/pg_walsummary/t/002_blocks.pl @@ -47,11 +47,12 @@ ok($result, "WAL summarization caught up after insert"); # The WAL summarizer should have generated some IO statistics. -my $stats_reads = $node1->safe_psql( +$node1->poll_query_until( 'postgres', - qq{SELECT sum(reads) > 0 FROM pg_stat_io - WHERE backend_type = 'walsummarizer' AND object = 'wal'}); -is($stats_reads, 't', "WAL summarizer generates statistics for WAL reads"); + q{SELECT sum(reads) > 0 FROM pg_stat_io + WHERE backend_type = 'walsummarizer' AND object = 'wal'}) + or die + "Timed out while waiting for WAL summarizer to generate statistics for WAL reads"; # Find the highest LSN that is summarized on disk. my $summarized_lsn = $node1->safe_psql('postgres', < Date: Tue, 8 Jul 2025 12:50:19 -0400 Subject: [PATCH 038/138] Fix low-probability memory leak in XMLSERIALIZE(... INDENT). xmltotext_with_options() did not consider the possibility that pg_xml_init() could fail --- most likely due to OOM. If that happened, the already-parsed xmlDoc structure would be leaked. Oversight in commit 483bdb2af. Bug: #18981 Author: Dmitry Kovalenko Reviewed-by: Tom Lane Discussion: https://postgr.es/m/18981-9bc3c80f107ae925@postgresql.org Backpatch-through: 16 --- src/backend/utils/adt/xml.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index a4150bff2eaea..056d811594909 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -663,7 +663,7 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent) volatile xmlBufferPtr buf = NULL; volatile xmlSaveCtxtPtr ctxt = NULL; ErrorSaveContext escontext = {T_ErrorSaveContext}; - PgXmlErrorContext *xmlerrcxt; + PgXmlErrorContext *volatile xmlerrcxt = NULL; #endif if (xmloption_arg != XMLOPTION_DOCUMENT && !indent) @@ -704,13 +704,18 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent) return (text *) data; } - /* Otherwise, we gotta spin up some error handling. */ - xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL); - + /* + * Otherwise, we gotta spin up some error handling. Unlike most other + * routines in this module, we already have a libxml "doc" structure to + * free, so we need to call pg_xml_init() inside the PG_TRY and be + * prepared for it to fail (typically due to palloc OOM). + */ PG_TRY(); { size_t decl_len = 0; + xmlerrcxt = pg_xml_init(PG_XML_STRICTNESS_ALL); + /* The serialized data will go into this buffer. 
*/ buf = xmlBufferCreate(); @@ -838,10 +843,10 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent) xmlSaveClose(ctxt); if (buf) xmlBufferFree(buf); - if (doc) - xmlFreeDoc(doc); + xmlFreeDoc(doc); - pg_xml_done(xmlerrcxt, true); + if (xmlerrcxt) + pg_xml_done(xmlerrcxt, true); PG_RE_THROW(); } From fc3edb52fbb9b773442ce0a89116f893aaa766af Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Wed, 9 Jul 2025 12:46:18 +0900 Subject: [PATCH 039/138] libpq: Remove PQservice() This routine has been introduced as a shortcut to be able to retrieve a service name from an active connection, for psql. Per discussion, and as it is only used by psql, let's remove it to not clutter the libpq API more than necessary. The logic in psql is replaced by lookups of PQconninfoOption for the active connection, instead, updated each time the variables are synced by psql, the prompt shortcut relying on the variable synced. Reported-by: Noah Misch Discussion: https://postgr.es/m/20250706161319.c1.nmisch@google.com Backpatch-through: 18 --- doc/src/sgml/libpq.sgml | 20 ------------------ src/bin/psql/command.c | 7 ++++++- src/bin/psql/common.c | 35 +++++++++++++++++++++++++++++++ src/bin/psql/common.h | 1 + src/bin/psql/prompt.c | 8 +++++-- src/interfaces/libpq/exports.txt | 11 +++++----- src/interfaces/libpq/fe-connect.c | 8 ------- src/interfaces/libpq/libpq-fe.h | 1 - 8 files changed, 53 insertions(+), 38 deletions(-) diff --git a/doc/src/sgml/libpq.sgml b/doc/src/sgml/libpq.sgml index 298c4b38ef90a..b2c2cf9eac831 100644 --- a/doc/src/sgml/libpq.sgml +++ b/doc/src/sgml/libpq.sgml @@ -2740,26 +2740,6 @@ char *PQport(const PGconn *conn); - - PQservicePQservice - - - - Returns the service of the active connection. - - -char *PQservice(const PGconn *conn); - - - - - returns NULL if the - conn argument is NULL. - Otherwise, if there was no service provided, it returns an empty string. - - - - PQttyPQtty diff --git a/src/bin/psql/command.c b/src/bin/psql/command.c index 9fcd2db832656..0a55901b14e1e 100644 --- a/src/bin/psql/command.c +++ b/src/bin/psql/command.c @@ -4480,6 +4480,7 @@ SyncVariables(void) { char vbuf[32]; const char *server_version; + char *service_name; /* get stuff from connection */ pset.encoding = PQclientEncoding(pset.db); @@ -4489,12 +4490,16 @@ SyncVariables(void) setFmtEncoding(pset.encoding); SetVariable(pset.vars, "DBNAME", PQdb(pset.db)); - SetVariable(pset.vars, "SERVICE", PQservice(pset.db)); SetVariable(pset.vars, "USER", PQuser(pset.db)); SetVariable(pset.vars, "HOST", PQhost(pset.db)); SetVariable(pset.vars, "PORT", PQport(pset.db)); SetVariable(pset.vars, "ENCODING", pg_encoding_to_char(pset.encoding)); + service_name = get_conninfo_value("service"); + SetVariable(pset.vars, "SERVICE", service_name); + if (service_name) + pg_free(service_name); + /* this bit should match connection_warnings(): */ /* Try to get full text form of version, might include "devel" etc */ server_version = PQparameterStatus(pset.db, "server_version"); diff --git a/src/bin/psql/common.c b/src/bin/psql/common.c index d2c0a49c46c04..cd329ade12b5d 100644 --- a/src/bin/psql/common.c +++ b/src/bin/psql/common.c @@ -2531,6 +2531,41 @@ session_username(void) return PQuser(pset.db); } +/* + * Return the value of option for keyword in the current connection. + * + * The caller is responsible for freeing the result value allocated. 
+ */ +char * +get_conninfo_value(const char *keyword) +{ + PQconninfoOption *opts; + PQconninfoOption *serviceopt = NULL; + char *res = NULL; + + if (pset.db == NULL) + return NULL; + + opts = PQconninfo(pset.db); + if (opts == NULL) + return NULL; + + for (PQconninfoOption *opt = opts; opt->keyword != NULL; ++opt) + { + if (strcmp(opt->keyword, keyword) == 0) + { + serviceopt = opt; + break; + } + } + + /* Take a copy of the value, as it is freed by PQconninfoFree(). */ + if (serviceopt && serviceopt->val != NULL) + res = pg_strdup(serviceopt->val); + PQconninfoFree(opts); + + return res; +} /* expand_tilde * diff --git a/src/bin/psql/common.h b/src/bin/psql/common.h index 7f1a23de1e82d..64762ab981755 100644 --- a/src/bin/psql/common.h +++ b/src/bin/psql/common.h @@ -39,6 +39,7 @@ extern bool SendQuery(const char *query); extern bool is_superuser(void); extern bool standard_strings(void); extern const char *session_username(void); +extern char *get_conninfo_value(const char *keyword); extern void expand_tilde(char **filename); extern void clean_extended_state(void); diff --git a/src/bin/psql/prompt.c b/src/bin/psql/prompt.c index 3aa7d2d06c80e..b08d7328fbfe7 100644 --- a/src/bin/psql/prompt.c +++ b/src/bin/psql/prompt.c @@ -169,8 +169,12 @@ get_prompt(promptStatus_t status, ConditionalStack cstack) break; /* service name */ case 's': - if (pset.db && PQservice(pset.db)) - strlcpy(buf, PQservice(pset.db), sizeof(buf)); + { + const char *service_name = GetVariable(pset.vars, "SERVICE"); + + if (service_name) + strlcpy(buf, service_name, sizeof(buf)); + } break; /* backend pid */ case 'p': diff --git a/src/interfaces/libpq/exports.txt b/src/interfaces/libpq/exports.txt index 0625cf39e9af3..dbbae642d769a 100644 --- a/src/interfaces/libpq/exports.txt +++ b/src/interfaces/libpq/exports.txt @@ -205,9 +205,8 @@ PQcancelFinish 202 PQsocketPoll 203 PQsetChunkedRowsMode 204 PQgetCurrentTimeUSec 205 -PQservice 206 -PQsetAuthDataHook 207 -PQgetAuthDataHook 208 -PQdefaultAuthDataHook 209 -PQfullProtocolVersion 210 -appendPQExpBufferVA 211 +PQsetAuthDataHook 206 +PQgetAuthDataHook 207 +PQdefaultAuthDataHook 208 +PQfullProtocolVersion 209 +appendPQExpBufferVA 210 diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c index 51a9c41658455..09eb79812ac6d 100644 --- a/src/interfaces/libpq/fe-connect.c +++ b/src/interfaces/libpq/fe-connect.c @@ -7461,14 +7461,6 @@ PQdb(const PGconn *conn) return conn->dbName; } -char * -PQservice(const PGconn *conn) -{ - if (!conn) - return NULL; - return conn->pgservice; -} - char * PQuser(const PGconn *conn) { diff --git a/src/interfaces/libpq/libpq-fe.h b/src/interfaces/libpq/libpq-fe.h index 7d3a9df6fd559..af8004f952a56 100644 --- a/src/interfaces/libpq/libpq-fe.h +++ b/src/interfaces/libpq/libpq-fe.h @@ -400,7 +400,6 @@ extern int PQrequestCancel(PGconn *conn); /* Accessor functions for PGconn objects */ extern char *PQdb(const PGconn *conn); -extern char *PQservice(const PGconn *conn); extern char *PQuser(const PGconn *conn); extern char *PQpass(const PGconn *conn); extern char *PQhost(const PGconn *conn); From 601a3133ae72ab24b27c96aa8053e227daa8fa08 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Wed, 9 Jul 2025 13:23:13 +0900 Subject: [PATCH 040/138] doc PG 18 relnotes: Remove item about PQservice() This libpq API has been removed in fc3edb52fbb9, commit that has forgotten one reference in the release notes. This applies only to v18. 
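With the function removed, the service name remains reachable from psql via the variable kept in sync by SyncVariables(); a session sketch, where "mydb" is a hypothetical service entry:

    $ psql "service=mydb"
    => \echo :SERVICE
    mydb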
--- doc/src/sgml/release-18.sgml | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/doc/src/sgml/release-18.sgml b/doc/src/sgml/release-18.sgml index 66a6817a2be0f..b4bd3559a3946 100644 --- a/doc/src/sgml/release-18.sgml +++ b/doc/src/sgml/release-18.sgml @@ -2626,20 +2626,6 @@ Author: Heikki Linnakangas - - - - - Add libpq function PQservice() - to return the connection service name (Michael Banck) - § - - - Errors detected at semantic analysis or later, such as a misspelled table or column name, do not have this effect. + + + Lastly, note that all the statements within the Query message will + observe the same value of statement_timestamp(), + since that timestamp is updated only upon receipt of the Query + message. This will result in them all observing the same + value of transaction_timestamp() as well, + except in cases where the query string ends a previously-started + transaction and begins a new one. + From 0b6dfce0ce4dec2ddbf63e8d02f932b237a9f8c3 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 15 Jul 2025 18:11:18 -0400 Subject: [PATCH 060/138] Silence uninitialized-value warnings in compareJsonbContainers(). Because not every path through JsonbIteratorNext() sets val->type, some compilers complain that compareJsonbContainers() is comparing possibly-uninitialized values. The paths that don't set it return WJB_DONE, WJB_END_ARRAY, or WJB_END_OBJECT, so it's clear by manual inspection that the "(ra == rb)" code path is safe, and indeed we aren't seeing warnings about that. But the (ra != rb) case is much less obviously safe. In Assert-enabled builds it seems that the asserts rejecting WJB_END_ARRAY and WJB_END_OBJECT persuade gcc 15.x not to warn, which makes little sense because it's impossible to believe that the compiler can prove of its own accord that ra/rb aren't WJB_DONE here. (In fact they never will be, so the code isn't wrong, but why is there no warning?) Without Asserts, the appearance of warnings is quite unsurprising. We discussed fixing this by converting those two Asserts into pg_assume, but that seems not very satisfactory when it's so unclear why the compiler is or isn't warning: the warning could easily reappear with some other compiler version. Let's fix it in a less magical, more future-proof way by changing JsonbIteratorNext() so that it always does set val->type. The cost of that should be pretty negligible, and it makes the function's API spec less squishy. Reported-by: Erik Rijkers Author: Tom Lane Reviewed-by: Andres Freund Discussion: https://postgr.es/m/988bf1bc-3f1f-99f3-bf98-222f1cd9dc5e@xs4all.nl Discussion: https://postgr.es/m/0c623e8a204187b87b4736792398eaf1@postgrespro.ru Backpatch-through: 13 --- src/backend/utils/adt/jsonb_util.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c index c8b6c15e05975..136952861e14f 100644 --- a/src/backend/utils/adt/jsonb_util.c +++ b/src/backend/utils/adt/jsonb_util.c @@ -277,9 +277,6 @@ compareJsonbContainers(JsonbContainer *a, JsonbContainer *b) else { /* - * It's safe to assume that the types differed, and that the va - * and vb values passed were set. - * * If the two values were of the same container type, then there'd * have been a chance to observe the variation in the number of * elements/pairs (when processing WJB_BEGIN_OBJECT, say). 
They're @@ -852,15 +849,20 @@ JsonbIteratorInit(JsonbContainer *container) * It is our job to expand the jbvBinary representation without bothering them * with it. However, clients should not take it upon themselves to touch array * or Object element/pair buffers, since their element/pair pointers are - * garbage. Also, *val will not be set when returning WJB_END_ARRAY or - * WJB_END_OBJECT, on the assumption that it's only useful to access values - * when recursing in. + * garbage. + * + * *val is not meaningful when the result is WJB_DONE, WJB_END_ARRAY or + * WJB_END_OBJECT. However, we set val->type = jbvNull in those cases, + * so that callers may assume that val->type is always well-defined. */ JsonbIteratorToken JsonbIteratorNext(JsonbIterator **it, JsonbValue *val, bool skipNested) { if (*it == NULL) + { + val->type = jbvNull; return WJB_DONE; + } /* * When stepping into a nested container, we jump back here to start @@ -898,6 +900,7 @@ JsonbIteratorNext(JsonbIterator **it, JsonbValue *val, bool skipNested) * nesting). */ *it = freeAndGetParent(*it); + val->type = jbvNull; return WJB_END_ARRAY; } @@ -951,6 +954,7 @@ JsonbIteratorNext(JsonbIterator **it, JsonbValue *val, bool skipNested) * of nesting). */ *it = freeAndGetParent(*it); + val->type = jbvNull; return WJB_END_OBJECT; } else @@ -995,8 +999,10 @@ JsonbIteratorNext(JsonbIterator **it, JsonbValue *val, bool skipNested) return WJB_VALUE; } - elog(ERROR, "invalid iterator state"); - return -1; + elog(ERROR, "invalid jsonb iterator state"); + /* satisfy compilers that don't know that elog(ERROR) doesn't return */ + val->type = jbvNull; + return WJB_DONE; } /* From f8ce5dea433c073369e8c4c4b8375ffd3a761509 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 15 Jul 2025 18:53:00 -0400 Subject: [PATCH 061/138] Doc: clarify description of regexp fields in pg_ident.conf. The grammar was a little shaky and confusing here, so word-smith it a bit. Also, adjust the comments in pg_ident.conf.sample to use the same terminology as the SGML docs, in particular "DATABASE-USERNAME" not "PG-USERNAME". Back-patch appropriate subsets. I did not risk changing pg_ident.conf.sample in released branches, but it still seems OK to change it in v18. Reported-by: Alexey Shishkin Author: Tom Lane Reviewed-by: David G. Johnston Discussion: https://postgr.es/m/175206279327.3157504.12519088928605422253@wrigleys.postgresql.org Backpatch-through: 13 --- doc/src/sgml/client-auth.sgml | 16 ++++++++-------- src/backend/libpq/pg_ident.conf.sample | 26 +++++++++++++------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/doc/src/sgml/client-auth.sgml b/doc/src/sgml/client-auth.sgml index 832b616a7bbff..51b95ed04f399 100644 --- a/doc/src/sgml/client-auth.sgml +++ b/doc/src/sgml/client-auth.sgml @@ -1003,8 +1003,9 @@ local db1,db2,@demodbs all md5 the remainder of the field is treated as a regular expression. (See for details of PostgreSQL's regular expression syntax.) The regular - expression can include a single capture, or parenthesized subexpression, - which can then be referenced in the database-username + expression can include a single capture, or parenthesized subexpression. + The portion of the system user name that matched the capture can then + be referenced in the database-username field as \1 (backslash-one). This allows the mapping of multiple user names in a single line, which is particularly useful for simple syntax substitutions. 
For example, these entries @@ -1022,12 +1023,11 @@ mymap /^(.*)@otherdomain\.com$ guest If the database-username field starts with a slash (/), the remainder of the field is treated - as a regular expression (see - for details of PostgreSQL's regular - expression syntax). It is not possible to use \1 - to use a capture from regular expression on - system-username for a regular expression - on database-username. + as a regular expression. + When the database-username field is a regular + expression, it is not possible to use \1 within it to + refer to a capture from the system-username + field. diff --git a/src/backend/libpq/pg_ident.conf.sample b/src/backend/libpq/pg_ident.conf.sample index f5225f26cdf2c..8ee6c0ba31576 100644 --- a/src/backend/libpq/pg_ident.conf.sample +++ b/src/backend/libpq/pg_ident.conf.sample @@ -13,25 +13,25 @@ # user names to their corresponding PostgreSQL user names. Records # are of the form: # -# MAPNAME SYSTEM-USERNAME PG-USERNAME +# MAPNAME SYSTEM-USERNAME DATABASE-USERNAME # # (The uppercase quantities must be replaced by actual values.) # # MAPNAME is the (otherwise freely chosen) map name that was used in # pg_hba.conf. SYSTEM-USERNAME is the detected user name of the -# client. PG-USERNAME is the requested PostgreSQL user name. The -# existence of a record specifies that SYSTEM-USERNAME may connect as -# PG-USERNAME. +# client. DATABASE-USERNAME is the requested PostgreSQL user name. +# The existence of a record specifies that SYSTEM-USERNAME may connect +# as DATABASE-USERNAME. # -# If SYSTEM-USERNAME starts with a slash (/), it will be treated as a -# regular expression. Optionally this can contain a capture (a -# parenthesized subexpression). The substring matching the capture -# will be substituted for \1 (backslash-one) if present in -# PG-USERNAME. +# If SYSTEM-USERNAME starts with a slash (/), the rest of it will be +# treated as a regular expression. Optionally this can contain a capture +# (a parenthesized subexpression). The substring matching the capture +# will be substituted for \1 (backslash-one) if that appears in +# DATABASE-USERNAME. # -# PG-USERNAME can be "all", a user name, a group name prefixed with "+", or -# a regular expression (if it starts with a slash (/)). If it is a regular -# expression, the substring matching with \1 has no effect. +# DATABASE-USERNAME can be "all", a user name, a group name prefixed with "+", +# or a regular expression (if it starts with a slash (/)). If it is a regular +# expression, no substitution for \1 will occur. # # Multiple maps may be specified in this file and used by pg_hba.conf. # @@ -69,4 +69,4 @@ # Put your actual configuration here # ---------------------------------- -# MAPNAME SYSTEM-USERNAME PG-USERNAME +# MAPNAME SYSTEM-USERNAME DATABASE-USERNAME From da9a888da22c66764a2f97099c8f6c9462dbcbb7 Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Wed, 16 Jul 2025 08:03:36 +0900 Subject: [PATCH 062/138] doc: Clarify that total_vacuum_time excludes VACUUM FULL. The last_vacuum and vacuum_count fields in pg_stat_all_tables already state that they do not include VACUUM FULL. However, total_vacuum_time, which also excludes VACUUM FULL, did not mention this. This could mislead users into thinking VACUUM FULL time is included. To address this, this commit updates the documentation for pg_stat_all_tables to explicitly state that total_vacuum_time does not count VACUUM FULL. Back-patched to v18, where total_vacuum_time was introduced. 
Additionally, this commit clarifies that n_ins_since_vacuum also excludes VACUUM FULL. Although n_ins_since_vacuum was added in v13, we are not back-patching this change to stable branches, as it is a documentation improvement, not a bug fix. Author: Fujii Masao Reviewed-by: Robert Treat Reviewed-by: David G. Johnston Reviewed-by: Laurenz Albe Discussion: https://postgr.es/m/2ac375d1-591b-4f1b-a2af-f24335567866@oss.nttdata.com Backpatch-through: 18 --- doc/src/sgml/monitoring.sgml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 4265a22d4de35..823afe1b30b22 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -3980,6 +3980,7 @@ description | Waiting for a newly initialized WAL file to reach durable storage Estimated number of rows inserted since this table was last vacuumed + (not counting VACUUM FULL) @@ -4066,7 +4067,8 @@ description | Waiting for a newly initialized WAL file to reach durable storage total_vacuum_time double precision - Total time this table has been manually vacuumed, in milliseconds. + Total time this table has been manually vacuumed, in milliseconds + (not counting VACUUM FULL). (This includes the time spent sleeping due to cost-based delays.) From ac7c04483106d018f0ec69bfc06c8cc2ca749f36 Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Wed, 16 Jul 2025 08:32:52 +0900 Subject: [PATCH 063/138] doc: Fix confusing description of streaming option in START_REPLICATION. Previously, the documentation described the streaming option as a boolean, which is outdated since it's no longer a boolean as of protocol version 4. This could confuse users. This commit updates the description to remove the "boolean" reference and clearly list the valid values for the streaming option. Back-patch to v16, where the streaming option changed to a non-boolean. Author: Euler Taveira Reviewed-by: Fujii Masao Discussion: https://postgr.es/m/8d21fb98-5c25-4dee-8387-e5a62b01ea7d@app.fastmail.com Backpatch-through: 16 --- doc/src/sgml/protocol.sgml | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml index 8f31a9f80115a..6ca16523da1b5 100644 --- a/doc/src/sgml/protocol.sgml +++ b/doc/src/sgml/protocol.sgml @@ -3514,11 +3514,13 @@ psql "dbname=postgres replication=database" -c "IDENTIFY_SYSTEM;" - Boolean option to enable streaming of in-progress transactions. - It accepts an additional value "parallel" to enable sending extra - information with some messages to be used for parallelisation. - Minimum protocol version 2 is required to turn it on. Minimum protocol - version 4 is required for the "parallel" option. + Option to enable streaming of in-progress transactions. Valid values are + off (the default), on and + parallel. The setting parallel + enables sending extra information with some messages to be used for + parallelization. Minimum protocol version 2 is required to turn it + on. Minimum protocol version 4 is required for the + parallel value. From 40c66f8585bcb13e60b8c2323f1f96b34285bafb Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Wed, 16 Jul 2025 11:50:34 -0500 Subject: [PATCH 064/138] psql: Fix note on project naming in output of \copyright. This adjusts the wording to match the changes in commits 5987553fde, a233a603ba, and pgweb commit 2d764dbc08. 
Reviewed-by: Tom Lane Reviewed-by: Daniel Gustafsson Discussion: https://postgr.es/m/aHVo791guQR6uqwT%40nathan Backpatch-through: 13 --- src/bin/psql/help.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bin/psql/help.c b/src/bin/psql/help.c index a2e009ab9bea7..8c62729a0d124 100644 --- a/src/bin/psql/help.c +++ b/src/bin/psql/help.c @@ -748,7 +748,7 @@ void print_copyright(void) { puts("PostgreSQL Database Management System\n" - "(formerly known as Postgres, then as Postgres95)\n\n" + "(also known as Postgres, formerly known as Postgres95)\n\n" "Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group\n\n" "Portions Copyright (c) 1994, The Regents of the University of California\n\n" "Permission to use, copy, modify, and distribute this software and its\n" From 973caf7291c119ac1679734ae23721bbbb7df0da Mon Sep 17 00:00:00 2001 From: Jeff Davis Date: Wed, 16 Jul 2025 09:57:07 -0700 Subject: [PATCH 065/138] pg_dumpall: Skip global objects with --statistics-only or --no-schema. Previously, pg_dumpall would still dump global objects such as roles and tablespaces even when --statistics-only or --no-schema was specified. Since these global objects are treated as schema-level data, they should be skipped in these cases. This commit fixes the issue by ensuring that global objects are not dumped when either --statistics-only or --no-schema is used. Author: Fujii Masao Reviewed-by: Corey Huinker Discussion: https://postgr.es/m/08129593-6f3c-4fb9-94b7-5aa2eefb99b0@oss.nttdata.com Backpatch-through: 18 --- src/bin/pg_dump/pg_dumpall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bin/pg_dump/pg_dumpall.c b/src/bin/pg_dump/pg_dumpall.c index 3cbcad65c5fb5..100317b1aa949 100644 --- a/src/bin/pg_dump/pg_dumpall.c +++ b/src/bin/pg_dump/pg_dumpall.c @@ -632,7 +632,7 @@ main(int argc, char *argv[]) fprintf(OPF, "SET escape_string_warning = off;\n"); fprintf(OPF, "\n"); - if (!data_only) + if (!data_only && !statistics_only && !no_schema) { /* * If asked to --clean, do that first. We can avoid detailed From dca0e9693b71a3edd8c7956005af143bb5c59f1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Wed, 16 Jul 2025 19:22:53 +0200 Subject: [PATCH 066/138] Fix dumping of comments on invalid constraints on domains We skip dumping constraints together with domains if they are invalid ('separate') so that they appear after data -- but their comments were dumped together with the domain definition, which in effect leads to the comment being dumped when the constraint does not yet exist. Delay them in the same way. Oversight in 7eca575d1c28; backpatch all the way back. 
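For illustration, the previously-broken case looks like this (a minimal sketch mirroring the regression test added below by this patch):

    CREATE DOMAIN constraint_comments_dom AS int;
    ALTER DOMAIN constraint_comments_dom
        ADD CONSTRAINT inv_ck CHECK (value > 0) NOT VALID;
    COMMENT ON CONSTRAINT inv_ck ON DOMAIN constraint_comments_dom
        IS 'comment on invalid constraint';

Because inv_ck is not valid, pg_dump emits it separately so that it
appears after data; the COMMENT must now be delayed along with it
rather than dumped with the domain definition.
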
Author: jian he Discussion: https://postgr.es/m/CACJufxF_C2pe6J_+nPr6C5jf5rQnbYP8XOKr4HM8yHZtp2aQqQ@mail.gmail.com --- src/bin/pg_dump/pg_dump.c | 23 ++++++++++++++++++++++- src/test/regress/expected/constraints.out | 4 ++++ src/test/regress/sql/constraints.sql | 6 ++++++ 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 1937997ea674d..c6226175528bb 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -12583,8 +12583,13 @@ dumpDomain(Archive *fout, const TypeInfo *tyinfo) for (i = 0; i < tyinfo->nDomChecks; i++) { ConstraintInfo *domcheck = &(tyinfo->domChecks[i]); - PQExpBuffer conprefix = createPQExpBuffer(); + PQExpBuffer conprefix; + /* but only if the constraint itself was dumped here */ + if (domcheck->separate) + continue; + + conprefix = createPQExpBuffer(); appendPQExpBuffer(conprefix, "CONSTRAINT %s ON DOMAIN", fmtId(domcheck->dobj.name)); @@ -18488,6 +18493,22 @@ dumpConstraint(Archive *fout, const ConstraintInfo *coninfo) .section = SECTION_POST_DATA, .createStmt = q->data, .dropStmt = delq->data)); + + if (coninfo->dobj.dump & DUMP_COMPONENT_COMMENT) + { + PQExpBuffer conprefix = createPQExpBuffer(); + char *qtypname = pg_strdup(fmtId(tyinfo->dobj.name)); + + appendPQExpBuffer(conprefix, "CONSTRAINT %s ON DOMAIN", + fmtId(coninfo->dobj.name)); + + dumpComment(fout, conprefix->data, qtypname, + tyinfo->dobj.namespace->dobj.name, + tyinfo->rolname, + coninfo->dobj.catId, 0, tyinfo->dobj.dumpId); + destroyPQExpBuffer(conprefix); + free(qtypname); + } } } else diff --git a/src/test/regress/expected/constraints.out b/src/test/regress/expected/constraints.out index ccea883cffd65..3590d3274f05a 100644 --- a/src/test/regress/expected/constraints.out +++ b/src/test/regress/expected/constraints.out @@ -1701,3 +1701,7 @@ DROP TABLE constraint_comments_tbl; DROP DOMAIN constraint_comments_dom; DROP ROLE regress_constraint_comments; DROP ROLE regress_constraint_comments_noaccess; +-- Leave some constraints for the pg_upgrade test to pick up +CREATE DOMAIN constraint_comments_dom AS int; +ALTER DOMAIN constraint_comments_dom ADD CONSTRAINT inv_ck CHECK (value > 0) NOT VALID; +COMMENT ON CONSTRAINT inv_ck ON DOMAIN constraint_comments_dom IS 'comment on invalid constraint'; diff --git a/src/test/regress/sql/constraints.sql b/src/test/regress/sql/constraints.sql index 7487723ab8437..1f6dc8fd69f6d 100644 --- a/src/test/regress/sql/constraints.sql +++ b/src/test/regress/sql/constraints.sql @@ -1043,3 +1043,9 @@ DROP DOMAIN constraint_comments_dom; DROP ROLE regress_constraint_comments; DROP ROLE regress_constraint_comments_noaccess; + +-- Leave some constraints for the pg_upgrade test to pick up +CREATE DOMAIN constraint_comments_dom AS int; + +ALTER DOMAIN constraint_comments_dom ADD CONSTRAINT inv_ck CHECK (value > 0) NOT VALID; +COMMENT ON CONSTRAINT inv_ck ON DOMAIN constraint_comments_dom IS 'comment on invalid constraint'; From 409c63f9f634c1d9aa5a4afb17f54056b3de7c88 Mon Sep 17 00:00:00 2001 From: Daniel Gustafsson Date: Thu, 17 Jul 2025 00:21:18 +0200 Subject: [PATCH 067/138] doc: Add example file for COPY The paragraph for introducing INSERT and COPY discussed how a file could be used for bulk loading with COPY, without actually showing what the file would look like. This adds a programlisting for the file contents. Backpatch to all supported branches since this example has lacked the file contents since PostgreSQL 7.2. 
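For reference, the usage being documented is (taken from the tutorial
text this patch extends):

    COPY weather FROM '/home/user/weather.txt';

where the file contains one row per line, with columns separated by a
tab character and \N standing for a null value, for example:

    San Francisco	46	50	0.25	1994-11-27
    Hayward	37	54	\N	1994-11-29
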
Author: Daniel Gustafsson Reviewed-by: Fujii Masao Reviewed-by: Tom Lane Discussion: https://postgr.es/m/158017814191.19852.15019251381150731439@wrigleys.postgresql.org Backpatch-through: 13 --- doc/src/sgml/query.sgml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/doc/src/sgml/query.sgml b/doc/src/sgml/query.sgml index 727a0cb185fb2..b190f28d41ea6 100644 --- a/doc/src/sgml/query.sgml +++ b/doc/src/sgml/query.sgml @@ -264,8 +264,18 @@ COPY weather FROM '/home/user/weather.txt'; where the file name for the source file must be available on the machine running the backend process, not the client, since the backend process - reads the file directly. You can read more about the - COPY command in . + reads the file directly. The data inserted above into the weather table + could also be inserted from a file containing (values are separated by a + tab character): + + +San Francisco 46 50 0.25 1994-11-27 +San Francisco 43 57 0.0 1994-11-29 +Hayward 37 54 \N 1994-11-29 + + + You can read more about the COPY command in + . From 4fcbe06aa8f825c118e7076ebde1ccf57c022570 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Thu, 17 Jul 2025 09:32:49 +0900 Subject: [PATCH 068/138] Fix inconsistent LWLock tranche names for MultiXact* MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The terms used in wait_event_names.txt and lwlock.c were inconsistent for MultiXactOffsetSLRU and MultiXactMemberSLRU, which could cause joins between pg_wait_events and pg_stat_activity to fail. lwlock.c is adjusted in this commit to what the historical name of the event has always been, and what is documented. Oversight in 53c2a97a9266. 08b9b9e043bb has fixed a similar inconsistency some time ago. Author: Bertrand Drouvot Reviewed-by: Álvaro Herrera Discussion: https://postgr.es/m/aHdxN0D0hKXzHFQG@ip-10-97-1-34.eu-west-3.compute.internal Backpatch-through: 17 --- src/backend/storage/lmgr/lwlock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 46f44bc45113f..2d43bf2cc1323 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -170,8 +170,8 @@ static const char *const BuiltinTrancheNames[] = { [LWTRANCHE_DSM_REGISTRY_DSA] = "DSMRegistryDSA", [LWTRANCHE_DSM_REGISTRY_HASH] = "DSMRegistryHash", [LWTRANCHE_COMMITTS_SLRU] = "CommitTsSLRU", - [LWTRANCHE_MULTIXACTOFFSET_SLRU] = "MultixactOffsetSLRU", - [LWTRANCHE_MULTIXACTMEMBER_SLRU] = "MultixactMemberSLRU", + [LWTRANCHE_MULTIXACTOFFSET_SLRU] = "MultiXactOffsetSLRU", + [LWTRANCHE_MULTIXACTMEMBER_SLRU] = "MultiXactMemberSLRU", [LWTRANCHE_NOTIFY_SLRU] = "NotifySLRU", [LWTRANCHE_SERIAL_SLRU] = "SerialSLRU", [LWTRANCHE_SUBTRANS_SLRU] = "SubtransSLRU", From 02d21cfd4b889fc476014e872c39632b019a038e Mon Sep 17 00:00:00 2001 From: Amit Langote Date: Thu, 17 Jul 2025 14:29:53 +0900 Subject: [PATCH 069/138] Remove duplicate line In 231b7d670b21, while copy-pasting some code into ExecEvalJsonCoercionFinish(), I (amitlan) accidentally introduced a duplicate line. Remove it. 
Reported-by: Jian He Discussion: https://postgr.es/m/CACJufxHcf=BpmRAJcjgfjOUfV76MwKnyz1x3ErXsWL26EAFmng@mail.gmail.com --- src/backend/executor/execExprInterp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c index 8a72b5e70a4ec..1a37737d4a235 100644 --- a/src/backend/executor/execExprInterp.c +++ b/src/backend/executor/execExprInterp.c @@ -5228,7 +5228,6 @@ ExecEvalJsonCoercionFinish(ExprState *state, ExprEvalStep *op) * JsonBehavior expression. */ jsestate->escontext.error_occurred = false; - jsestate->escontext.error_occurred = false; jsestate->escontext.details_wanted = true; } } From c4b5cd095675d9a7df121429f8bcb7ddb5d6d996 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Thu, 17 Jul 2025 10:25:59 -0500 Subject: [PATCH 070/138] doc: Add note about how to use pg_overexplain. This commit adds a note to the pg_overexplain page that describes how to use it (LOAD, session_preload_libraries, or shared_preload_libraries). The new text is mostly lifted from the auto_explain page. We should probably consider centralizing this information in the future. While at it, add a missing "module" to the opening sentence. Reviewed-by: "David G. Johnston" Reviewed-by: Robert Treat Reviewed-by: Dean Rasheed Discussion: https://postgr.es/m/aHVWKM8l8kLlZzgv%40nathan Backpatch-through: 18 --- doc/src/sgml/pgoverexplain.sgml | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/doc/src/sgml/pgoverexplain.sgml b/doc/src/sgml/pgoverexplain.sgml index 21930fbd3bd76..377ddc8139ecf 100644 --- a/doc/src/sgml/pgoverexplain.sgml +++ b/doc/src/sgml/pgoverexplain.sgml @@ -8,7 +8,7 @@ - The pg_overexplain extends EXPLAIN + The pg_overexplain module extends EXPLAIN with new options that provide additional output. It is mostly intended to assist with debugging of and development of the planner, rather than for general use. Since this module displays internal details of planner data @@ -17,6 +17,21 @@ often as) those data structures change. + + To use it, simply load it into the server. You can load it into an + individual session: + + +LOAD 'pg_overexplain'; + + + You can also preload it into some or all sessions by including + pg_overexplain in + or + in + postgresql.conf. + + EXPLAIN (DEBUG) From e0d3f3cfb6f23b46bab89b8d9e8d82cbed951f80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Thu, 17 Jul 2025 17:40:22 +0200 Subject: [PATCH 071/138] Remove assertion from PortalRunMulti MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have an assertion to ensure that a command tag has been assigned by the time we're done executing, but if we happen to execute a command with no queries, the assertion would fail. Per discussion, rather than contort things to get a tag assigned, just remove the assertion. Oversight in 2f9661311b83. That commit also retained a comment that explained logic that had been adjacent to it but diffused into various places, leaving none apt to keep part of the comment. Remove that part, and rewrite what remains for extra clarity. 
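The "command with no queries" situation can arise at the SQL level via
a DO INSTEAD NOTHING rule, the case the replaced comment described (the
reported crash itself involved an empty prepared statement at the
protocol level). A sketch, with hypothetical table and rule names:

    CREATE TABLE rule_tgt (a int);
    CREATE RULE r_nothing AS ON INSERT TO rule_tgt DO INSTEAD NOTHING;
    INSERT INTO rule_tgt VALUES (1);  -- rewrites to zero queries;
                                      -- per the old comment, the tag
                                      -- is faked as "INSERT 0 0"
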
Bug: #18984 Backpatch-through: 13 Reported-by: Aleksander Alekseev Reviewed-by: Tom Lane Reviewed-by: Michaël Paquier Discussion: https://postgr.es/m/18984-0f4778a6599ac3ae@postgresql.org --- src/backend/tcop/pquery.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index d1593f38b35fd..08791b8f75ec2 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -1350,24 +1350,15 @@ PortalRunMulti(Portal portal, PopActiveSnapshot(); /* - * If a query completion data was supplied, use it. Otherwise use the - * portal's query completion data. - * - * Exception: Clients expect INSERT/UPDATE/DELETE tags to have counts, so - * fake them with zeros. This can happen with DO INSTEAD rules if there - * is no replacement query of the same type as the original. We print "0 - * 0" here because technically there is no query of the matching tag type, - * and printing a non-zero count for a different query type seems wrong, - * e.g. an INSERT that does an UPDATE instead should not print "0 1" if - * one row was updated. See QueryRewrite(), step 3, for details. + * If a command tag was requested and we did not fill in a run-time- + * determined tag above, copy the parse-time tag from the Portal. (There + * might not be any tag there either, in edge cases such as empty prepared + * statements. That's OK.) */ - if (qc && qc->commandTag == CMDTAG_UNKNOWN) - { - if (portal->qc.commandTag != CMDTAG_UNKNOWN) - CopyQueryCompletion(qc, &portal->qc); - /* If the caller supplied a qc, we should have set it by now. */ - Assert(qc->commandTag != CMDTAG_UNKNOWN); - } + if (qc && + qc->commandTag == CMDTAG_UNKNOWN && + portal->qc.commandTag != CMDTAG_UNKNOWN) + CopyQueryCompletion(qc, &portal->qc); } /* From bfa9b25c94fe65411e29c2798b6d3da5fa6e9158 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 17 Jul 2025 12:46:38 -0400 Subject: [PATCH 072/138] Fix PQport to never return NULL unless the connection is NULL. This is the documented behavior, and it worked that way before v10. However, addition of the connhost[] array created cases where conn->connhost[conn->whichhost].port is NULL. The rest of libpq is careful to substitute DEF_PGPORT[_STR] for a null or empty port string, but we failed to do so here, leading to possibly returning NULL. As of v18 that causes psql's \conninfo command to segfault. Older psql versions avoid that, but it's pretty likely that other clients have trouble with this, so we'd better back-patch the fix. In stable branches, just revert to our historical behavior of returning an empty string when there was no user-given port specification. However, it seems substantially more useful and indeed more correct to hand back DEF_PGPORT_STR in such cases, so let's make v18 and master do that. 
Author: Daniele Varrazzo Reviewed-by: Laurenz Albe Reviewed-by: Tom Lane Discussion: https://postgr.es/m/CA+mi_8YTS8WPZPO0PAb2aaGLwHuQ0DEQRF0ZMnvWss4y9FwDYQ@mail.gmail.com Backpatch-through: 13 --- src/interfaces/libpq/fe-connect.c | 6 ++++-- src/interfaces/libpq/libpq-int.h | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c index 09eb79812ac6d..d9e3da0495851 100644 --- a/src/interfaces/libpq/fe-connect.c +++ b/src/interfaces/libpq/fe-connect.c @@ -7528,10 +7528,12 @@ PQport(const PGconn *conn) if (!conn) return NULL; - if (conn->connhost != NULL) + if (conn->connhost != NULL && + conn->connhost[conn->whichhost].port != NULL && + conn->connhost[conn->whichhost].port[0] != '\0') return conn->connhost[conn->whichhost].port; - return ""; + return DEF_PGPORT_STR; } /* diff --git a/src/interfaces/libpq/libpq-int.h b/src/interfaces/libpq/libpq-int.h index a6cfd7f5c9d83..f975f03ca1e06 100644 --- a/src/interfaces/libpq/libpq-int.h +++ b/src/interfaces/libpq/libpq-int.h @@ -357,7 +357,8 @@ typedef struct pg_conn_host pg_conn_host_type type; /* type of host address */ char *host; /* host name or socket path */ char *hostaddr; /* host numeric IP address */ - char *port; /* port number (always provided) */ + char *port; /* port number (if NULL or empty, use + * DEF_PGPORT[_STR]) */ char *password; /* password for this host, read from the * password file; NULL if not sought or not * found in password file. */ From 27c7c11366f72b1933e298481954a24c742036de Mon Sep 17 00:00:00 2001 From: Dean Rasheed Date: Fri, 18 Jul 2025 09:59:40 +0100 Subject: [PATCH 073/138] Fix concurrent update trigger issues with MERGE in a CTE. If a MERGE inside a CTE attempts an UPDATE or DELETE on a table with BEFORE ROW triggers, and a concurrent UPDATE or DELETE happens, the merge code would fail (crashing in the case of an UPDATE action, and potentially executing the wrong action for a DELETE action). This is the same issue that 9321c79c86 attempted to fix, except now for a MERGE inside a CTE. As noted in 9321c79c86, what needs to happen is for the trigger code to exit early, returning the TM_Result and TM_FailureData information to the merge code, if a concurrent modification is detected, rather than attempting to do an EPQ recheck. The merge code will then do its own rechecking, and rescan the action list, potentially executing a different action in light of the concurrent update. In particular, the trigger code must never call ExecGetUpdateNewTuple() for MERGE, since that is bound to fail because MERGE has its own per-action projection information. Commit 9321c79c86 did this using estate->es_plannedstmt->commandType in the trigger code to detect that a MERGE was being executed, which is fine for a plain MERGE command, but does not work for a MERGE inside a CTE. Fix by passing that information to the trigger code as an additional parameter passed to ExecBRUpdateTriggers() and ExecBRDeleteTriggers(). Back-patch as far as v17 only, since MERGE cannot appear inside a CTE prior to that. Additionally, take care to preserve the trigger ABI in v17 (though not in v18, which is still in beta). 
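The now-handled statement shape, abridged from the updated isolation
spec, is a MERGE whose UPDATE action runs inside a writable CTE:

    WITH t AS (
        MERGE INTO target_tg t
        USING (SELECT 1 AS key) s ON s.key = t.key
        WHEN MATCHED AND balance < 100 THEN
            UPDATE SET balance = balance * 2, val = t.val || ' when1'
        RETURNING t.*
    )
    SELECT * FROM t;

Before this fix, a concurrent update of the same row, combined with a
BEFORE ROW trigger, could crash the UPDATE action here, because the
trigger code recognized only plain MERGE commands via
estate->es_plannedstmt->commandType.
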
Bug: #18986 Reported-by: Yaroslav Syrytsia Author: Dean Rasheed Reviewed-by: Michael Paquier Discussion: https://postgr.es/m/18986-e7a8aac3d339fa47@postgresql.org Backpatch-through: 17 --- src/backend/commands/trigger.c | 74 ++++++++++++------- src/backend/executor/execReplication.c | 4 +- src/backend/executor/nodeModifyTable.c | 6 +- src/include/commands/trigger.h | 6 +- .../expected/merge-match-recheck.out | 27 ++++--- .../isolation/specs/merge-match-recheck.spec | 22 +++--- 6 files changed, 89 insertions(+), 50 deletions(-) diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 67f8e70f9c166..7dc121f73f17e 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -80,6 +80,7 @@ static bool GetTupleForTrigger(EState *estate, ItemPointer tid, LockTupleMode lockmode, TupleTableSlot *oldslot, + bool do_epq_recheck, TupleTableSlot **epqslot, TM_Result *tmresultp, TM_FailureData *tmfdp); @@ -2693,7 +2694,8 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, HeapTuple fdw_trigtuple, TupleTableSlot **epqslot, TM_Result *tmresult, - TM_FailureData *tmfd) + TM_FailureData *tmfd, + bool is_merge_delete) { TupleTableSlot *slot = ExecGetTriggerOldSlot(estate, relinfo); TriggerDesc *trigdesc = relinfo->ri_TrigDesc; @@ -2708,9 +2710,17 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, { TupleTableSlot *epqslot_candidate = NULL; + /* + * Get a copy of the on-disk tuple we are planning to delete. In + * general, if the tuple has been concurrently updated, we should + * recheck it using EPQ. However, if this is a MERGE DELETE action, + * we skip this EPQ recheck and leave it to the caller (it must do + * additional rechecking, and might end up executing a different + * action entirely). + */ if (!GetTupleForTrigger(estate, epqstate, relinfo, tupleid, - LockTupleExclusive, slot, &epqslot_candidate, - tmresult, tmfd)) + LockTupleExclusive, slot, !is_merge_delete, + &epqslot_candidate, tmresult, tmfd)) return false; /* @@ -2800,6 +2810,7 @@ ExecARDeleteTriggers(EState *estate, tupleid, LockTupleExclusive, slot, + false, NULL, NULL, NULL); @@ -2944,7 +2955,8 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, HeapTuple fdw_trigtuple, TupleTableSlot *newslot, TM_Result *tmresult, - TM_FailureData *tmfd) + TM_FailureData *tmfd, + bool is_merge_update) { TriggerDesc *trigdesc = relinfo->ri_TrigDesc; TupleTableSlot *oldslot = ExecGetTriggerOldSlot(estate, relinfo); @@ -2965,10 +2977,17 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, { TupleTableSlot *epqslot_candidate = NULL; - /* get a copy of the on-disk tuple we are planning to update */ + /* + * Get a copy of the on-disk tuple we are planning to update. In + * general, if the tuple has been concurrently updated, we should + * recheck it using EPQ. However, if this is a MERGE UPDATE action, + * we skip this EPQ recheck and leave it to the caller (it must do + * additional rechecking, and might end up executing a different + * action entirely). 
+ */ if (!GetTupleForTrigger(estate, epqstate, relinfo, tupleid, - lockmode, oldslot, &epqslot_candidate, - tmresult, tmfd)) + lockmode, oldslot, !is_merge_update, + &epqslot_candidate, tmresult, tmfd)) return false; /* cancel the update action */ /* @@ -3142,6 +3161,7 @@ ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, tupleid, LockTupleExclusive, oldslot, + false, NULL, NULL, NULL); @@ -3298,6 +3318,7 @@ GetTupleForTrigger(EState *estate, ItemPointer tid, LockTupleMode lockmode, TupleTableSlot *oldslot, + bool do_epq_recheck, TupleTableSlot **epqslot, TM_Result *tmresultp, TM_FailureData *tmfdp) @@ -3357,29 +3378,30 @@ GetTupleForTrigger(EState *estate, if (tmfd.traversed) { /* - * Recheck the tuple using EPQ. For MERGE, we leave this - * to the caller (it must do additional rechecking, and - * might end up executing a different action entirely). + * Recheck the tuple using EPQ, if requested. Otherwise, + * just return that it was concurrently updated. */ - if (estate->es_plannedstmt->commandType == CMD_MERGE) + if (do_epq_recheck) { - if (tmresultp) - *tmresultp = TM_Updated; - return false; + *epqslot = EvalPlanQual(epqstate, + relation, + relinfo->ri_RangeTableIndex, + oldslot); + + /* + * If PlanQual failed for updated tuple - we must not + * process this tuple! + */ + if (TupIsNull(*epqslot)) + { + *epqslot = NULL; + return false; + } } - - *epqslot = EvalPlanQual(epqstate, - relation, - relinfo->ri_RangeTableIndex, - oldslot); - - /* - * If PlanQual failed for updated tuple - we must not - * process this tuple! - */ - if (TupIsNull(*epqslot)) + else { - *epqslot = NULL; + if (tmresultp) + *tmresultp = TM_Updated; return false; } } diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 53ddd25c42db9..f262e7a66f771 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -670,7 +670,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_update_before_row) { if (!ExecBRUpdateTriggers(estate, epqstate, resultRelInfo, - tid, NULL, slot, NULL, NULL)) + tid, NULL, slot, NULL, NULL, false)) skip_tuple = true; /* "do nothing" */ } @@ -746,7 +746,7 @@ ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_delete_before_row) { skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, - tid, NULL, NULL, NULL, NULL); + tid, NULL, NULL, NULL, NULL, false); } if (!skip_tuple) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 54da8e7995bd3..7c6c2c1f6e42a 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -1474,7 +1474,8 @@ ExecDeletePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, return ExecBRDeleteTriggers(context->estate, context->epqstate, resultRelInfo, tupleid, oldtuple, - epqreturnslot, result, &context->tmfd); + epqreturnslot, result, &context->tmfd, + context->mtstate->operation == CMD_MERGE); } return true; @@ -2117,7 +2118,8 @@ ExecUpdatePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, return ExecBRUpdateTriggers(context->estate, context->epqstate, resultRelInfo, tupleid, oldtuple, slot, - result, &context->tmfd); + result, &context->tmfd, + context->mtstate->operation == CMD_MERGE); } return true; diff --git a/src/include/commands/trigger.h b/src/include/commands/trigger.h index 2ed2c4bb3784b..cfd7daa20edac 100644 --- a/src/include/commands/trigger.h +++ 
b/src/include/commands/trigger.h @@ -213,7 +213,8 @@ extern bool ExecBRDeleteTriggers(EState *estate, HeapTuple fdw_trigtuple, TupleTableSlot **epqslot, TM_Result *tmresult, - TM_FailureData *tmfd); + TM_FailureData *tmfd, + bool is_merge_delete); extern void ExecARDeleteTriggers(EState *estate, ResultRelInfo *relinfo, ItemPointer tupleid, @@ -235,7 +236,8 @@ extern bool ExecBRUpdateTriggers(EState *estate, HeapTuple fdw_trigtuple, TupleTableSlot *newslot, TM_Result *tmresult, - TM_FailureData *tmfd); + TM_FailureData *tmfd, + bool is_merge_update); extern void ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, ResultRelInfo *src_partinfo, diff --git a/src/test/isolation/expected/merge-match-recheck.out b/src/test/isolation/expected/merge-match-recheck.out index 9a44a5959270b..90300f1db5ab3 100644 --- a/src/test/isolation/expected/merge-match-recheck.out +++ b/src/test/isolation/expected/merge-match-recheck.out @@ -241,19 +241,28 @@ starting permutation: update_bal1_tg merge_bal_tg c2 select1_tg c1 s2: NOTICE: Update: (1,160,s1,setup) -> (1,50,s1,"setup updated by update_bal1_tg") step update_bal1_tg: UPDATE target_tg t SET balance = 50, val = t.val || ' updated by update_bal1_tg' WHERE t.key = 1; step merge_bal_tg: - MERGE INTO target_tg t - USING (SELECT 1 as key) s - ON s.key = t.key - WHEN MATCHED AND balance < 100 THEN - UPDATE SET balance = balance * 2, val = t.val || ' when1' - WHEN MATCHED AND balance < 200 THEN - UPDATE SET balance = balance * 4, val = t.val || ' when2' - WHEN MATCHED AND balance < 300 THEN - UPDATE SET balance = balance * 8, val = t.val || ' when3'; + WITH t AS ( + MERGE INTO target_tg t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3' + RETURNING t.* + ) + SELECT * FROM t; step c2: COMMIT; s1: NOTICE: Update: (1,50,s1,"setup updated by update_bal1_tg") -> (1,100,s1,"setup updated by update_bal1_tg when1") step merge_bal_tg: <... 
completed> +key|balance|status|val +---+-------+------+------------------------------------- + 1| 100|s1 |setup updated by update_bal1_tg when1 +(1 row) + step select1_tg: SELECT * FROM target_tg; key|balance|status|val ---+-------+------+------------------------------------- diff --git a/src/test/isolation/specs/merge-match-recheck.spec b/src/test/isolation/specs/merge-match-recheck.spec index 26266b8c2978e..15226e40c9efc 100644 --- a/src/test/isolation/specs/merge-match-recheck.spec +++ b/src/test/isolation/specs/merge-match-recheck.spec @@ -99,15 +99,19 @@ step "merge_bal_pa" } step "merge_bal_tg" { - MERGE INTO target_tg t - USING (SELECT 1 as key) s - ON s.key = t.key - WHEN MATCHED AND balance < 100 THEN - UPDATE SET balance = balance * 2, val = t.val || ' when1' - WHEN MATCHED AND balance < 200 THEN - UPDATE SET balance = balance * 4, val = t.val || ' when2' - WHEN MATCHED AND balance < 300 THEN - UPDATE SET balance = balance * 8, val = t.val || ' when3'; + WITH t AS ( + MERGE INTO target_tg t + USING (SELECT 1 as key) s + ON s.key = t.key + WHEN MATCHED AND balance < 100 THEN + UPDATE SET balance = balance * 2, val = t.val || ' when1' + WHEN MATCHED AND balance < 200 THEN + UPDATE SET balance = balance * 4, val = t.val || ' when2' + WHEN MATCHED AND balance < 300 THEN + UPDATE SET balance = balance * 8, val = t.val || ' when3' + RETURNING t.* + ) + SELECT * FROM t; } step "merge_delete" From 5449d5b7ae9c2355ce200253874b966e90392d81 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Sat, 19 Jul 2025 13:44:01 +0300 Subject: [PATCH 074/138] Fix infinite wait when reading a partially written WAL record If a crash occurs while writing a WAL record that spans multiple pages, the recovery process marks the page with the XLP_FIRST_IS_OVERWRITE_CONTRECORD flag. However, logical decoding currently attempts to read the full WAL record based on its expected size before checking this flag, which can lead to an infinite wait if the remaining data is never written (e.g., no activity after crash). This patch updates the logic first to read the page header and check for the XLP_FIRST_IS_OVERWRITE_CONTRECORD flag before attempting to reconstruct the full WAL record. If the flag is set, decoding correctly identifies the record as incomplete and avoids waiting for WAL data that will never arrive. Discussion: https://postgr.es/m/CAAKRu_ZCOzQpEumLFgG_%2Biw3FTa%2BhJ4SRpxzaQBYxxM_ZAzWcA%40mail.gmail.com Discussion: https://postgr.es/m/CALDaNm34m36PDHzsU_GdcNXU0gLTfFY5rzh9GSQv%3Dw6B%2BQVNRQ%40mail.gmail.com Author: Vignesh C Reviewed-by: Hayato Kuroda Reviewed-by: Dilip Kumar Reviewed-by: Michael Paquier Reviewed-by: Alexander Korotkov Backpatch-through: 13 --- src/backend/access/transam/xlogreader.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 2790ade1f91e8..5c26d33a60380 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -723,11 +723,12 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) /* Calculate pointer to beginning of next page */ targetPagePtr += XLOG_BLCKSZ; - /* Wait for the next page to become available */ - readOff = ReadPageInternal(state, targetPagePtr, - Min(total_len - gotlen + SizeOfXLogShortPHD, - XLOG_BLCKSZ)); - + /* + * Read the page header before processing the record data, so we + * can handle the case where the previous record ended as being a + * partial one. 
+ */
+readOff = ReadPageInternal(state, targetPagePtr, SizeOfXLogShortPHD);
 if (readOff == XLREAD_WOULDBLOCK)
 return XLREAD_WOULDBLOCK;
 else if (readOff < 0)
@@ -776,6 +777,15 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking)
 goto err;
 }

+/* Wait for the next page to become available */
+readOff = ReadPageInternal(state, targetPagePtr,
+ Min(total_len - gotlen + SizeOfXLogShortPHD,
+ XLOG_BLCKSZ));
+if (readOff == XLREAD_WOULDBLOCK)
+ return XLREAD_WOULDBLOCK;
+else if (readOff < 0)
+ goto err;
+
 /* Append the continuation from this page to the buffer */
 pageHeaderSize = XLogPageHeaderSize(pageHeader);

From c71c702f067b9191332c07acadd2297a42915e5f Mon Sep 17 00:00:00 2001
From: Alexander Korotkov
Date: Sat, 19 Jul 2025 13:51:07 +0300
Subject: [PATCH 075/138] Improve the stability of the recovery test 047_checkpoint_physical_slot

Currently, the comments in 047_checkpoint_physical_slot show an
incomplete intention to wait for checkpoint completion before performing
an immediate database stop. However, an immediate node stop can occur
either before or after checkpoint completion. Both cases should work
correctly, but we would like the test to be more stable and
deterministic. This is why this commit makes this test explicitly wait
for the checkpoint completion log message.

Discussion: https://postgr.es/m/CAPpHfdurV-j_e0pb%3DUFENAy3tyzxfF%2ByHveNDNQk2gM82WBU5A%40mail.gmail.com
Discussion: https://postgr.es/m/aHXLep3OaX_vRTNQ%40paquier.xyz
Author: Alexander Korotkov
Reviewed-by: Michael Paquier
Backpatch-through: 17
---
 src/test/recovery/t/047_checkpoint_physical_slot.pl | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/test/recovery/t/047_checkpoint_physical_slot.pl b/src/test/recovery/t/047_checkpoint_physical_slot.pl
index a1332b5d44cbe..9e98383e30ea9 100644
--- a/src/test/recovery/t/047_checkpoint_physical_slot.pl
+++ b/src/test/recovery/t/047_checkpoint_physical_slot.pl
@@ -94,9 +94,11 @@
 q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())}
 );

-# Continue the checkpoint.
+# Continue the checkpoint and wait for its completion.
+my $log_offset = -s $node->logfile;
 $node->safe_psql('postgres',
 q{select injection_points_wakeup('checkpoint-before-old-wal-removal')});
+$node->wait_for_log(qr/checkpoint complete/, $log_offset);

 my $restart_lsn_old = $node->safe_psql('postgres',
 q{select restart_lsn from pg_replication_slots where slot_name = 'slot_physical'}
 );
 chomp($restart_lsn_old);
 note("restart lsn before stop: $restart_lsn_old");

-# Abruptly stop the server (1 second should be enough for the checkpoint
-# to finish; it would be better).
+# Abruptly stop the server.
 $node->stop('immediate');

 $node->start;

From 226c5674540f8bc5144e9ba9a83f12977ba4203c Mon Sep 17 00:00:00 2001
From: Alexander Korotkov
Date: Sat, 19 Jul 2025 13:59:17 +0300
Subject: [PATCH 076/138] Reintroduce test 046_checkpoint_logical_slot

This commit is only for HEAD and v18, where the test has been removed.
It also incorporates improvements below to stability and coverage of the
original test, which were already backpatched to v17.

- Add one pg_logical_emit_message() call to force the creation of a
  record that spans two pages.
- Make the logic wait for the checkpoint completion.
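The long-record trick is a rough sketch of WAL-page arithmetic: with
the default 8kB WAL page size (XLOG_BLCKSZ), a logical message carrying
about 9kB of payload cannot fit on a single page, so the record is
guaranteed to continue onto the next one. The call used by the test:

    SELECT pg_logical_emit_message(false, '', repeat('123456789', 1000));
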
Author: Alexander Korotkov
Co-authored-by: Hayato Kuroda
Reviewed-by: Michael Paquier
Backpatch-through: 18
---
 src/test/recovery/meson.build | 1 +
 .../recovery/t/046_checkpoint_logical_slot.pl | 142 ++++++++++++++++++
 2 files changed, 143 insertions(+)
 create mode 100644 src/test/recovery/t/046_checkpoint_logical_slot.pl

diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build
index 6e78ff1a030b3..52993c32dbba4 100644
--- a/src/test/recovery/meson.build
+++ b/src/test/recovery/meson.build
@@ -54,6 +54,7 @@ tests += {
 't/043_no_contrecord_switch.pl',
 't/044_invalidate_inactive_slots.pl',
 't/045_archive_restartpoint.pl',
+ 't/046_checkpoint_logical_slot.pl',
 't/047_checkpoint_physical_slot.pl',
 't/048_vacuum_horizon_floor.pl'
 ],

diff --git a/src/test/recovery/t/046_checkpoint_logical_slot.pl b/src/test/recovery/t/046_checkpoint_logical_slot.pl
new file mode 100644
index 0000000000000..4fd709e3a0312
--- /dev/null
+++ b/src/test/recovery/t/046_checkpoint_logical_slot.pl
@@ -0,0 +1,142 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+#
+# This test verifies the case when the logical slot is advanced during
+# checkpoint. The test checks that the logical slot's restart_lsn still refers
+# to an existing WAL segment after immediate restart.
+#
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+
+use Test::More;
+
+if ($ENV{enable_injection_points} ne 'yes')
+{
+ plan skip_all => 'Injection points not supported by this build';
+}
+
+my ($node, $result);
+
+$node = PostgreSQL::Test::Cluster->new('mike');
+$node->init;
+$node->append_conf('postgresql.conf', "wal_level = 'logical'");
+$node->start;
+
+# Check if the extension injection_points is available, as it may be
+# possible that this script is run with installcheck, where the module
+# would not be installed by default.
+if (!$node->check_extension('injection_points'))
+{
+ plan skip_all => 'Extension injection_points not installed';
+}
+
+$node->safe_psql('postgres', q(CREATE EXTENSION injection_points));
+
+# Create the two slots we'll need.
+$node->safe_psql('postgres',
+ q{select pg_create_logical_replication_slot('slot_logical', 'test_decoding')}
+);
+$node->safe_psql('postgres',
+ q{select pg_create_physical_replication_slot('slot_physical', true)});
+
+# Advance both slots to the current position just to have everything "valid".
+$node->safe_psql('postgres',
+ q{select count(*) from pg_logical_slot_get_changes('slot_logical', null, null)}
+);
+$node->safe_psql('postgres',
+ q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())}
+);
+
+# Run checkpoint to flush current state to disk and set a baseline.
+$node->safe_psql('postgres', q{checkpoint});
+
+# Generate some transactions to get RUNNING_XACTS.
+my $xacts = $node->background_psql('postgres');
+$xacts->query_until(
+ qr/run_xacts/,
+ q(\echo run_xacts
+SELECT 1 \watch 0.1
+\q
+));
+
+$node->advance_wal(20);
+
+# Run another checkpoint to set a new restore LSN.
+$node->safe_psql('postgres', q{checkpoint});
+
+$node->advance_wal(20);
+
+# Run another checkpoint, this time in the background, and make it wait
+# on the injection point, so that the checkpoint stops right before
+# removing old WAL segments.
+note('starting checkpoint');
+
+my $checkpoint = $node->background_psql('postgres');
+$checkpoint->query_safe(
+ q(select injection_points_attach('checkpoint-before-old-wal-removal','wait'))
+);
+$checkpoint->query_until(
+ qr/starting_checkpoint/,
+ q(\echo starting_checkpoint
+checkpoint;
+\q
+));
+
+# Wait until the checkpoint stops right before removing WAL segments.
+note('waiting for injection_point');
+$node->wait_for_event('checkpointer', 'checkpoint-before-old-wal-removal');
+note('injection_point is reached');
+
+# Try to advance the logical slot, but make it stop when it moves to the next
+# WAL segment (this has to happen in the background, too).
+my $logical = $node->background_psql('postgres');
+$logical->query_safe(
+ q{select injection_points_attach('logical-replication-slot-advance-segment','wait');}
+);
+$logical->query_until(
+ qr/get_changes/,
+ q(
+\echo get_changes
+select count(*) from pg_logical_slot_get_changes('slot_logical', null, null) \watch 1
+\q
+));
+
+# Wait until the slot's restart_lsn points to the next WAL segment.
+note('waiting for injection_point');
+$node->wait_for_event('client backend',
+ 'logical-replication-slot-advance-segment');
+note('injection_point is reached');
+
+# OK, we're in the right situation: time to advance the physical slot, which
+# recalculates the required LSN, and then unblock the checkpoint, which
+# removes the WAL still needed by the logical slot.
+$node->safe_psql('postgres',
+ q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())}
+);
+
+# Generate a long WAL record, spanning at least two pages for the follow-up
+# post-recovery check.
+$node->safe_psql('postgres',
+ q{select pg_logical_emit_message(false, '', repeat('123456789', 1000))});
+
+# Continue the checkpoint and wait for its completion.
+my $log_offset = -s $node->logfile;
+$node->safe_psql('postgres',
+ q{select injection_points_wakeup('checkpoint-before-old-wal-removal')});
+$node->wait_for_log(qr/checkpoint complete/, $log_offset);
+
+# Abruptly stop the server.
+$node->stop('immediate');
+
+$node->start;
+
+eval {
+ $node->safe_psql('postgres',
+ q{select count(*) from pg_logical_slot_get_changes('slot_logical', null, null);}
+ );
+};
+is($@, '', "Logical slot still valid");
+
+done_testing();

From 6cf5b10ce96c0aa24ce2129fcc7b53fc836608e5 Mon Sep 17 00:00:00 2001
From: Fujii Masao
Date: Sun, 20 Jul 2025 11:58:31 +0900
Subject: [PATCH 077/138] doc: Document reopen of output file via SIGHUP in pg_recvlogical.

When pg_recvlogical receives a SIGHUP signal, it closes the current
output file and opens a new one. This is useful since it allows us to
rotate the output file by renaming the current file and sending a
SIGHUP. This behavior was previously undocumented. This commit adds
the missing documentation.

Back-patch to all supported versions.

Author: Fujii Masao
Reviewed-by: Shinya Kato
Discussion: https://postgr.es/m/0977fc4f-1523-4ecd-8a0e-391af4976367@oss.nttdata.com
Backpatch-through: 13
---
 doc/src/sgml/ref/pg_recvlogical.sgml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/doc/src/sgml/ref/pg_recvlogical.sgml b/doc/src/sgml/ref/pg_recvlogical.sgml
index f68182266a9fa..263ebdeeab4a8 100644
--- a/doc/src/sgml/ref/pg_recvlogical.sgml
+++ b/doc/src/sgml/ref/pg_recvlogical.sgml
@@ -53,6 +53,16 @@ PostgreSQL documentation
 (Control+C) or SIGTERM signal.
 
+
+
+ When pg_recvlogical receives
+ a SIGHUP signal, it closes the current output file
+ and opens a new one using the filename specified by
+ the option.
This allows us to rotate + the output file by first renaming the current file and then sending + a SIGHUP signal to + pg_recvlogical. + From f9545e95c5e7ead0c19676ef8c966eb21f573954 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Herrera?= Date: Mon, 21 Jul 2025 11:34:10 +0200 Subject: [PATCH 078/138] pg_dump: include comments on not-null constraints on domains, too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit e5da0fe3c22b introduced catalog entries for not-null constraints on domains; but because commit b0e96f311985 (the original work for catalogued not-null constraints on tables) forgot to teach pg_dump to process the comments for them, this one also forgot. Add that now. We also need to teach repairDependencyLoop() about the new type of constraints being possible for domains. Backpatch-through: 17 Co-authored-by: jian he Co-authored-by: Álvaro Herrera Reported-by: jian he Discussion: https://postgr.es/m/CACJufxF-0bqVR=j4jonS6N2Ka6hHUpFyu3_3TWKNhOW_4yFSSg@mail.gmail.com --- src/bin/pg_dump/pg_dump.c | 160 +++++++++++++++++++++++-------- src/bin/pg_dump/pg_dump.h | 4 +- src/bin/pg_dump/pg_dump_sort.c | 15 +-- src/bin/pg_dump/t/002_pg_dump.pl | 30 +++++- 4 files changed, 160 insertions(+), 49 deletions(-) diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index c6226175528bb..d9864294fe2b8 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -47,6 +47,7 @@ #include "catalog/pg_authid_d.h" #include "catalog/pg_cast_d.h" #include "catalog/pg_class_d.h" +#include "catalog/pg_constraint_d.h" #include "catalog/pg_default_acl_d.h" #include "catalog/pg_largeobject_d.h" #include "catalog/pg_proc_d.h" @@ -6122,6 +6123,7 @@ getTypes(Archive *fout) */ tyinfo[i].nDomChecks = 0; tyinfo[i].domChecks = NULL; + tyinfo[i].notnull = NULL; if ((tyinfo[i].dobj.dump & DUMP_COMPONENT_DEFINITION) && tyinfo[i].typtype == TYPTYPE_DOMAIN) getDomainConstraints(fout, &(tyinfo[i])); @@ -8247,27 +8249,33 @@ addConstrChildIdxDeps(DumpableObject *dobj, const IndxInfo *refidx) static void getDomainConstraints(Archive *fout, TypeInfo *tyinfo) { - int i; ConstraintInfo *constrinfo; PQExpBuffer query = createPQExpBuffer(); PGresult *res; int i_tableoid, i_oid, i_conname, - i_consrc; + i_consrc, + i_convalidated, + i_contype; int ntups; if (!fout->is_prepared[PREPQUERY_GETDOMAINCONSTRAINTS]) { - /* Set up query for constraint-specific details */ - appendPQExpBufferStr(query, - "PREPARE getDomainConstraints(pg_catalog.oid) AS\n" - "SELECT tableoid, oid, conname, " - "pg_catalog.pg_get_constraintdef(oid) AS consrc, " - "convalidated " - "FROM pg_catalog.pg_constraint " - "WHERE contypid = $1 AND contype = 'c' " - "ORDER BY conname"); + /* + * Set up query for constraint-specific details. For servers 17 and + * up, domains have constraints of type 'n' as well as 'c', otherwise + * just the latter. + */ + appendPQExpBuffer(query, + "PREPARE getDomainConstraints(pg_catalog.oid) AS\n" + "SELECT tableoid, oid, conname, " + "pg_catalog.pg_get_constraintdef(oid) AS consrc, " + "convalidated, contype " + "FROM pg_catalog.pg_constraint " + "WHERE contypid = $1 AND contype IN (%s) " + "ORDER BY conname", + fout->remoteVersion < 170000 ? 
"'c'" : "'c', 'n'"); ExecuteSqlStatement(fout, query->data); @@ -8286,33 +8294,50 @@ getDomainConstraints(Archive *fout, TypeInfo *tyinfo) i_oid = PQfnumber(res, "oid"); i_conname = PQfnumber(res, "conname"); i_consrc = PQfnumber(res, "consrc"); + i_convalidated = PQfnumber(res, "convalidated"); + i_contype = PQfnumber(res, "contype"); constrinfo = (ConstraintInfo *) pg_malloc(ntups * sizeof(ConstraintInfo)); - - tyinfo->nDomChecks = ntups; tyinfo->domChecks = constrinfo; - for (i = 0; i < ntups; i++) + /* 'i' tracks result rows; 'j' counts CHECK constraints */ + for (int i = 0, j = 0; i < ntups; i++) { - bool validated = PQgetvalue(res, i, 4)[0] == 't'; - - constrinfo[i].dobj.objType = DO_CONSTRAINT; - constrinfo[i].dobj.catId.tableoid = atooid(PQgetvalue(res, i, i_tableoid)); - constrinfo[i].dobj.catId.oid = atooid(PQgetvalue(res, i, i_oid)); - AssignDumpId(&constrinfo[i].dobj); - constrinfo[i].dobj.name = pg_strdup(PQgetvalue(res, i, i_conname)); - constrinfo[i].dobj.namespace = tyinfo->dobj.namespace; - constrinfo[i].contable = NULL; - constrinfo[i].condomain = tyinfo; - constrinfo[i].contype = 'c'; - constrinfo[i].condef = pg_strdup(PQgetvalue(res, i, i_consrc)); - constrinfo[i].confrelid = InvalidOid; - constrinfo[i].conindex = 0; - constrinfo[i].condeferrable = false; - constrinfo[i].condeferred = false; - constrinfo[i].conislocal = true; - - constrinfo[i].separate = !validated; + bool validated = PQgetvalue(res, i, i_convalidated)[0] == 't'; + char contype = (PQgetvalue(res, i, i_contype))[0]; + ConstraintInfo *constraint; + + if (contype == CONSTRAINT_CHECK) + { + constraint = &constrinfo[j++]; + tyinfo->nDomChecks++; + } + else + { + Assert(contype == CONSTRAINT_NOTNULL); + Assert(tyinfo->notnull == NULL); + /* use last item in array for the not-null constraint */ + tyinfo->notnull = &(constrinfo[ntups - 1]); + constraint = tyinfo->notnull; + } + + constraint->dobj.objType = DO_CONSTRAINT; + constraint->dobj.catId.tableoid = atooid(PQgetvalue(res, i, i_tableoid)); + constraint->dobj.catId.oid = atooid(PQgetvalue(res, i, i_oid)); + AssignDumpId(&(constraint->dobj)); + constraint->dobj.name = pg_strdup(PQgetvalue(res, i, i_conname)); + constraint->dobj.namespace = tyinfo->dobj.namespace; + constraint->contable = NULL; + constraint->condomain = tyinfo; + constraint->contype = contype; + constraint->condef = pg_strdup(PQgetvalue(res, i, i_consrc)); + constraint->confrelid = InvalidOid; + constraint->conindex = 0; + constraint->condeferrable = false; + constraint->condeferred = false; + constraint->conislocal = true; + + constraint->separate = !validated; /* * Make the domain depend on the constraint, ensuring it won't be @@ -8321,8 +8346,7 @@ getDomainConstraints(Archive *fout, TypeInfo *tyinfo) * anyway, so this doesn't matter. */ if (validated) - addObjectDependency(&tyinfo->dobj, - constrinfo[i].dobj.dumpId); + addObjectDependency(&tyinfo->dobj, constraint->dobj.dumpId); } PQclear(res); @@ -12517,8 +12541,36 @@ dumpDomain(Archive *fout, const TypeInfo *tyinfo) appendPQExpBuffer(q, " COLLATE %s", fmtQualifiedDumpable(coll)); } + /* + * Print a not-null constraint if there's one. In servers older than 17 + * these don't have names, so just print it unadorned; in newer ones they + * do, but most of the time it's going to be the standard generated one, + * so omit the name in that case also. 
+ */ if (typnotnull[0] == 't') - appendPQExpBufferStr(q, " NOT NULL"); + { + if (fout->remoteVersion < 170000 || tyinfo->notnull == NULL) + appendPQExpBufferStr(q, " NOT NULL"); + else + { + ConstraintInfo *notnull = tyinfo->notnull; + + if (!notnull->separate) + { + char *default_name; + + /* XXX should match ChooseConstraintName better */ + default_name = psprintf("%s_not_null", tyinfo->dobj.name); + + if (strcmp(default_name, notnull->dobj.name) == 0) + appendPQExpBufferStr(q, " NOT NULL"); + else + appendPQExpBuffer(q, " CONSTRAINT %s %s", + fmtId(notnull->dobj.name), notnull->condef); + free(default_name); + } + } + } if (typdefault != NULL) { @@ -12538,7 +12590,7 @@ dumpDomain(Archive *fout, const TypeInfo *tyinfo) { ConstraintInfo *domcheck = &(tyinfo->domChecks[i]); - if (!domcheck->separate) + if (!domcheck->separate && domcheck->contype == 'c') appendPQExpBuffer(q, "\n\tCONSTRAINT %s %s", fmtId(domcheck->dobj.name), domcheck->condef); } @@ -12602,6 +12654,25 @@ dumpDomain(Archive *fout, const TypeInfo *tyinfo) destroyPQExpBuffer(conprefix); } + /* + * And a comment on the not-null constraint, if there's one -- but only if + * the constraint itself was dumped here + */ + if (tyinfo->notnull != NULL && !tyinfo->notnull->separate) + { + PQExpBuffer conprefix = createPQExpBuffer(); + + appendPQExpBuffer(conprefix, "CONSTRAINT %s ON DOMAIN", + fmtId(tyinfo->notnull->dobj.name)); + + if (tyinfo->notnull->dobj.dump & DUMP_COMPONENT_COMMENT) + dumpComment(fout, conprefix->data, qtypname, + tyinfo->dobj.namespace->dobj.name, + tyinfo->rolname, + tyinfo->notnull->dobj.catId, 0, tyinfo->dobj.dumpId); + destroyPQExpBuffer(conprefix); + } + destroyPQExpBuffer(q); destroyPQExpBuffer(delq); destroyPQExpBuffer(query); @@ -18463,14 +18534,23 @@ dumpConstraint(Archive *fout, const ConstraintInfo *coninfo) .dropStmt = delq->data)); } } - else if (coninfo->contype == 'c' && tbinfo == NULL) + else if (tbinfo == NULL) { - /* CHECK constraint on a domain */ + /* CHECK, NOT NULL constraint on a domain */ TypeInfo *tyinfo = coninfo->condomain; + Assert(coninfo->contype == 'c' || coninfo->contype == 'n'); + /* Ignore if not to be dumped separately */ if (coninfo->separate) { + const char *keyword; + + if (coninfo->contype == 'c') + keyword = "CHECK CONSTRAINT"; + else + keyword = "CONSTRAINT"; + appendPQExpBuffer(q, "ALTER DOMAIN %s\n", fmtQualifiedDumpable(tyinfo)); appendPQExpBuffer(q, " ADD CONSTRAINT %s %s;\n", @@ -18489,7 +18569,7 @@ dumpConstraint(Archive *fout, const ConstraintInfo *coninfo) ARCHIVE_OPTS(.tag = tag, .namespace = tyinfo->dobj.namespace->dobj.name, .owner = tyinfo->rolname, - .description = "CHECK CONSTRAINT", + .description = keyword, .section = SECTION_POST_DATA, .createStmt = q->data, .dropStmt = delq->data)); diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h index 39eef1d6617f4..2370c98d192a6 100644 --- a/src/bin/pg_dump/pg_dump.h +++ b/src/bin/pg_dump/pg_dump.h @@ -222,7 +222,9 @@ typedef struct _typeInfo bool isDefined; /* true if typisdefined */ /* If needed, we'll create a "shell type" entry for it; link that here: */ struct _shellTypeInfo *shellType; /* shell-type entry, or NULL */ - /* If it's a domain, we store links to its constraints here: */ + /* If it's a domain, its not-null constraint is here: */ + struct _constraintInfo *notnull; + /* If it's a domain, we store links to its CHECK constraints here: */ int nDomChecks; struct _constraintInfo *domChecks; } TypeInfo; diff --git a/src/bin/pg_dump/pg_dump_sort.c b/src/bin/pg_dump/pg_dump_sort.c index 
538e7dcb49357..f99a0797ea7fb 100644 --- a/src/bin/pg_dump/pg_dump_sort.c +++ b/src/bin/pg_dump/pg_dump_sort.c @@ -907,7 +907,7 @@ repairTableAttrDefMultiLoop(DumpableObject *tableobj, } /* - * CHECK constraints on domains work just like those on tables ... + * CHECK, NOT NULL constraints on domains work just like those on tables ... */ static void repairDomainConstraintLoop(DumpableObject *domainobj, @@ -1173,11 +1173,12 @@ repairDependencyLoop(DumpableObject **loop, } } - /* Domain and CHECK constraint */ + /* Domain and CHECK or NOT NULL constraint */ if (nLoop == 2 && loop[0]->objType == DO_TYPE && loop[1]->objType == DO_CONSTRAINT && - ((ConstraintInfo *) loop[1])->contype == 'c' && + (((ConstraintInfo *) loop[1])->contype == 'c' || + ((ConstraintInfo *) loop[1])->contype == 'n') && ((ConstraintInfo *) loop[1])->condomain == (TypeInfo *) loop[0]) { repairDomainConstraintLoop(loop[0], loop[1]); @@ -1186,14 +1187,15 @@ repairDependencyLoop(DumpableObject **loop, if (nLoop == 2 && loop[1]->objType == DO_TYPE && loop[0]->objType == DO_CONSTRAINT && - ((ConstraintInfo *) loop[0])->contype == 'c' && + (((ConstraintInfo *) loop[0])->contype == 'c' || + ((ConstraintInfo *) loop[0])->contype == 'n') && ((ConstraintInfo *) loop[0])->condomain == (TypeInfo *) loop[1]) { repairDomainConstraintLoop(loop[1], loop[0]); return; } - /* Indirect loop involving domain and CHECK constraint */ + /* Indirect loop involving domain and CHECK or NOT NULL constraint */ if (nLoop > 2) { for (i = 0; i < nLoop; i++) @@ -1203,7 +1205,8 @@ repairDependencyLoop(DumpableObject **loop, for (j = 0; j < nLoop; j++) { if (loop[j]->objType == DO_CONSTRAINT && - ((ConstraintInfo *) loop[j])->contype == 'c' && + (((ConstraintInfo *) loop[j])->contype == 'c' || + ((ConstraintInfo *) loop[j])->contype == 'n') && ((ConstraintInfo *) loop[j])->condomain == (TypeInfo *) loop[i]) { repairDomainConstraintMultiLoop(loop[i], loop[j]); diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl index 2485d8f360e5a..771cdcecb6042 100644 --- a/src/bin/pg_dump/t/002_pg_dump.pl +++ b/src/bin/pg_dump/t/002_pg_dump.pl @@ -2377,17 +2377,19 @@ create_sql => 'CREATE DOMAIN dump_test.us_postal_code AS TEXT COLLATE "C" DEFAULT \'10014\' + CONSTRAINT nn NOT NULL CHECK(VALUE ~ \'^\d{5}$\' OR VALUE ~ \'^\d{5}-\d{4}$\'); + COMMENT ON CONSTRAINT nn + ON DOMAIN dump_test.us_postal_code IS \'not null\'; COMMENT ON CONSTRAINT us_postal_code_check ON DOMAIN dump_test.us_postal_code IS \'check it\';', regexp => qr/^ - \QCREATE DOMAIN dump_test.us_postal_code AS text COLLATE pg_catalog."C" DEFAULT '10014'::text\E\n\s+ + \QCREATE DOMAIN dump_test.us_postal_code AS text COLLATE pg_catalog."C" CONSTRAINT nn NOT NULL DEFAULT '10014'::text\E\n\s+ \QCONSTRAINT us_postal_code_check CHECK \E \Q(((VALUE ~ '^\d{5}\E \$\Q'::text) OR (VALUE ~ '^\d{5}-\d{4}\E\$ \Q'::text)));\E(.|\n)* - \QCOMMENT ON CONSTRAINT us_postal_code_check ON DOMAIN dump_test.us_postal_code IS 'check it';\E /xm, like => { %full_runs, %dump_test_schema_runs, section_pre_data => 1, }, @@ -2397,6 +2399,30 @@ }, }, + 'COMMENT ON CONSTRAINT ON DOMAIN (1)' => { + regexp => qr/^ + \QCOMMENT ON CONSTRAINT nn ON DOMAIN dump_test.us_postal_code IS 'not null';\E + /xm, + like => + { %full_runs, %dump_test_schema_runs, section_pre_data => 1, }, + unlike => { + exclude_dump_test_schema => 1, + only_dump_measurement => 1, + }, + }, + + 'COMMENT ON CONSTRAINT ON DOMAIN (2)' => { + regexp => qr/^ + \QCOMMENT ON CONSTRAINT us_postal_code_check ON DOMAIN dump_test.us_postal_code IS 'check 
it';\E + /xm, + like => + { %full_runs, %dump_test_schema_runs, section_pre_data => 1, }, + unlike => { + exclude_dump_test_schema => 1, + only_dump_measurement => 1, + }, + }, + 'CREATE FUNCTION dump_test.pltestlang_call_handler' => { create_order => 17, create_sql => 'CREATE FUNCTION dump_test.pltestlang_call_handler() From 0ded7615d8cafbb1a5ae3d0ba0a931eb146e63a5 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Tue, 22 Jul 2025 14:00:04 +0900 Subject: [PATCH 079/138] ecpg: Fix NULL pointer dereference during connection lookup ECPGconnect() caches established connections to the server, supporting the case of a NULL connection name when a database name is not specified by its caller. A follow-up call to ECPGget_PGconn() to get an established connection from the cached set with a non-NULL name could cause a NULL pointer dereference if a NULL connection was listed in the cache and checked for a match. At least two connections are necessary to reproduce the issue: one with a NULL name and one with a non-NULL name. Author: Aleksander Alekseev Discussion: https://postgr.es/m/CAJ7c6TNvFTPUTZQuNAoqgzaSGz-iM4XR61D7vEj5PsQXwg2RyA@mail.gmail.com Backpatch-through: 13 --- src/interfaces/ecpg/ecpglib/connect.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/interfaces/ecpg/ecpglib/connect.c b/src/interfaces/ecpg/ecpglib/connect.c index 2bbb70333dcb4..713cbbf6360be 100644 --- a/src/interfaces/ecpg/ecpglib/connect.c +++ b/src/interfaces/ecpg/ecpglib/connect.c @@ -58,7 +58,12 @@ ecpg_get_connection_nr(const char *connection_name) for (con = all_connections; con != NULL; con = con->next) { - if (strcmp(connection_name, con->name) == 0) + /* + * Check for the case of a NULL connection name, stored as such in + * the connection information by ECPGconnect() when the database + * name is not specified by its caller. + */ + if (con->name != NULL && strcmp(connection_name, con->name) == 0) break; } ret = con; From 282b10cb055e79d49c73f4652e841432321afcd9 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Tue, 22 Jul 2025 14:34:19 +0900 Subject: [PATCH 080/138] doc: Inform about aminsertcleanup optional NULLness This index AM callback has been introduced in c1ec02be1d79 and it is optional, currently only being used by BRIN. Optional callbacks are documented with NULL as possible value in amapi.h and indexam.sgml, but this callback has missed this part of the description. 
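As an abridged illustration (a hypothetical "myam" handler; its callbacks
and the many other required IndexAmRoutine fields are assumed, not part of
this patch), an index AM can simply leave the optional callback unset:

    #include "postgres.h"
    #include "fmgr.h"
    #include "access/amapi.h"

    Datum
    myam_handler(PG_FUNCTION_ARGS)
    {
        IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

        /* ... required fields and callbacks omitted for brevity ... */
        amroutine->aminsert = myam_insert;  /* assumed to exist */
        amroutine->aminsertcleanup = NULL;  /* optional, per this doc fix */

        PG_RETURN_POINTER(amroutine);
    }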
Reported-by: Peter Smith Reviewed-by: Japin Li Discussion: https://postgr.es/m/CAHut+PvgYcPmPDi1YdHMJY5upnyGRpc0N8pk1xNB11xDSBwNog@mail.gmail.com Backpatch-through: 17 --- doc/src/sgml/indexam.sgml | 2 +- src/include/access/amapi.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml index 1aa4741a8eaee..63d7e376f195e 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -147,7 +147,7 @@ typedef struct IndexAmRoutine ambuild_function ambuild; ambuildempty_function ambuildempty; aminsert_function aminsert; - aminsertcleanup_function aminsertcleanup; + aminsertcleanup_function aminsertcleanup; /* can be NULL */ ambulkdelete_function ambulkdelete; amvacuumcleanup_function amvacuumcleanup; amcanreturn_function amcanreturn; /* can be NULL */ diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 52916bab7a31f..70949de56ac70 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -293,7 +293,7 @@ typedef struct IndexAmRoutine ambuild_function ambuild; ambuildempty_function ambuildempty; aminsert_function aminsert; - aminsertcleanup_function aminsertcleanup; + aminsertcleanup_function aminsertcleanup; /* can be NULL */ ambulkdelete_function ambulkdelete; amvacuumcleanup_function amvacuumcleanup; amcanreturn_function amcanreturn; /* can be NULL */ From 0e8c6565513d2c8e473de29b317474b277311fc0 Mon Sep 17 00:00:00 2001 From: Amit Kapila Date: Tue, 22 Jul 2025 05:56:22 +0000 Subject: [PATCH 081/138] Doc: Fix logical replication examples. The definition of \dRp+ was modified in commit 7054186c4e. This patch updates the column list and row filter examples to align with the revised definition. Author: Shlok Kyal Reviewed by: Peter Smith Backpatch-through: 18, where it was introduced Discussion: https://postgr.es/m/CANhcyEUvqkSO6b9zi_fs_BBPEge5acj4mf8QKmq2TX-7axa7EQ@mail.gmail.com --- doc/src/sgml/logical-replication.sgml | 42 +++++++++++++-------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/doc/src/sgml/logical-replication.sgml b/doc/src/sgml/logical-replication.sgml index a2e49ef7eab54..c08435c764f4b 100644 --- a/doc/src/sgml/logical-replication.sgml +++ b/doc/src/sgml/logical-replication.sgml @@ -1048,28 +1048,28 @@ HINT: To initiate replication, you must manually create the replication slot, e defined) for each publication. 
5) AND (c = 'NSW'::text)) + "public.t1" WHERE ((a > 5) AND (c = 'NSW'::text)) - Publication p2 - Owner | All tables | Inserts | Updates | Deletes | Truncates | Via root -----------+------------+---------+---------+---------+-----------+---------- - postgres | f | t | t | t | t | f + Publication p2 + Owner | All tables | Inserts | Updates | Deletes | Truncates | Generated columns | Via root +----------+------------+---------+---------+---------+-----------+-------------------+---------- + postgres | f | t | t | t | t | none | f Tables: - "public.t1" - "public.t2" WHERE (e = 99) + "public.t1" + "public.t2" WHERE (e = 99) - Publication p3 - Owner | All tables | Inserts | Updates | Deletes | Truncates | Via root -----------+------------+---------+---------+---------+-----------+---------- - postgres | f | t | t | t | t | f + Publication p3 + Owner | All tables | Inserts | Updates | Deletes | Truncates | Generated columns | Via root +----------+------------+---------+---------+---------+-----------+-------------------+---------- + postgres | f | t | t | t | t | none | f Tables: - "public.t2" WHERE (d = 10) - "public.t3" WHERE (g = 10) + "public.t2" WHERE (d = 10) + "public.t3" WHERE (g = 10) ]]> @@ -1491,10 +1491,10 @@ Publications: for each publication. /* pub # */ \dRp+ - Publication p1 - Owner | All tables | Inserts | Updates | Deletes | Truncates | Via root -----------+------------+---------+---------+---------+-----------+---------- - postgres | f | t | t | t | t | f + Publication p1 + Owner | All tables | Inserts | Updates | Deletes | Truncates | Generated columns | Via root +----------+------------+---------+---------+---------+-----------+-------------------+---------- + postgres | f | t | t | t | t | none | f Tables: "public.t1" (id, a, b, d) From 7b98c5536818287998ca868dc96a8dc20ea78f0b Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Tue, 22 Jul 2025 08:30:52 -0400 Subject: [PATCH 082/138] aio: Fix assertion, clarify README The assertion wouldn't have triggered for a long while yet, but this won't accidentally fail to detect the issue if/when it occurs. Author: Matthias van de Meent Discussion: https://postgr.es/m/CAEze2Wj-43JV4YufW23gm=Uwr7Lkj+p0yKctKHxNm1rwFC+_DQ@mail.gmail.com Backpatch-through: 18 --- src/backend/storage/aio/README.md | 5 +++-- src/include/storage/aio.h | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/backend/storage/aio/README.md b/src/backend/storage/aio/README.md index f10b5c7e31ec7..72ae3b3737d51 100644 --- a/src/backend/storage/aio/README.md +++ b/src/backend/storage/aio/README.md @@ -94,7 +94,7 @@ pgaio_io_register_callbacks(ioh, PGAIO_HCB_SHARED_BUFFER_READV, 0); * * In this example we're reading only a single buffer, hence the 1. */ -pgaio_io_set_handle_data_32(ioh, (uint32 *) buffer, 1); +pgaio_io_set_handle_data_32(ioh, (uint32 *) &buffer, 1); /* * Pass the AIO handle to lower-level function. When operating on the level of @@ -119,8 +119,9 @@ pgaio_io_set_handle_data_32(ioh, (uint32 *) buffer, 1); * e.g. due to reaching a limit on the number of unsubmitted IOs, and even * complete before smgrstartreadv() returns. 
*/ +void *page = BufferGetBlock(buffer); smgrstartreadv(ioh, operation->smgr, forknum, blkno, - BufferGetBlock(buffer), 1); + &page, 1); /* * To benefit from AIO, it is beneficial to perform other work, including diff --git a/src/include/storage/aio.h b/src/include/storage/aio.h index e7a0a234b6cf2..2933eea064910 100644 --- a/src/include/storage/aio.h +++ b/src/include/storage/aio.h @@ -201,7 +201,7 @@ typedef enum PgAioHandleCallbackID } PgAioHandleCallbackID; #define PGAIO_HCB_MAX PGAIO_HCB_LOCAL_BUFFER_READV -StaticAssertDecl(PGAIO_HCB_MAX <= (1 << PGAIO_RESULT_ID_BITS), +StaticAssertDecl(PGAIO_HCB_MAX < (1 << PGAIO_RESULT_ID_BITS), "PGAIO_HCB_MAX is too big for PGAIO_RESULT_ID_BITS"); From 3d039b53a1339791f89b68580ba462f2bb4dfa52 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 23 Jul 2025 15:44:29 -0400 Subject: [PATCH 083/138] Fix build breakage on Solaris-alikes with late-model GCC. Solaris has never bothered to add "const" to the second argument of PAM conversation procs, as all other Unixen did decades ago. This resulted in an "incompatible pointer" compiler warning when building --with-pam, but had no more serious effect than that, so we never did anything about it. However, as of GCC 14 the case is an error not warning by default. To complicate matters, recent OpenIndiana (and maybe illumos in general?) *does* supply the "const" by default, so we can't just assume that platforms using our solaris template need help. What we can do, short of building a configure-time probe, is to make solaris.h #define _PAM_LEGACY_NONCONST, which causes OpenIndiana's pam_appl.h to revert to the traditional definition, and hopefully will have no effect anywhere else. Then we can use that same symbol to control whether we include "const" in the declaration of pam_passwd_conv_proc(). 
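Condensed, the pattern applied below boils down to resolving the prototype
difference once in a macro, so that a single conversation-proc declaration
compiles against either flavor of <security/pam_appl.h> (this sketch just
restates the change below; see solaris.h and auth.c):

    #ifdef _PAM_LEGACY_NONCONST
    #define PG_PAM_CONST            /* legacy Solaris: no const */
    #else
    #define PG_PAM_CONST const      /* everyone else, incl. OpenIndiana */
    #endif

    static int pam_passwd_conv_proc(int num_msg,
                                    PG_PAM_CONST struct pam_message **msg,
                                    struct pam_response **resp,
                                    void *appdata_ptr);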
Bug: #18995 Reported-by: Andrew Watkins Author: Tom Lane Discussion: https://postgr.es/m/18995-82058da9ab4337a7@postgresql.org Backpatch-through: 13 --- src/backend/libpq/auth.c | 12 ++++++++++-- src/include/port/solaris.h | 9 +++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/backend/libpq/auth.c b/src/backend/libpq/auth.c index 9f4d05ffbd453..4da46666439db 100644 --- a/src/backend/libpq/auth.c +++ b/src/backend/libpq/auth.c @@ -94,8 +94,16 @@ static int auth_peer(hbaPort *port); #define PGSQL_PAM_SERVICE "postgresql" /* Service name passed to PAM */ +/* Work around original Solaris' lack of "const" in the conv_proc signature */ +#ifdef _PAM_LEGACY_NONCONST +#define PG_PAM_CONST +#else +#define PG_PAM_CONST const +#endif + static int CheckPAMAuth(Port *port, const char *user, const char *password); -static int pam_passwd_conv_proc(int num_msg, const struct pam_message **msg, +static int pam_passwd_conv_proc(int num_msg, + PG_PAM_CONST struct pam_message **msg, struct pam_response **resp, void *appdata_ptr); static struct pam_conv pam_passw_conv = { @@ -1917,7 +1925,7 @@ auth_peer(hbaPort *port) */ static int -pam_passwd_conv_proc(int num_msg, const struct pam_message **msg, +pam_passwd_conv_proc(int num_msg, PG_PAM_CONST struct pam_message **msg, struct pam_response **resp, void *appdata_ptr) { const char *passwd; diff --git a/src/include/port/solaris.h b/src/include/port/solaris.h index e63a3bd824d6d..8ff40007c7f6a 100644 --- a/src/include/port/solaris.h +++ b/src/include/port/solaris.h @@ -24,3 +24,12 @@ #if defined(__i386__) #include #endif + +/* + * On original Solaris, PAM conversation procs lack a "const" in their + * declaration; but recent OpenIndiana versions put it there by default. + * The least messy way to deal with this is to define _PAM_LEGACY_NONCONST, + * which causes OpenIndiana to declare pam_conv per the Solaris tradition, + * and also use that symbol to control omitting the "const" in our own code. + */ +#define _PAM_LEGACY_NONCONST 1 From a8acfb133cf91e53d28ec7025188bfffd747a11a Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Thu, 24 Jul 2025 11:43:20 +0900 Subject: [PATCH 084/138] doc: Add missing index entries and fix title formatting in pg_buffercache docs. This commit adds missing index entries for the functions pg_buffercache_numa() and pg_buffercache_usage_counts() in the pg_buffercache documentation. It also makes the function titles consistent by adding parentheses after function names where they were previously missing. Author: Fujii Masao Reviewed-by: Michael Paquier Discussion: https://postgr.es/m/7d19af4b-7da3-4862-9f52-ff958960bd8d@oss.nttdata.com Backpatch-through: 18 --- doc/src/sgml/pgbuffercache.sgml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml index 546ace8369e28..eeb85a0e04908 100644 --- a/doc/src/sgml/pgbuffercache.sgml +++ b/doc/src/sgml/pgbuffercache.sgml @@ -19,10 +19,18 @@ pg_buffercache_pages + + pg_buffercache_numa + + pg_buffercache_summary + + pg_buffercache_usage_counts + + pg_buffercache_evict @@ -489,7 +497,7 @@ - The <structname>pg_buffercache_evict_relation</structname> Function + The <structname>pg_buffercache_evict_relation()</structname> Function The pg_buffercache_evict_relation() function is very similar to the pg_buffercache_evict() function. 
The @@ -507,7 +515,7 @@ - The <structname>pg_buffercache_evict_all</structname> Function + The <structname>pg_buffercache_evict_all()</structname> Function The pg_buffercache_evict_all() function is very similar to the pg_buffercache_evict() function. The From 33f74b806ce3facfc0deb1412ed85d19a44c0553 Mon Sep 17 00:00:00 2001 From: Amit Kapila Date: Thu, 24 Jul 2025 08:50:40 +0000 Subject: [PATCH 085/138] Fix duplicate transaction replay during pg_createsubscriber. Previously, the tool could replay the same transaction twice, once during recovery, then again during replication after the subscriber was set up. This occurred because the same recovery_target_lsn was used both to finalize recovery and to start replication. If recovery_target_inclusive = true, the transaction at that LSN would be applied during recovery and then sent again by the publisher leading to duplication. To prevent this, we now set recovery_target_inclusive = false. This ensures the transaction at recovery_target_lsn is not reapplied during recovery, avoiding duplication when replication begins. Bug #18897 Reported-by: Zane Duffield Author: Shlok Kyal Reviewed-by: vignesh C Reviewed-by: Amit Kapila Backpatch-through: 17, where it was introduced Discussion: https://postgr.es/m/18897-d3db67535860dddb@postgresql.org --- src/bin/pg_basebackup/pg_createsubscriber.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/bin/pg_basebackup/pg_createsubscriber.c b/src/bin/pg_basebackup/pg_createsubscriber.c index 11f71c0380181..1d0fe44b6d33a 100644 --- a/src/bin/pg_basebackup/pg_createsubscriber.c +++ b/src/bin/pg_basebackup/pg_createsubscriber.c @@ -1250,8 +1250,17 @@ setup_recovery(const struct LogicalRepInfo *dbinfo, const char *datadir, const c appendPQExpBufferStr(recoveryconfcontents, "recovery_target = ''\n"); appendPQExpBufferStr(recoveryconfcontents, "recovery_target_timeline = 'latest'\n"); + + /* + * Set recovery_target_inclusive = false to avoid reapplying the + * transaction committed at 'lsn' after subscription is enabled. This is + * because the provided 'lsn' is also used as the replication start point + * for the subscription. So, the server can send the transaction committed + * at that 'lsn' after replication is started which can lead to applying + * the same transaction twice if we keep recovery_target_inclusive = true. + */ appendPQExpBufferStr(recoveryconfcontents, - "recovery_target_inclusive = true\n"); + "recovery_target_inclusive = false\n"); appendPQExpBufferStr(recoveryconfcontents, "recovery_target_action = promote\n"); appendPQExpBufferStr(recoveryconfcontents, "recovery_target_name = ''\n"); From 2973b1cd3a8005a35a9303c37602468aeb01dfeb Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Fri, 25 Jul 2025 11:17:51 +0900 Subject: [PATCH 086/138] Lower bounds related to pgstats kinds This commit changes stats kinds to have the following bounds, making their handling in core cheaper by default: - PGSTAT_KIND_CUSTOM_MIN 128 -> 24 - PGSTAT_KIND_MAX 256 -> 32 The original numbers were rather high, and showed an impact on performance in pgstat_report_stat() for the case of simple queries with its early-exit path if there are no pending statistics to flush. This logic will be improved more in a follow-up commit to bring the performance of pgstat_report_stat() on par with v17 and older versions. Lowering the bounds is a change worth doing on its own, independently of the other improvement. 
These new numbers should be enough to leave some room for the following
years for built-in and custom stats kinds, with stable ID numbers. At
least that should be enough for extension developers to start using this
facility. The bounds can always be increased in the tree later, should
requirements grow.

Per discussion with Andres Freund and Bertrand Drouvot.

Discussion: https://postgr.es/m/eb224uegsga2hgq7dfq3ps5cduhpqej7ir2hjxzzozjthrekx5@dysei6buqthe
Backpatch-through: 18
---
 src/include/utils/pgstat_kind.h                            | 6 +++---
 src/test/modules/injection_points/injection_stats.c        | 2 +-
 src/test/modules/injection_points/injection_stats_fixed.c  | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/include/utils/pgstat_kind.h b/src/include/utils/pgstat_kind.h
index f44169fd5a3c7..eb5f0b3ae6db7 100644
--- a/src/include/utils/pgstat_kind.h
+++ b/src/include/utils/pgstat_kind.h
@@ -18,7 +18,7 @@
 /* Range of IDs allowed, for built-in and custom kinds */
 #define PGSTAT_KIND_MIN 1 /* Minimum ID allowed */
-#define PGSTAT_KIND_MAX 256 /* Maximum ID allowed */
+#define PGSTAT_KIND_MAX 32 /* Maximum ID allowed */
 
 /* use 0 for INVALID, to catch zero-initialized data */
 #define PGSTAT_KIND_INVALID 0
@@ -46,7 +46,7 @@
 /* Custom stats kinds */
 
 /* Range of IDs allowed for custom stats kinds */
-#define PGSTAT_KIND_CUSTOM_MIN 128
+#define PGSTAT_KIND_CUSTOM_MIN 24
 #define PGSTAT_KIND_CUSTOM_MAX PGSTAT_KIND_MAX
 #define PGSTAT_KIND_CUSTOM_SIZE (PGSTAT_KIND_CUSTOM_MAX - PGSTAT_KIND_CUSTOM_MIN + 1)
@@ -55,7 +55,7 @@
  * development and have not reserved their own unique kind ID yet. See:
  * https://wiki.postgresql.org/wiki/CustomCumulativeStats
  */
-#define PGSTAT_KIND_EXPERIMENTAL 128
+#define PGSTAT_KIND_EXPERIMENTAL 24
 
 static inline bool
 pgstat_is_kind_builtin(PgStat_Kind kind)
diff --git a/src/test/modules/injection_points/injection_stats.c b/src/test/modules/injection_points/injection_stats.c
index 14903c629e0d1..e3947b23ba573 100644
--- a/src/test/modules/injection_points/injection_stats.c
+++ b/src/test/modules/injection_points/injection_stats.c
@@ -59,7 +59,7 @@ static const PgStat_KindInfo injection_stats = {
 /*
  * Kind ID reserved for statistics of injection points.
  */
-#define PGSTAT_KIND_INJECTION 129
+#define PGSTAT_KIND_INJECTION 25
 
 /* Track if stats are loaded */
 static bool inj_stats_loaded = false;
diff --git a/src/test/modules/injection_points/injection_stats_fixed.c b/src/test/modules/injection_points/injection_stats_fixed.c
index 3d0c01bdd05ab..bc54c79d190b9 100644
--- a/src/test/modules/injection_points/injection_stats_fixed.c
+++ b/src/test/modules/injection_points/injection_stats_fixed.c
@@ -64,7 +64,7 @@ static const PgStat_KindInfo injection_stats_fixed = {
 /*
  * Kind ID reserved for statistics of injection points.
  */
-#define PGSTAT_KIND_INJECTION_FIXED 130
+#define PGSTAT_KIND_INJECTION_FIXED 26
 
 /* Track if fixed-numbered stats are loaded */
 static bool inj_fixed_loaded = false;

From f7dfccf9605dab54956321e236de3415a2ba2fa3 Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Fri, 25 Jul 2025 16:17:31 +0900
Subject: [PATCH 087/138] Fix assertion failure with latch wait in single-user
 mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

LatchWaitSetPostmasterDeathPos, the latch event position for the
postmaster death event, is initialized only under IsUnderPostmaster.
WaitLatch() nonetheless considered it a valid wait target in
single-user mode (!IsUnderPostmaster), which was incorrect.
One code path found to fail with an assertion failure is a database
drop in single-user mode while waiting in WaitForProcSignalBarrier()
after the drop.

Oversight in commit 84e5b2f07a5e.

Author: Patrick Stählin
Co-authored-by: Ronan Dunklau
Discussion: https://postgr.es/m/18996-3a2744c8140488de@postgresql.org
Backpatch-through: 18
---
 src/backend/storage/ipc/latch.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index c6aefd2f688dd..beadeb5e46afa 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -187,9 +187,11 @@ WaitLatch(Latch *latch, int wakeEvents, long timeout,
 	if (!(wakeEvents & WL_LATCH_SET))
 		latch = NULL;
 	ModifyWaitEvent(LatchWaitSet, LatchWaitSetLatchPos, WL_LATCH_SET, latch);
-	ModifyWaitEvent(LatchWaitSet, LatchWaitSetPostmasterDeathPos,
-					(wakeEvents & (WL_EXIT_ON_PM_DEATH | WL_POSTMASTER_DEATH)),
-					NULL);
+
+	if (IsUnderPostmaster)
+		ModifyWaitEvent(LatchWaitSet, LatchWaitSetPostmasterDeathPos,
+						(wakeEvents & (WL_EXIT_ON_PM_DEATH | WL_POSTMASTER_DEATH)),
+						NULL);
 
 	if (WaitEventSetWait(LatchWaitSet,
 						 (wakeEvents & WL_TIMEOUT) ? timeout : -1,

From 75f633f54aaae4d20ea0ade9a953afa73e3a66e8 Mon Sep 17 00:00:00 2001
From: Fujii Masao
Date: Fri, 25 Jul 2025 18:38:36 +0900
Subject: [PATCH 088/138] Fix background worker not restarting after
 crash-and-restart cycle.

Previously, if a background worker crashed (e.g., due to a SIGKILL) and
the server restarted due to restart_after_crash being enabled, the
worker was not restarted as expected. Background workers without the
never-restart flag should automatically restart in this case.

This issue was introduced in commit 28a520c0b77, which failed to reset
the rw_pid field in the RegisteredBgWorker struct for the crashed
worker.

This commit fixes the problem by resetting rw_pid for all eligible
background workers during the crash-and-restart cycle.

Back-patched to v18, where the bug was introduced.

Bug fix patches were proposed by Andrey Rudometov and ChangAo Chen,
but this commit uses a different approach.

Reported-by: Andrey Rudometov
Reported-by: ChangAo Chen
Author: Andrey Rudometov
Author: ChangAo Chen
Co-authored-by: Fujii Masao
Reviewed-by: ChangAo Chen
Reviewed-by: Shveta Malik
Discussion: https://postgr.es/m/CAF6JsWiO=i24qYitWe6ns1sXqcL86rYxdyU+pNYk-WueKPSySg@mail.gmail.com
Discussion: https://postgr.es/m/tencent_E00A056B3953EE6440F0F40F80EC30427D09@qq.com
Backpatch-through: 18
---
 src/backend/postmaster/bgworker.c   | 1 +
 src/backend/postmaster/postmaster.c | 7 +++++++
 2 files changed, 8 insertions(+)

diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c
index 116ddf7b835f1..1ad65c237c34e 100644
--- a/src/backend/postmaster/bgworker.c
+++ b/src/backend/postmaster/bgworker.c
@@ -613,6 +613,7 @@ ResetBackgroundWorkerCrashTimes(void)
 		 * resetting.
 		 */
 		rw->rw_crashed_at = 0;
+		rw->rw_pid = 0;
 
 		/*
 		 * If there was anyone waiting for it, they're history.
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index cca9b946e5384..e01d9f0cfe81e 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -2630,6 +2630,13 @@ CleanupBackend(PMChild *bp,
 	}
 	bp = NULL;
 
+	/*
+	 * In a crash case, exit immediately without resetting background worker
+	 * state. However, if restart_after_crash is enabled, the background
+	 * worker state (e.g., rw_pid) still needs to be reset so the worker can
+	 * restart after crash recovery.
This reset is handled in
+	 * ResetBackgroundWorkerCrashTimes(), not here.
+	 */
 	if (crashed)
 	{
 		HandleChildCrash(bp_pid, exitstatus, procname);

From bae50782170c9de8aa13700423923a5bb9d6b9e9 Mon Sep 17 00:00:00 2001
From: Alexander Korotkov
Date: Sun, 27 Jul 2025 15:10:01 +0300
Subject: [PATCH 089/138] Limit checkpointer requests queue size

If the number of sync requests is big enough, the palloc() call in
AbsorbSyncRequests() will attempt to allocate more than 1 GB of memory,
resulting in failure. This can lead to an infinite loop in the
checkpointer process, as it repeatedly fails to absorb the pending
requests.

This commit limits the checkpointer requests queue size to 10M items.
In addition to preventing the palloc() failure, this change helps to
avoid long queue processing time. Also, this commit is intended for
backpatching only; the master branch receives a more invasive yet
comprehensive fix for this problem.

Discussion: https://postgr.es/m/db4534f83a22a29ab5ee2566ad86ca92%40postgrespro.ru
Backpatch-through: 13
---
 src/backend/postmaster/checkpointer.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index fda91ffd1ce2d..903d83e7dea0f 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -135,6 +135,9 @@ static CheckpointerShmemStruct *CheckpointerShmem;
 /* interval for calling AbsorbSyncRequests in CheckpointWriteDelay */
 #define WRITES_PER_ABSORB 1000
 
+/* Max number of requests the checkpointer request queue can hold */
+#define MAX_CHECKPOINT_REQUESTS 10000000
+
 /*
  * GUC parameters
  */
@@ -970,7 +973,7 @@ CheckpointerShmemInit(void)
 	 */
 	MemSet(CheckpointerShmem, 0, size);
 	SpinLockInit(&CheckpointerShmem->ckpt_lck);
-	CheckpointerShmem->max_requests = NBuffers;
+	CheckpointerShmem->max_requests = Min(NBuffers, MAX_CHECKPOINT_REQUESTS);
 	ConditionVariableInit(&CheckpointerShmem->start_cv);
 	ConditionVariableInit(&CheckpointerShmem->done_cv);
 }

From 13eb6bb76d5de1d5c3ae3a80684e6a0da5314817 Mon Sep 17 00:00:00 2001
From: Michael Paquier
Date: Mon, 28 Jul 2025 08:15:16 +0900
Subject: [PATCH 090/138] Fix performance regression with flush of pending
 fixed-numbered stats

The callback added in fc415edf8ca8, used to check if there is any
pending data to flush for fixed-numbered statistics by looping across
all the builtin and custom stats kinds with a call to
have_fixed_pending_cb, is proving costly enough to show up in workloads
that do not report any stats (read-only, no function calls, no WAL, no
IO, etc). The code used in v17 was cheaper than what HEAD has
introduced, relying on three boolean checks for WAL, SLRU and IO stats.

This commit switches the code to use a more efficient approach than
fc415edf8ca8, with a single boolean flag that can be switched to "true"
by any fixed-numbered stats kinds to force pgstat_report_stat() to go
through one round of reports. The flag is reset by pgstat_report_stat()
once a full round of reports is done. The flag being false means that
fixed-numbered stats kinds saw no activity, and that there is no
pending data to flush.

ac000fca743e took one step in improving the performance by reducing the
number of stats kinds that the backend can hold. This commit takes a
more drastic step by bringing back the code efficiency to what it was
before v18, with a cheap check at the beginning of pgstat_report_stat()
for its fast-exit path.

The callback have_static_pending_cb is removed as an effect of all that.
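In outline, the new scheme reduces to the following standalone sketch
(names simplified; the real flag is pgstat_report_fixed and the real
entry point is pgstat_report_stat()):

    static bool report_fixed = false;   /* any fixed-numbered stats pending? */

    static void
    count_fixed_activity(void)
    {
        /* ... accumulate into backend-local pending counters ... */
        report_fixed = true;            /* producers only ever set the flag */
    }

    static void
    report_stat(void)
    {
        if (!report_fixed)
            return;                     /* cheap fast-exit, as in v17 */

        /* ... invoke flush_static_cb for each stats kind ... */

        report_fixed = false;           /* only the reporter resets it */
    }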
Reported-by: Andres Freund Reviewed-by: Bertrand Drouvot Discussion: https://postgr.es/m/eb224uegsga2hgq7dfq3ps5cduhpqej7ir2hjxzzozjthrekx5@dysei6buqthe Backpatch-through: 18 --- src/backend/access/transam/xlog.c | 10 ++++ src/backend/utils/activity/pgstat.c | 52 ++++++++------------- src/backend/utils/activity/pgstat_backend.c | 14 +----- src/backend/utils/activity/pgstat_io.c | 10 +--- src/backend/utils/activity/pgstat_slru.c | 10 +--- src/backend/utils/activity/pgstat_wal.c | 20 ++++---- src/include/utils/pgstat_internal.h | 34 ++++++++------ 7 files changed, 62 insertions(+), 88 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index c6c380df95684..184de54f3a189 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -96,6 +96,7 @@ #include "utils/guc_hooks.h" #include "utils/guc_tables.h" #include "utils/injection_point.h" +#include "utils/pgstat_internal.h" #include "utils/ps_status.h" #include "utils/relmapper.h" #include "utils/snapmgr.h" @@ -1092,6 +1093,9 @@ XLogInsertRecord(XLogRecData *rdata, pgWalUsage.wal_bytes += rechdr->xl_tot_len; pgWalUsage.wal_records++; pgWalUsage.wal_fpi += num_fpi; + + /* Required for the flush of pending stats WAL data */ + pgstat_report_fixed = true; } return EndPos; @@ -2109,6 +2113,12 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) LWLockRelease(WALWriteLock); pgWalUsage.wal_buffers_full++; TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE(); + + /* + * Required for the flush of pending stats WAL data, per + * update of pgWalUsage. + */ + pgstat_report_fixed = true; } } } diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c index 8b57845e8709f..6bc91ce0dadda 100644 --- a/src/backend/utils/activity/pgstat.c +++ b/src/backend/utils/activity/pgstat.c @@ -212,6 +212,11 @@ int pgstat_fetch_consistency = PGSTAT_FETCH_CONSISTENCY_CACHE; PgStat_LocalState pgStatLocal; +/* + * Track pending reports for fixed-numbered stats, used by + * pgstat_report_stat(). 
+ */ +bool pgstat_report_fixed = false; /* ---------- * Local data @@ -370,7 +375,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .shared_data_off = offsetof(PgStatShared_Backend, stats), .shared_data_len = sizeof(((PgStatShared_Backend *) 0)->stats), - .have_static_pending_cb = pgstat_backend_have_pending_cb, .flush_static_cb = pgstat_backend_flush_cb, .reset_timestamp_cb = pgstat_backend_reset_timestamp_cb, }, @@ -437,7 +441,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .shared_data_len = sizeof(((PgStatShared_IO *) 0)->stats), .flush_static_cb = pgstat_io_flush_cb, - .have_static_pending_cb = pgstat_io_have_pending_cb, .init_shmem_cb = pgstat_io_init_shmem_cb, .reset_all_cb = pgstat_io_reset_all_cb, .snapshot_cb = pgstat_io_snapshot_cb, @@ -455,7 +458,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .shared_data_len = sizeof(((PgStatShared_SLRU *) 0)->stats), .flush_static_cb = pgstat_slru_flush_cb, - .have_static_pending_cb = pgstat_slru_have_pending_cb, .init_shmem_cb = pgstat_slru_init_shmem_cb, .reset_all_cb = pgstat_slru_reset_all_cb, .snapshot_cb = pgstat_slru_snapshot_cb, @@ -474,7 +476,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .init_backend_cb = pgstat_wal_init_backend_cb, .flush_static_cb = pgstat_wal_flush_cb, - .have_static_pending_cb = pgstat_wal_have_pending_cb, .init_shmem_cb = pgstat_wal_init_shmem_cb, .reset_all_cb = pgstat_wal_reset_all_cb, .snapshot_cb = pgstat_wal_snapshot_cb, @@ -708,29 +709,10 @@ pgstat_report_stat(bool force) } /* Don't expend a clock check if nothing to do */ - if (dlist_is_empty(&pgStatPending)) + if (dlist_is_empty(&pgStatPending) && + !pgstat_report_fixed) { - bool do_flush = false; - - /* Check for pending stats */ - for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) - { - const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); - - if (!kind_info) - continue; - if (!kind_info->have_static_pending_cb) - continue; - - if (kind_info->have_static_pending_cb()) - { - do_flush = true; - break; - } - } - - if (!do_flush) - return 0; + return 0; } /* @@ -784,16 +766,19 @@ pgstat_report_stat(bool force) partial_flush |= pgstat_flush_pending_entries(nowait); /* flush of other stats kinds */ - for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) + if (pgstat_report_fixed) { - const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) + { + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); - if (!kind_info) - continue; - if (!kind_info->flush_static_cb) - continue; + if (!kind_info) + continue; + if (!kind_info->flush_static_cb) + continue; - partial_flush |= kind_info->flush_static_cb(nowait); + partial_flush |= kind_info->flush_static_cb(nowait); + } } last_flush = now; @@ -815,6 +800,7 @@ pgstat_report_stat(bool force) } pending_since = 0; + pgstat_report_fixed = false; return 0; } diff --git a/src/backend/utils/activity/pgstat_backend.c b/src/backend/utils/activity/pgstat_backend.c index 51256277e8d37..8714a85e2d936 100644 --- a/src/backend/utils/activity/pgstat_backend.c +++ b/src/backend/utils/activity/pgstat_backend.c @@ -66,6 +66,7 @@ pgstat_count_backend_io_op_time(IOObject io_object, IOContext io_context, io_time); backend_has_iostats = true; + pgstat_report_fixed = true; } void @@ -81,6 +82,7 @@ pgstat_count_backend_io_op(IOObject io_object, 
IOContext io_context, PendingBackendStats.pending_io.bytes[io_object][io_context][io_op] += bytes; backend_has_iostats = true; + pgstat_report_fixed = true; } /* @@ -301,18 +303,6 @@ pgstat_flush_backend(bool nowait, bits32 flags) return false; } -/* - * Check if there are any backend stats waiting for flush. - */ -bool -pgstat_backend_have_pending_cb(void) -{ - if (!pgstat_tracks_backend_bktype(MyBackendType)) - return false; - - return (backend_has_iostats || pgstat_backend_wal_have_pending()); -} - /* * Callback to flush out locally pending backend statistics. * diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index d8d26379a571e..13ae57ed6498d 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -80,6 +80,7 @@ pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, pgstat_count_backend_io_op(io_object, io_context, io_op, cnt, bytes); have_iostats = true; + pgstat_report_fixed = true; } /* @@ -167,15 +168,6 @@ pgstat_fetch_stat_io(void) return &pgStatLocal.snapshot.io; } -/* - * Check if there any IO stats waiting for flush. - */ -bool -pgstat_io_have_pending_cb(void) -{ - return have_iostats; -} - /* * Simpler wrapper of pgstat_io_flush_cb() */ diff --git a/src/backend/utils/activity/pgstat_slru.c b/src/backend/utils/activity/pgstat_slru.c index b9e940dde45b6..7bd8744accb0e 100644 --- a/src/backend/utils/activity/pgstat_slru.c +++ b/src/backend/utils/activity/pgstat_slru.c @@ -143,15 +143,6 @@ pgstat_get_slru_index(const char *name) return (SLRU_NUM_ELEMENTS - 1); } -/* - * Check if there are any SLRU stats entries waiting for flush. - */ -bool -pgstat_slru_have_pending_cb(void) -{ - return have_slrustats; -} - /* * Flush out locally pending SLRU stats entries * @@ -247,6 +238,7 @@ get_slru_entry(int slru_idx) Assert((slru_idx >= 0) && (slru_idx < SLRU_NUM_ELEMENTS)); have_slrustats = true; + pgstat_report_fixed = true; return &pending_SLRUStats[slru_idx]; } diff --git a/src/backend/utils/activity/pgstat_wal.c b/src/backend/utils/activity/pgstat_wal.c index 16a1ecb4d90d2..0d04480d2f6d0 100644 --- a/src/backend/utils/activity/pgstat_wal.c +++ b/src/backend/utils/activity/pgstat_wal.c @@ -71,6 +71,15 @@ pgstat_fetch_stat_wal(void) return &pgStatLocal.snapshot.wal; } +/* + * To determine whether WAL usage happened. + */ +static inline bool +pgstat_wal_have_pending(void) +{ + return pgWalUsage.wal_records != prevWalUsage.wal_records; +} + /* * Calculate how much WAL usage counters have increased by subtracting the * previous counters from the current ones. @@ -92,7 +101,7 @@ pgstat_wal_flush_cb(bool nowait) * This function can be called even if nothing at all has happened. Avoid * taking lock for nothing in that case. */ - if (!pgstat_wal_have_pending_cb()) + if (!pgstat_wal_have_pending()) return false; /* @@ -136,15 +145,6 @@ pgstat_wal_init_backend_cb(void) prevWalUsage = pgWalUsage; } -/* - * To determine whether WAL usage happened. - */ -bool -pgstat_wal_have_pending_cb(void) -{ - return pgWalUsage.wal_records != prevWalUsage.wal_records; -} - void pgstat_wal_init_shmem_cb(void *stats) { diff --git a/src/include/utils/pgstat_internal.h b/src/include/utils/pgstat_internal.h index d5557e6e998cd..6cf00008f6333 100644 --- a/src/include/utils/pgstat_internal.h +++ b/src/include/utils/pgstat_internal.h @@ -295,18 +295,11 @@ typedef struct PgStat_KindInfo * * Returns true if some of the stats could not be flushed, due to lock * contention for example. Optional. 
- */ - bool (*flush_static_cb) (bool nowait); - - /* - * For fixed-numbered or variable-numbered statistics: Check for pending - * stats in need of flush with flush_static_cb, when these do not use - * PgStat_EntryRef->pending. * - * Returns true if there are any stats pending for flush, triggering - * flush_static_cb. Optional. + * "pgstat_report_fixed" needs to be set to trigger the flush of pending + * stats. */ - bool (*have_static_pending_cb) (void); + bool (*flush_static_cb) (bool nowait); /* * For fixed-numbered statistics: Reset All. @@ -627,7 +620,6 @@ extern void pgstat_archiver_snapshot_cb(void); extern bool pgstat_flush_backend(bool nowait, bits32 flags); extern bool pgstat_backend_flush_cb(bool nowait); -extern bool pgstat_backend_have_pending_cb(void); extern void pgstat_backend_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts); @@ -676,7 +668,6 @@ extern bool pgstat_function_flush_cb(PgStat_EntryRef *entry_ref, bool nowait); extern void pgstat_flush_io(bool nowait); -extern bool pgstat_io_have_pending_cb(void); extern bool pgstat_io_flush_cb(bool nowait); extern void pgstat_io_init_shmem_cb(void *stats); extern void pgstat_io_reset_all_cb(TimestampTz ts); @@ -738,7 +729,6 @@ extern PgStatShared_Common *pgstat_init_entry(PgStat_Kind kind, * Functions in pgstat_slru.c */ -extern bool pgstat_slru_have_pending_cb(void); extern bool pgstat_slru_flush_cb(bool nowait); extern void pgstat_slru_init_shmem_cb(void *stats); extern void pgstat_slru_reset_all_cb(TimestampTz ts); @@ -750,7 +740,6 @@ extern void pgstat_slru_snapshot_cb(void); */ extern void pgstat_wal_init_backend_cb(void); -extern bool pgstat_wal_have_pending_cb(void); extern bool pgstat_wal_flush_cb(bool nowait); extern void pgstat_wal_init_shmem_cb(void *stats); extern void pgstat_wal_reset_all_cb(TimestampTz ts); @@ -778,8 +767,23 @@ extern void pgstat_create_transactional(PgStat_Kind kind, Oid dboid, uint64 obji * Variables in pgstat.c */ -extern PGDLLIMPORT PgStat_LocalState pgStatLocal; +/* + * Track if *any* pending fixed-numbered statistics should be flushed to + * shared memory. + * + * This flag can be switched to true by fixed-numbered statistics to let + * pgstat_report_stat() know if it needs to go through one round of + * reports, calling flush_static_cb for each fixed-numbered statistics + * kind. When this flag is not set, pgstat_report_stat() is able to do + * a fast exit, knowing that there are no pending fixed-numbered statistics. + * + * Statistics callbacks should never reset this flag; pgstat_report_stat() + * is in charge of doing that. + */ +extern PGDLLIMPORT bool pgstat_report_fixed; +/* Backend-local stats state */ +extern PGDLLIMPORT PgStat_LocalState pgStatLocal; /* * Implementation of inline functions declared above. From 44e135ad57b242a89266b0aebaaf523a01a3484c Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 24 Jul 2025 13:30:43 -0400 Subject: [PATCH 091/138] Avoid throwing away the error message in syncrep_yyerror. Commit 473a575e05979b4dbb28b3f2544f4ec8f184ce65 purported to make this function stash the error message in *syncrep_parse_result_p, but it didn't actually. As a result, an attempt to set synchronous_standby_names to any value that does not parse resulted in a generic "parser failed." message rather than anything more specific. This fixes that. 
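The bug class, reduced to a standalone illustration (hypothetical names;
the real code stashes into the error-message out-parameter
*syncrep_parse_error_msg_p):

    #include <string.h>

    /* WRONG: assigns to a local copy; the caller never sees the message */
    static void
    stash_error_buggy(char **errmsg_p, const char *message)
    {
        char *errmsg = *errmsg_p;

        errmsg = strdup(message);       /* updates only the local variable */
        (void) errmsg;
    }

    /* Correct: store through the out-parameter itself */
    static void
    stash_error_fixed(char **errmsg_p, const char *message)
    {
        *errmsg_p = strdup(message);
    }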
Discussion: http://postgr.es/m/CA+TgmoYF9wPNZ-Q_EMfib_espgHycY-eX__6Tzo2GpYpVXqCdQ@mail.gmail.com Backpatch-through: 18 --- src/backend/replication/syncrep_scanner.l | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/backend/replication/syncrep_scanner.l b/src/backend/replication/syncrep_scanner.l index 7dec1f869c745..02004d621e73d 100644 --- a/src/backend/replication/syncrep_scanner.l +++ b/src/backend/replication/syncrep_scanner.l @@ -157,17 +157,16 @@ syncrep_yyerror(SyncRepConfigData **syncrep_parse_result_p, char **syncrep_parse { struct yyguts_t *yyg = (struct yyguts_t *) yyscanner; /* needed for yytext * macro */ - char *syncrep_parse_error_msg = *syncrep_parse_error_msg_p; /* report only the first error in a parse operation */ - if (syncrep_parse_error_msg) + if (*syncrep_parse_error_msg_p) return; if (yytext[0]) - syncrep_parse_error_msg = psprintf("%s at or near \"%s\"", - message, yytext); + *syncrep_parse_error_msg_p = psprintf("%s at or near \"%s\"", + message, yytext); else - syncrep_parse_error_msg = psprintf("%s at end of input", - message); + *syncrep_parse_error_msg_p = psprintf("%s at end of input", + message); } void From 637ead2e1aa1fe955f9f095f791a38ef7797c959 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 28 Jul 2025 16:50:41 -0400 Subject: [PATCH 092/138] Avoid regression in the size of XML input that we will accept. This mostly reverts commit 6082b3d5d, "Use xmlParseInNodeContext not xmlParseBalancedChunkMemory". It turns out that xmlParseInNodeContext will reject text chunks exceeding 10MB, while (in most libxml2 versions) xmlParseBalancedChunkMemory will not. The bleeding-edge libxml2 bug that we needed to work around a year ago is presumably no longer a factor, and the argument that xmlParseBalancedChunkMemory is semi-deprecated is not enough to justify a functionality regression. Hence, go back to doing it the old way. Reported-by: Michael Paquier Author: Michael Paquier Co-authored-by: Erik Wienhold Reviewed-by: Tom Lane Discussion: https://postgr.es/m/aIGknLuc8b8ega2X@paquier.xyz Backpatch-through: 13 --- src/backend/utils/adt/xml.c | 68 ++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index 056d811594909..28af16fe93eea 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -1730,7 +1730,7 @@ xml_doctype_in_content(const xmlChar *str) * xmloption_arg, but a DOCTYPE node in the input can force DOCUMENT mode). * * If parsed_nodes isn't NULL and we parse in CONTENT mode, the list - * of parsed nodes from the xmlParseInNodeContext call will be returned + * of parsed nodes from the xmlParseBalancedChunkMemory call will be returned * to *parsed_nodes. (It is caller's responsibility to free that.) * * Errors normally result in ereport(ERROR), but if escontext is an @@ -1756,6 +1756,7 @@ xml_parse(text *data, XmlOptionType xmloption_arg, PgXmlErrorContext *xmlerrcxt; volatile xmlParserCtxtPtr ctxt = NULL; volatile xmlDocPtr doc = NULL; + volatile int save_keep_blanks = -1; /* * This step looks annoyingly redundant, but we must do it to have a @@ -1783,7 +1784,6 @@ xml_parse(text *data, XmlOptionType xmloption_arg, PG_TRY(); { bool parse_as_document = false; - int options; int res_code; size_t count = 0; xmlChar *version = NULL; @@ -1814,18 +1814,6 @@ xml_parse(text *data, XmlOptionType xmloption_arg, parse_as_document = true; } - /* - * Select parse options. 
- * - * Note that here we try to apply DTD defaults (XML_PARSE_DTDATTR) - * according to SQL/XML:2008 GR 10.16.7.d: 'Default values defined by - * internal DTD are applied'. As for external DTDs, we try to support - * them too (see SQL/XML:2008 GR 10.16.7.e), but that doesn't really - * happen because xmlPgEntityLoader prevents it. - */ - options = XML_PARSE_NOENT | XML_PARSE_DTDATTR - | (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS); - /* initialize output parameters */ if (parsed_xmloptiontype != NULL) *parsed_xmloptiontype = parse_as_document ? XMLOPTION_DOCUMENT : @@ -1835,11 +1823,26 @@ xml_parse(text *data, XmlOptionType xmloption_arg, if (parse_as_document) { + int options; + + /* set up parser context used by xmlCtxtReadDoc */ ctxt = xmlNewParserCtxt(); if (ctxt == NULL || xmlerrcxt->err_occurred) xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, "could not allocate parser context"); + /* + * Select parse options. + * + * Note that here we try to apply DTD defaults (XML_PARSE_DTDATTR) + * according to SQL/XML:2008 GR 10.16.7.d: 'Default values defined + * by internal DTD are applied'. As for external DTDs, we try to + * support them too (see SQL/XML:2008 GR 10.16.7.e), but that + * doesn't really happen because xmlPgEntityLoader prevents it. + */ + options = XML_PARSE_NOENT | XML_PARSE_DTDATTR + | (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS); + doc = xmlCtxtReadDoc(ctxt, utf8string, NULL, /* no URL */ "UTF-8", @@ -1861,10 +1864,7 @@ xml_parse(text *data, XmlOptionType xmloption_arg, } else { - xmlNodePtr root; - xmlNodePtr oldroot PG_USED_FOR_ASSERTS_ONLY; - - /* set up document with empty root node to be the context node */ + /* set up document that xmlParseBalancedChunkMemory will add to */ doc = xmlNewDoc(version); if (doc == NULL || xmlerrcxt->err_occurred) xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, @@ -1877,36 +1877,23 @@ xml_parse(text *data, XmlOptionType xmloption_arg, "could not allocate XML document"); doc->standalone = standalone; - root = xmlNewNode(NULL, (const xmlChar *) "content-root"); - if (root == NULL || xmlerrcxt->err_occurred) - xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, - "could not allocate xml node"); - - /* - * This attaches root to doc, so we need not free it separately; - * and there can't yet be any old root to free. - */ - oldroot = xmlDocSetRootElement(doc, root); - Assert(oldroot == NULL); + /* set parse options --- have to do this the ugly way */ + save_keep_blanks = xmlKeepBlanksDefault(preserve_whitespace ? 
1 : 0); /* allow empty content */ if (*(utf8string + count)) { xmlNodePtr node_list = NULL; - xmlParserErrors res; - - res = xmlParseInNodeContext(root, - (char *) utf8string + count, - strlen((char *) utf8string + count), - options, - &node_list); - if (res != XML_ERR_OK || xmlerrcxt->err_occurred) + res_code = xmlParseBalancedChunkMemory(doc, NULL, NULL, 0, + utf8string + count, + &node_list); + if (res_code != 0 || xmlerrcxt->err_occurred) { - xmlFreeNodeList(node_list); xml_errsave(escontext, xmlerrcxt, ERRCODE_INVALID_XML_CONTENT, "invalid XML content"); + xmlFreeNodeList(node_list); goto fail; } @@ -1922,6 +1909,8 @@ xml_parse(text *data, XmlOptionType xmloption_arg, } PG_CATCH(); { + if (save_keep_blanks != -1) + xmlKeepBlanksDefault(save_keep_blanks); if (doc != NULL) xmlFreeDoc(doc); if (ctxt != NULL) @@ -1933,6 +1922,9 @@ xml_parse(text *data, XmlOptionType xmloption_arg, } PG_END_TRY(); + if (save_keep_blanks != -1) + xmlKeepBlanksDefault(save_keep_blanks); + if (ctxt != NULL) xmlFreeParserCtxt(ctxt); From 1fe9e3822c4e574aa526b99af723e61e03f36d4f Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Tue, 29 Jul 2025 10:41:13 +0300 Subject: [PATCH 093/138] Clarify documentation for the initcap function This commit documents differences in the definition of word separators for the initcap function between libc and ICU locale providers. Backpatch to all supported branches. Discussion: https://postgr.es/m/804cc10ef95d4d3b298e76b181fd9437%40postgrespro.ru Author: Oleg Tselebrovskiy Backpatch-through: 13 --- doc/src/sgml/func.sgml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 479b66b0a5b34..05e6d78ad6a05 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -3148,8 +3148,11 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in Converts the first letter of each word to upper case and the - rest to lower case. Words are sequences of alphanumeric - characters separated by non-alphanumeric characters. + rest to lower case. When using the libc locale + provider, words are sequences of alphanumeric characters separated + by non-alphanumeric characters; when using the ICU locale provider, + words are separated according to + Unicode Standard Annex #29. initcap('hi THOMAS') From d5f014d897c81d1a610d8ee8084524aacf6ae3fb Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 29 Jul 2025 12:47:19 -0400 Subject: [PATCH 094/138] Remove unnecessary complication around xmlParseBalancedChunkMemory. When I prepared 71c0921b6 et al yesterday, I was thinking that the logic involving explicitly freeing the node_list output was still needed to dodge leakage bugs in libxml2. But I was misremembering: we introduced that only because with early 2.13.x releases we could not trust xmlParseBalancedChunkMemory's result code, so we had to look to see if a node list was returned or not. There's no reason to believe that xmlParseBalancedChunkMemory will fail to clean up the node list when required, so simplify. (This essentially completes reverting all the non-cosmetic changes in 6082b3d5d.) 
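For reference, the simplified call pattern this restores is roughly the
following sketch (assuming libxml2's documented behavior that the function
cleans up its node list itself on failure):

    #include <libxml/parser.h>

    /* parsed_nodes may be NULL when the caller doesn't want the list */
    static int
    parse_content(xmlDocPtr doc, const xmlChar *content,
                  xmlNodePtr *parsed_nodes)
    {
        return xmlParseBalancedChunkMemory(doc, NULL, NULL, 0,
                                           content, parsed_nodes);
    }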
Reported-by: Jim Jones Author: Tom Lane Discussion: https://postgr.es/m/997668.1753802857@sss.pgh.pa.us Backpatch-through: 13 --- src/backend/utils/adt/xml.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index 28af16fe93eea..cf793b6fc6698 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -1883,24 +1883,16 @@ xml_parse(text *data, XmlOptionType xmloption_arg, /* allow empty content */ if (*(utf8string + count)) { - xmlNodePtr node_list = NULL; - res_code = xmlParseBalancedChunkMemory(doc, NULL, NULL, 0, utf8string + count, - &node_list); + parsed_nodes); if (res_code != 0 || xmlerrcxt->err_occurred) { xml_errsave(escontext, xmlerrcxt, ERRCODE_INVALID_XML_CONTENT, "invalid XML content"); - xmlFreeNodeList(node_list); goto fail; } - - if (parsed_nodes != NULL) - *parsed_nodes = node_list; - else - xmlFreeNodeList(node_list); } } From 8e5e3ff5564104b5e1c3c459d626967a702ad9fb Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 29 Jul 2025 15:17:40 -0400 Subject: [PATCH 095/138] Don't put library-supplied -L/-I switches before user-supplied ones. For many optional libraries, we extract the -L and -l switches needed to link the library from a helper program such as llvm-config. In some cases we put the resulting -L switches into LDFLAGS ahead of -L switches specified via --with-libraries. That risks breaking the user's intention for --with-libraries. It's not such a problem if the library's -L switch points to a directory containing only that library, but on some platforms a library helper may "helpfully" offer a switch such as -L/usr/lib that points to a directory holding all standard libraries. If the user specified --with-libraries in hopes of overriding the standard build of some library, the -L/usr/lib switch prevents that from happening since it will come before the user-specified directory. To fix, avoid inserting these switches directly into LDFLAGS during configure, instead adding them to LIBDIRS or SHLIB_LINK. They will still eventually get added to LDFLAGS, but only after the switches coming from --with-libraries. The same problem exists for -I switches: those coming from --with-includes should appear before any coming from helper programs such as llvm-config. We have not heard field complaints about this case, but it seems certain that a user attempting to override a standard library could have issues. The changes for this go well beyond configure itself, however, because many Makefiles have occasion to manipulate CPPFLAGS to insert locally-desirable -I switches, and some of them got it wrong. The correct ordering is any -I switches pointing at within-the- source-tree-or-build-tree directories, then those from the tree-wide CPPFLAGS, then those from helper programs. There were several places that risked pulling in a system-supplied copy of libpq headers, for example, instead of the in-tree files. (Commit cb36f8ec2 fixed one instance of that a few months ago, but this exercise found more.) The Meson build scripts may or may not have any comparable problems, but I'll leave it to someone else to investigate that. 
Reported-by: Charles Samborski
Author: Tom Lane
Discussion: https://postgr.es/m/70f2155f-27ca-4534-b33d-7750e20633d7@demurgos.net
Backpatch-through: 13
---
 config/llvm.m4                      |  4 ++--
 config/programs.m4                  |  4 ++--
 configure                           | 24 ++++++++++++------------
 configure.ac                        | 18 +++++++++---------
 src/Makefile.global.in              |  2 +-
 src/backend/jit/llvm/Makefile       |  2 +-
 src/bin/initdb/Makefile             |  2 +-
 src/common/Makefile                 |  2 +-
 src/interfaces/libpq-oauth/Makefile |  2 +-
 src/interfaces/libpq/Makefile       |  2 +-
 src/pl/plpython/Makefile            |  2 +-
 src/pl/tcl/Makefile                 |  2 +-
 12 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/config/llvm.m4 b/config/llvm.m4
index fa4bedd9370fc..9d6fe8199e364 100644
--- a/config/llvm.m4
+++ b/config/llvm.m4
@@ -4,7 +4,7 @@
 # -----------------
 #
 # Look for the LLVM installation, check that it's new enough, set the
-# corresponding LLVM_{CFLAGS,CXXFLAGS,BINPATH} and LDFLAGS
+# corresponding LLVM_{CFLAGS,CXXFLAGS,BINPATH,LIBS}
 # variables.  Also verify that CLANG is available, to transform C
 # into bitcode.
 #
@@ -55,7 +55,7 @@ AC_DEFUN([PGAC_LLVM_SUPPORT],
 
   for pgac_option in `$LLVM_CONFIG --ldflags`; do
     case $pgac_option in
-      -L*) LDFLAGS="$LDFLAGS $pgac_option";;
+      -L*) LLVM_LIBS="$LLVM_LIBS $pgac_option";;
     esac
   done
 
diff --git a/config/programs.m4 b/config/programs.m4
index c73d9307ea8a9..e57fe4907b844 100644
--- a/config/programs.m4
+++ b/config/programs.m4
@@ -290,8 +290,8 @@ AC_DEFUN([PGAC_CHECK_LIBCURL],
   pgac_save_LDFLAGS=$LDFLAGS
   pgac_save_LIBS=$LIBS
 
-  CPPFLAGS="$LIBCURL_CPPFLAGS $CPPFLAGS"
-  LDFLAGS="$LIBCURL_LDFLAGS $LDFLAGS"
+  CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS"
+  LDFLAGS="$LDFLAGS $LIBCURL_LDFLAGS"
 
   AC_CHECK_HEADER(curl/curl.h, [],
                   [AC_MSG_ERROR([header file <curl/curl.h> is required for --with-libcurl])])
diff --git a/configure b/configure
index 1b9980226c5d2..d0db84867b93e 100755
--- a/configure
+++ b/configure
@@ -5194,7 +5194,7 @@ fi
 
   for pgac_option in `$LLVM_CONFIG --ldflags`; do
     case $pgac_option in
-      -L*) LDFLAGS="$LDFLAGS $pgac_option";;
+      -L*) LLVM_LIBS="$LLVM_LIBS $pgac_option";;
     esac
   done
 
@@ -9436,12 +9436,12 @@ fi
   # Note the user could also set XML2_CFLAGS/XML2_LIBS directly
   for pgac_option in $XML2_CFLAGS; do
     case $pgac_option in
-      -I*|-D*) CPPFLAGS="$CPPFLAGS $pgac_option";;
+      -I*|-D*) INCLUDES="$INCLUDES $pgac_option";;
     esac
   done
   for pgac_option in $XML2_LIBS; do
     case $pgac_option in
-      -L*) LDFLAGS="$LDFLAGS $pgac_option";;
+      -L*) LIBDIRS="$LIBDIRS $pgac_option";;
     esac
   done
 fi
@@ -9666,12 +9666,12 @@ fi
   # note that -llz4 will be added by AC_CHECK_LIB below.
   for pgac_option in $LZ4_CFLAGS; do
     case $pgac_option in
-      -I*|-D*) CPPFLAGS="$CPPFLAGS $pgac_option";;
+      -I*|-D*) INCLUDES="$INCLUDES $pgac_option";;
     esac
   done
   for pgac_option in $LZ4_LIBS; do
     case $pgac_option in
-      -L*) LDFLAGS="$LDFLAGS $pgac_option";;
+      -L*) LIBDIRS="$LIBDIRS $pgac_option";;
     esac
   done
 fi
@@ -9807,12 +9807,12 @@ fi
   # note that -lzstd will be added by AC_CHECK_LIB below.
   for pgac_option in $ZSTD_CFLAGS; do
     case $pgac_option in
-      -I*|-D*) CPPFLAGS="$CPPFLAGS $pgac_option";;
+      -I*|-D*) INCLUDES="$INCLUDES $pgac_option";;
     esac
   done
   for pgac_option in $ZSTD_LIBS; do
     case $pgac_option in
-      -L*) LDFLAGS="$LDFLAGS $pgac_option";;
+      -L*) LIBDIRS="$LIBDIRS $pgac_option";;
     esac
   done
 fi
@@ -12723,8 +12723,8 @@ if test "$with_libcurl" = yes ; then
   pgac_save_LDFLAGS=$LDFLAGS
   pgac_save_LIBS=$LIBS
 
-  CPPFLAGS="$LIBCURL_CPPFLAGS $CPPFLAGS"
-  LDFLAGS="$LIBCURL_LDFLAGS $LDFLAGS"
+  CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS"
+  LDFLAGS="$LDFLAGS $LIBCURL_LDFLAGS"
 
 ac_fn_c_check_header_mongrel "$LINENO" "curl/curl.h" "ac_cv_header_curl_curl_h" "$ac_includes_default"
 if test "x$ac_cv_header_curl_curl_h" = xyes; then :
@@ -16658,7 +16658,7 @@ fi
 
 if test "$with_icu" = yes; then
   ac_save_CPPFLAGS=$CPPFLAGS
-  CPPFLAGS="$ICU_CFLAGS $CPPFLAGS"
+  CPPFLAGS="$CPPFLAGS $ICU_CFLAGS"
 
   # Verify we have ICU's header files
   ac_fn_c_check_header_mongrel "$LINENO" "unicode/ucol.h" "ac_cv_header_unicode_ucol_h" "$ac_includes_default"
@@ -18876,7 +18876,7 @@ Use --without-tcl to disable building PL/Tcl." "$LINENO" 5
 fi
 # now that we have TCL_INCLUDE_SPEC, we can check for <tcl.h>
 ac_save_CPPFLAGS=$CPPFLAGS
-CPPFLAGS="$TCL_INCLUDE_SPEC $CPPFLAGS"
+CPPFLAGS="$CPPFLAGS $TCL_INCLUDE_SPEC"
 ac_fn_c_check_header_mongrel "$LINENO" "tcl.h" "ac_cv_header_tcl_h" "$ac_includes_default"
 if test "x$ac_cv_header_tcl_h" = xyes; then :
@@ -18945,7 +18945,7 @@ fi
 # check for <Python.h>
 if test "$with_python" = yes; then
   ac_save_CPPFLAGS=$CPPFLAGS
-  CPPFLAGS="$python_includespec $CPPFLAGS"
+  CPPFLAGS="$CPPFLAGS $python_includespec"
 
 ac_fn_c_check_header_mongrel "$LINENO" "Python.h" "ac_cv_header_Python_h" "$ac_includes_default"
 if test "x$ac_cv_header_Python_h" = xyes; then :
diff --git a/configure.ac b/configure.ac
index 3e3fcfa98314a..f3fb8794645a8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1103,12 +1103,12 @@ if test "$with_libxml" = yes ; then
   # Note the user could also set XML2_CFLAGS/XML2_LIBS directly
   for pgac_option in $XML2_CFLAGS; do
     case $pgac_option in
-      -I*|-D*) CPPFLAGS="$CPPFLAGS $pgac_option";;
+      -I*|-D*) INCLUDES="$INCLUDES $pgac_option";;
     esac
   done
   for pgac_option in $XML2_LIBS; do
     case $pgac_option in
-      -L*) LDFLAGS="$LDFLAGS $pgac_option";;
+      -L*) LIBDIRS="$LIBDIRS $pgac_option";;
     esac
   done
 fi
@@ -1152,12 +1152,12 @@ if test "$with_lz4" = yes; then
   # note that -llz4 will be added by AC_CHECK_LIB below.
   for pgac_option in $LZ4_CFLAGS; do
     case $pgac_option in
-      -I*|-D*) CPPFLAGS="$CPPFLAGS $pgac_option";;
+      -I*|-D*) INCLUDES="$INCLUDES $pgac_option";;
     esac
   done
   for pgac_option in $LZ4_LIBS; do
     case $pgac_option in
-      -L*) LDFLAGS="$LDFLAGS $pgac_option";;
+      -L*) LIBDIRS="$LIBDIRS $pgac_option";;
     esac
   done
 fi
@@ -1177,12 +1177,12 @@ if test "$with_zstd" = yes; then
   # note that -lzstd will be added by AC_CHECK_LIB below.
   for pgac_option in $ZSTD_CFLAGS; do
     case $pgac_option in
-      -I*|-D*) CPPFLAGS="$CPPFLAGS $pgac_option";;
+      -I*|-D*) INCLUDES="$INCLUDES $pgac_option";;
     esac
   done
   for pgac_option in $ZSTD_LIBS; do
     case $pgac_option in
-      -L*) LDFLAGS="$LDFLAGS $pgac_option";;
+      -L*) LIBDIRS="$LIBDIRS $pgac_option";;
     esac
   done
 fi
@@ -1944,7 +1944,7 @@ fi
 
 if test "$with_icu" = yes; then
   ac_save_CPPFLAGS=$CPPFLAGS
-  CPPFLAGS="$ICU_CFLAGS $CPPFLAGS"
+  CPPFLAGS="$CPPFLAGS $ICU_CFLAGS"
 
   # Verify we have ICU's header files
   AC_CHECK_HEADER(unicode/ucol.h, [],
@@ -2344,7 +2344,7 @@ Use --without-tcl to disable building PL/Tcl.])
 fi
 # now that we have TCL_INCLUDE_SPEC, we can check for <tcl.h>
 ac_save_CPPFLAGS=$CPPFLAGS
-CPPFLAGS="$TCL_INCLUDE_SPEC $CPPFLAGS"
+CPPFLAGS="$CPPFLAGS $TCL_INCLUDE_SPEC"
 AC_CHECK_HEADER(tcl.h, [], [AC_MSG_ERROR([header file <tcl.h> is required for Tcl])])
 CPPFLAGS=$ac_save_CPPFLAGS
 fi
@@ -2381,7 +2381,7 @@ fi
 # check for <Python.h>
 if test "$with_python" = yes; then
   ac_save_CPPFLAGS=$CPPFLAGS
-  CPPFLAGS="$python_includespec $CPPFLAGS"
+  CPPFLAGS="$CPPFLAGS $python_includespec"
   AC_CHECK_HEADER(Python.h, [], [AC_MSG_ERROR([header file <Python.h> is required for Python])])
   CPPFLAGS=$ac_save_CPPFLAGS
 fi
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 04952b533ded9..8b1b357beaa04 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -254,7 +254,7 @@ CPP = @CPP@
 CPPFLAGS = @CPPFLAGS@
 PG_SYSROOT = @PG_SYSROOT@
 
-override CPPFLAGS := $(ICU_CFLAGS) $(LIBNUMA_CFLAGS) $(LIBURING_CFLAGS) $(CPPFLAGS)
+override CPPFLAGS += $(ICU_CFLAGS) $(LIBNUMA_CFLAGS) $(LIBURING_CFLAGS)
 
 ifdef PGXS
 override CPPFLAGS := -I$(includedir_server) -I$(includedir_internal) $(CPPFLAGS)
diff --git a/src/backend/jit/llvm/Makefile b/src/backend/jit/llvm/Makefile
index e8c12060b93df..68677ba42e189 100644
--- a/src/backend/jit/llvm/Makefile
+++ b/src/backend/jit/llvm/Makefile
@@ -31,7 +31,7 @@ endif
 # All files in this directory use LLVM.
 CFLAGS += $(LLVM_CFLAGS)
 CXXFLAGS += $(LLVM_CXXFLAGS)
-override CPPFLAGS := $(LLVM_CPPFLAGS) $(CPPFLAGS)
+override CPPFLAGS += $(LLVM_CPPFLAGS)
 SHLIB_LINK += $(LLVM_LIBS)
 
 # Because this module includes C++ files, we need to use a C++
diff --git a/src/bin/initdb/Makefile b/src/bin/initdb/Makefile
index 997e0a013e956..c0470efda92a3 100644
--- a/src/bin/initdb/Makefile
+++ b/src/bin/initdb/Makefile
@@ -20,7 +20,7 @@ include $(top_builddir)/src/Makefile.global
 # from libpq, else we have risks of version skew if we run with a libpq
 # shared library from a different PG version.  Define
 # USE_PRIVATE_ENCODING_FUNCS to ensure that that happens.
-override CPPFLAGS := -DUSE_PRIVATE_ENCODING_FUNCS -I$(libpq_srcdir) -I$(top_srcdir)/src/timezone $(ICU_CFLAGS) $(CPPFLAGS)
+override CPPFLAGS := -DUSE_PRIVATE_ENCODING_FUNCS -I$(libpq_srcdir) -I$(top_srcdir)/src/timezone $(CPPFLAGS) $(ICU_CFLAGS)
 
 # We need libpq only because fe_utils does.
 LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport) $(ICU_LIBS)
diff --git a/src/common/Makefile b/src/common/Makefile
index 1e2b91c83c4c4..2c720caa50972 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -163,7 +163,7 @@ libpgcommon_shlib.a: $(OBJS_SHLIB)
 
 # The JSON API normally exits on out-of-memory; disable that behavior for shared
 # library builds.  This requires libpq's pqexpbuffer.h.
 jsonapi_shlib.o: override CPPFLAGS += -DJSONAPI_USE_PQEXPBUFFER
-jsonapi_shlib.o: override CPPFLAGS += -I$(libpq_srcdir)
+jsonapi_shlib.o: override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS)
 
 # Because this uses its own compilation rule, it doesn't use the
 # dependency tracking logic from Makefile.global.  To make sure that
diff --git a/src/interfaces/libpq-oauth/Makefile b/src/interfaces/libpq-oauth/Makefile
index 270fc0cf2d9d9..682f17413b3a4 100644
--- a/src/interfaces/libpq-oauth/Makefile
+++ b/src/interfaces/libpq-oauth/Makefile
@@ -24,7 +24,7 @@ NAME = pq-oauth-$(MAJORVERSION)
 override shlib := lib$(NAME)$(DLSUFFIX)
 override stlib := libpq-oauth.a
 
-override CPPFLAGS := -I$(libpq_srcdir) -I$(top_builddir)/src/port $(LIBCURL_CPPFLAGS) $(CPPFLAGS)
+override CPPFLAGS := -I$(libpq_srcdir) -I$(top_builddir)/src/port $(CPPFLAGS) $(LIBCURL_CPPFLAGS)
 
 OBJS = \
 	$(WIN32RES)
diff --git a/src/interfaces/libpq/Makefile b/src/interfaces/libpq/Makefile
index 47d6781150944..da6650066d46e 100644
--- a/src/interfaces/libpq/Makefile
+++ b/src/interfaces/libpq/Makefile
@@ -24,7 +24,7 @@ NAME= pq
 SO_MAJOR_VERSION= 5
 SO_MINOR_VERSION= $(MAJORVERSION)
 
-override CPPFLAGS := -I$(srcdir) $(CPPFLAGS) -I$(top_builddir)/src/port -I$(top_srcdir)/src/port
+override CPPFLAGS := -I$(srcdir) -I$(top_builddir)/src/port -I$(top_srcdir)/src/port $(CPPFLAGS)
 
 ifneq ($(PORTNAME), win32)
 override CFLAGS += $(PTHREAD_CFLAGS)
 endif
diff --git a/src/pl/plpython/Makefile b/src/pl/plpython/Makefile
index f959083a0bdec..25f295c3709e2 100644
--- a/src/pl/plpython/Makefile
+++ b/src/pl/plpython/Makefile
@@ -11,7 +11,7 @@ ifeq ($(PORTNAME), win32)
 override python_libspec =
 endif
 
-override CPPFLAGS := -I. -I$(srcdir) $(python_includespec) $(CPPFLAGS)
+override CPPFLAGS := -I. -I$(srcdir) $(CPPFLAGS) $(python_includespec)
 
 rpathdir = $(python_libdir)
 
diff --git a/src/pl/tcl/Makefile b/src/pl/tcl/Makefile
index ea52a2efc229d..dd57f7d694c82 100644
--- a/src/pl/tcl/Makefile
+++ b/src/pl/tcl/Makefile
@@ -11,7 +11,7 @@ top_builddir = ../../..
 
 include $(top_builddir)/src/Makefile.global
 
-override CPPFLAGS := -I. -I$(srcdir) $(TCL_INCLUDE_SPEC) $(CPPFLAGS)
+override CPPFLAGS := -I. -I$(srcdir) $(CPPFLAGS) $(TCL_INCLUDE_SPEC)
 
 # On Windows, we don't link directly with the Tcl library; see below
 ifneq ($(PORTNAME), win32)

From fce7da1e73853b21a8084e645ac39354c1476261 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Wed, 30 Jul 2025 00:39:49 +0300
Subject: [PATCH 096/138] Handle cancel requests with PID 0 gracefully

If the client sent a query cancel request with backend PID 0, it
tripped an assertion.  With assertions disabled, you got this in
the log instead:

    LOG:  invalid cancel request with PID 0
    LOG:  wrong key in cancel request for process 0

Query cancellations don't even require authentication, so we better
tolerate bogus requests.  Fix by turning the assertion into a regular
runtime check.

Spotted while testing libpq behavior with a modified server that
didn't send BackendKeyData to the client.
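
For context, a cancel request is a tiny packet that any client may send
on a fresh, unauthenticated connection.  Sketched as a C struct
(illustrative only; field sizes here assume the fixed 4-byte cancel key
of protocol versions before 3.2):

    typedef struct CancelRequestSketch
    {
        uint32      len;         /* total message length, including self */
        uint32      code;        /* 80877102, the cancel request code */
        uint32      backendPID;  /* from BackendKeyData; 0 if never received */
        uint32      cancelKey;   /* per-session secret */
    } CancelRequestSketch;

A client that never received BackendKeyData has nothing better than PID 0
to send, so the server must treat that as a harmless no-op rather than an
assertion failure.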
Backpatch-through: 18
---
 src/backend/storage/ipc/procsignal.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c
index a9bb540b55ac2..087821311cceb 100644
--- a/src/backend/storage/ipc/procsignal.c
+++ b/src/backend/storage/ipc/procsignal.c
@@ -728,7 +728,11 @@ procsignal_sigusr1_handler(SIGNAL_ARGS)
 void
 SendCancelRequest(int backendPID, const uint8 *cancel_key, int cancel_key_len)
 {
-	Assert(backendPID != 0);
+	if (backendPID == 0)
+	{
+		ereport(LOG, (errmsg("invalid cancel request with PID 0")));
+		return;
+	}
 
 	/*
 	 * See if we have a matching backend.  Reading the pss_pid and

From a60691eb201dfaa8a2c3aec8815d08ed3371aae7 Mon Sep 17 00:00:00 2001
From: Bruce Momjian
Date: Tue, 29 Jul 2025 22:27:01 -0400
Subject: [PATCH 097/138] doc PG 18 relnotes: update to current

Backpatch-through: 18 only
---
 doc/src/sgml/release-18.sgml | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/doc/src/sgml/release-18.sgml b/doc/src/sgml/release-18.sgml
index b4bd3559a3946..c1f111f6fd780 100644
--- a/doc/src/sgml/release-18.sgml
+++ b/doc/src/sgml/release-18.sgml
@@ -6,7 +6,7 @@
 
   Release date:
-  2025-??-??, CURRENT AS OF 2025-06-20
+  2025-??-??, CURRENT AS OF 2025-07-29
 
@@ -1181,15 +1181,18 @@ Author: Álvaro Herrera
 2025-03-18 [62d712ecf] Introduce squashing of constant lists in query jumbling
 Author: Álvaro Herrera
 2025-03-27 [9fbd53dea] Remove the query_id_squash_values GUC
+Author: Álvaro Herrera
+Branch: master Release: REL_18_BR [c2da1a5d6] 2025-06-24 19:36:32 +0200
 -->
 
 Have query id computation
-of arrays consider only the first and last array elements (Dmitry
+of constant lists consider only the first and last constants (Dmitry
 Dolgov, Sami Imseih)
 §
 §
+§
 
@@ -1930,6 +1933,8 @@ Author: Peter Eisentraut
@@ -1940,6 +1945,7 @@ Author: Álvaro Herrera
 linkend="catalog-pg-constraint">pg_constraint
 (Álvaro Herrera, Bernd Helmle)
 §
+§
 
@@ -2717,6 +2723,8 @@ Author: Thomas Munro
@@ -2724,6 +2732,7 @@ Author: Michael Paquier
 Allow psql to parse, bind, and close
 named prepared statements (Anthonin Bonnefoy, Michael Paquier)
 §
+§
 
@@ -3271,13 +3280,16 @@ Author: Amit Kapila
 Add pg_createsubscriber option
-to remove publications (Shubham Khanna)
+to remove publications (Shubham Khanna)
 §
+§
 
@@ -3303,9 +3315,14 @@ Author: Masahiko Sawada
 Add option
-to specify failover slots (Hayato Kuroda)
+to specify failover slots (Hayato Kuroda)
 §
+
+
+Also add option as a synonym
+for , and deprecate the latter.
+
-
-
-
-Allow pg_dumpall to dump in the same output
-formats as pg_dump supports (Mahendra
-Singh Thalor, Andrew Dunstan)
-§
-
-
-
-Also modify pg_restore to handle such dumps.
-Previously pg_dumpall only supported
-text format.
-
-
